
    i                     0   d dl Z d dlmZ d dlmZ d dlZd dlmc mZ	 d dlmZ ddl
mZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0m1Z1  e'jd                  e3      Z4e% G d de              Z5e e%d       G d de                    Z6e e%d       G d de                    Z7e e%d       G d d e                    Z8 G d! d"ejr                        Z:d#ejv                  d$e<d%ejv                  fd&Z=	 d[d'ejr                  d(ejv                  d)ejv                  d*ejv                  d+ejv                  dz  d,e>d-e>d.e"e$   fd/Z? G d0 d1ejr                        Z@ G d2 d3ejr                        ZA G d4 d5e      ZB G d6 d7ejr                        ZCe% G d8 d9e5             ZD G d: d;ejr                        ZE G d< d=ejr                        ZF G d> d?ejr                        ZG G d@ dAejr                        ZH G dB dCejr                        ZI G dD dEejr                        ZJ G dF dGejr                        ZK G dH dIejr                        ZL G dJ dKejr                        ZMee% G dL dMe                    ZN e%dN       G dO dPe5             ZO G dQ dRejr                        ZP G dS dTejr                        ZQ e%dU       G dV dWe5             ZR G dX dYe5e      ZSg dZZTy)\    N)Callable)	dataclass)nn   )initialization)ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check	torch_int)check_model_inputs   )	AutoModel   )JanusConfigJanusVisionConfigJanusVQVAEConfigc                   R     e Zd ZU eed<   dZdZdZddgZddgZ	dZ
dZdZ fd	Z xZS )
JanusPreTrainedModelconfigmodelimagetextTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskc                     t         |   |       t        |t              rZt	        j
                  |j                  t        j                  |j                  j                  d         j                  d             y y )Nr   r/   )super_init_weights
isinstanceJanusVisionEmbeddingsinitcopy_position_idstorcharangeshapeexpand)selfmodule	__class__s     r/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/transformers/models/janus/modeling_janus.pyr2   z"JanusPreTrainedModel._init_weightsC   s[    f%f34JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5    )__name__
__module____qualname__r    __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphr2   __classcell__r>   s   @r?   r$   r$   6   sO    (&*#,.GH#4m"DN!i ir@   r$   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   b    e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   y)JanusVQVAEOutputz
    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Reconstructed pixel values after encoding and decoding the input.
    embedding_loss (`torch.FloatTensor`):
        Embedding loss.
    Ndecoded_pixel_valuesembedding_loss)	rA   rB   rC   __doc__rR   r8   FloatTensorrD   rS    r@   r?   rQ   rQ   I   s4     6:%++d29/3NE%%,3r@   rQ   zy
    Base class for Janus model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                       e Zd ZU dZdZej                  dz  ed<   dZe	dz  ed<   dZ
eej                     dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   y)JanusBaseModelOutputWithPasta  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlast_hidden_stater,   hidden_states
attentionsimage_hidden_states)rA   rB   rC   rT   rY   r8   rU   rD   r,   r	   rZ   tupler[   r\   rV   r@   r?   rX   rX   [   s|    & 37u((4/6$(OUT\(59M5**+d2926Je''(4/6;?u001D8?r@   rX   zQ
    Base class for Janus causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   y)	JanusCausalLMOutputWithPastae  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlosslogitsr,   rZ   r[   r\   )rA   rB   rC   rT   r`   r8   rU   rD   ra   r,   r	   rZ   r]   r[   r\   rV   r@   r?   r_   r_   |   s    " &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/6;?u001D8?r@   r_   c                        e Zd Zdef fdZdej                  dededej                  fdZddej                  d	e	dej                  fd
Z
 xZS )r4   r%   c                 f   t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  |j                  | j                  | j                  | j                  d      | _
        | j
                  | j                  z  dz  | _        | j                  | _        t        j                  | j                  | j                        | _        | j                  dt!        j"                  | j                        j%                  d      d       y )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   r7   r0   F)
persistent)r1   __init__r%   hidden_size	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr8   r9   r;   r<   r%   r>   s     r?   rk   zJanusVisionEmbeddings.__init__   s    ++ ++ ++!yy++?? 
 !OOt>1D!--"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr@   
embeddingsheightwidthreturnc                    |j                   d   }| j                  j                  j                   d   }t        j                  j                         s%||k(  r ||k(  r| j                  | j                        S | j                  j                  j                  d      }|j                   d   }|| j                  z  }|| j                  z  }	t        |dz        }
|j                  d|
|
|      }|j                  dddd      }t        j                  j                  |||	fdd	      }|j                  dddd      j                  dd|      }|S )
a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   r/   g      ?r   r   bicubicF)sizemodealign_corners)r:   rv   weightr8   jit
is_tracingr7   	unsqueezero   r   reshapepermuter   
functionalinterpolateview)r<   ry   rz   r{   rs   rt   patch_pos_embeddim
new_height	new_widthsqrt_num_positionss              r?   interpolate_pos_encodingz.JanusVisionEmbeddings.interpolate_pos_encoding   sE    !&&q)//66<<Q? yy##%+*F6UZ?**4+<+<==1188BB1Er"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nr@   pixel_valuesr   c                 X   |j                   \  }}}}| j                  j                  j                  }| j                  |j	                  |            }|j                  d      j                  dd      }|r| j                  |||      }	n| j                  | j                        }	||	z   }|S )N)dtyper   r   )
r:   rr   r   r   toflatten	transposer   rv   r7   )
r<   r   r   _rz   r{   target_dtypepatch_embedsry   
pos_embedss
             r?   forwardzJanusVisionEmbeddings.forward   s    *001fe++2288++LOO,O,OP!))!,66q!<
#66z65QJ001B1BCJ*,
r@   )F)rA   rB   rC   r!   rk   r8   Tensorintr   boolr   rM   rN   s   @r?   r4   r4      se    q0 q($5<< $ $UX $]b]i]i $LELL D ]b]i]i r@   r4   rZ   n_repr|   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r:   r;   r   )rZ   r   batchnum_key_value_headsslenhead_dims         r?   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr@   r=   querykeyvalueattention_maskscalingdropoutkwargsc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr   r   r/   )r   r   )ptrainingr   )r   num_key_value_groupsr8   matmulr   r:   r   r   softmaxfloat32r   r   r   r   
contiguous)r=   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsr-   attn_outputs                r?   eager_attention_forwardr      s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r@   c                   t     e Zd ZdZdef fdZ	 d	dej                  dej                  dz  dee	   fdZ
 xZS )
JanusVisionAttentionz(Attention Class for Janus Vision Encoderr%   c                 F   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _
        |j                  }|j                  }d| _        d| _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                        | _        |dkD  rt        j,                  |      nt        j.                         | _        |rt        j0                  | j                        nt        j.                         | _        |r%t        j0                  | j                        | _        y t        j.                         | _        y )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      Fr   biasr   )r1   rk   r%   rl   rm   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr   r   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)r<   r%   proj_dropoutqk_normr>   s       r?   rk   zJanusVisionAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ %&!ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta6=bll4>>22;;=6=bll4>>22;;=r@   NrZ   r   r   c                 "   |j                         \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  d| j
                  | j                        }| j                  |      }|j	                  d| j
                  | j                        }| j                  |      }|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j                  ||| j
                  | j                        j                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|f| j                   sdn| j"                  | j$                  | j&                  d|\  }}|j	                  ||| j(                        }| j+                  |      }| j-                  |      }||fS )Nr/   r   r           )r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   get_interfacer%   _attn_implementationr   r   r   r   r   rm   r   r   )r<   rZ   r   r   
batch_sizeseq_lenr   query_statesr   r   attention_interfacer   r   outputs                 r?   r   zJanusVisionAttention.forward-  s    "/!3!3!5
GQ{{=1[[/
{{=1#++BN{{<0''DNNDMMJ
[[,
#++JQUQ^Q^_iijkmno''
GT^^T]][eefgijk
#((Wdnndmm\ffghjkl(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HJJnn
%
 
%
!\ "))*gt~~N&&{3((0|##r@   N)rA   rB   rC   rT   r!   rk   r8   r   r   r   r   rM   rN   s   @r?   r   r     sO    2Q0 Q@ /3)$||)$ t+)$ +,	)$r@   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )JanusVisionMLPr%   c                    t         |           || _        t        |j                  |j
                  z        | _        t        |j                     | _	        t        j                  |j                  | j                        | _        t        j                  | j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                        | _        y r   )r1   rk   r%   r   rl   	mlp_ratiointermediate_sizer   
hidden_actactivation_fnr   r   fc1fc2r   hidden_dropout_ratedropout1dropout2rx   s     r?   rk   zJanusVisionMLP.__init__Z  s    !$V%7%7&:J:J%J!K#F$5$5699V//1G1GH99T33V5G5GH

6#=#=>

6#=#=>r@   rZ   r|   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S r   )r   r   r   r   r   r<   rZ   s     r?   r   zJanusVisionMLP.forwardd  sP    /**=9m4/m4r@   )	rA   rB   rC   r!   rk   r8   r   r   rM   rN   s   @r?   r   r   Y  s+    ?0 ?U\\ ell r@   r   c            	            e Zd Zdef fdZedej                  dej                  dee	   dej                  fd       Z xZS )r+   r%   c                 R   t         |           |j                  | _        t	        j
                  | j                  |j                        | _        t        |      | _	        t	        j
                  | j                  |j                        | _
        t        |      | _        || _        y N)eps)r1   rk   rl   rm   r   r   layer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr%   rx   s     r?   rk   z JanusVisionEncoderLayer.__init__n  st    ++<<F<Q<QR-f5<<F<Q<QR!&)r@   rZ   r   r   r|   c                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)rZ   r   rV   )r   r   r   r   )r<   rZ   r   r   residualr   s         r?   r   zJanusVisionEncoderLayer.forwardw  s     !((7)4>> 
')
 
q
 !=0 ((7/ =0r@   )rA   rB   rC   r!   rk   r   r8   r   r   r   rU   r   rM   rN   s   @r?   r+   r+   m  s^    0  ||  +,	
 
		 r@   r+   c                   j     e Zd ZdZdef fdZe	 d	dej                  dz  de	e
   defd       Z xZS )
JanusVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`JanusVisionEncoderLayer`].

    Args:
        config: JanusVisionConfig
    r%   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
r1   rk   r%   r   
ModuleListrangenum_hidden_layersr+   layersgradient_checkpointingr<   r%   r   r>   s      r?   rk   zJanusVisionEncoder.__init__  sP    mmeTZTlTlNm$n%<V%D$no&+# %os   A#Nr   r   r|   c                 T    |}| j                   D ]  } |||fi |} t        |      S )N)rY   )r   r   )r<   inputs_embedsr   r   rZ   encoder_layers         r?   r   zJanusVisionEncoder.forward  sC     &![[ 	M) M	 ??r@   r   )rA   rB   rC   rT   r!   rk   r   r8   r   r   r   r   r   rM   rN   s   @r?   r   r     s`    ,0 ,  /3@ t+@ +,	@
 
@ @r@   r   c                        e Zd ZU dZdZeed<   eedZ	def fdZ
 ed      e	 	 ddej                  dz  d	ed
ee   deez  fd              Zd Z xZS )JanusVisionModelr   )r(   r%   rZ   r[   c                     t         |   |       || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        | j                          y r   )r1   rk   r%   rl   r4   ry   r   encoderr   r   r   post_layernorm	post_init)r<   r%   rm   r>   s      r?   rk   zJanusVisionModel.__init__  s]     &&	/7)&1 ll9&:O:OPr@   F)tie_last_hidden_statesNr   r   r|   c                     |t        d      | j                  ||      } | j                  dd|i|}|j                  }| j	                  |      }|d d dd d f   }| j	                  |      }t        ||      S )Nz You have to specify pixel_values)r   r  r   )rY   pooler_outputrV   )r   ry   r
  rY   r  r   )r<   r   r   r   rZ   encoder_outputsrY   pooled_outputs           r?   r   zJanusVisionModel.forward  s     ?@@Ogh+74<< ,
',
,

 ,== //0AB)!Q'2++M:)/'
 	
r@   c                     | j                   S r   )ry   r<   s    r?   get_input_embeddingsz%JanusVisionModel.get_input_embeddings  s    r@   r   )rA   rB   rC   main_input_namerF   r!   rD   r+   r   _can_record_outputsrk   r   r   r8   rU   r   r   r   r]   r   r   r  rM   rN   s   @r?   r  r    s    $O!0*
	0 	 u5 26).
''$.
 #'
 +,	

 
+	+
  6
6r@   r  c                   *     e Zd Zdef fdZd Z xZS )JanusVisionAlignerMLPr%   c           	         t         |           t        j                  |j                  |j
                        | _        t        j                  t        d|j                        D cg c],  }t        j                  |j
                  |j
                        . c}      | _
        t        |j                     | _        y c c}w Nr   )r1   rk   r   r   rl   projection_dimr   r   r   depthhidden_layersr   r   r   r  s      r?   rk   zJanusVisionAlignerMLP.__init__  s    99V//1F1FG]]NSTUW]WcWcNdeRYYv,,f.C.CDe
 $F$5$56 f   &1B<c                 |    | j                  |      }| j                  D ]  }| j                  |      } ||      } |S r   r   r  r   r<   rZ   layers      r?   r   zJanusVisionAlignerMLP.forward  G    /'' 	1E ..}=M!-0M	1 r@   )rA   rB   rC   r!   rk   r   rM   rN   s   @r?   r  r    s    70 7r@   r  c                        e Zd ZdZdef fdZdej                  fdZdej                  dej                  fdZ xZS )	JanusVQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    r%   c                    t         |           |j                  | _        |j                  | _        t        |dd      | _        t        j                  | j                  | j                        | _	        |j                  gdz  | _        y )Nbetag      ?r   )r1   rk   num_embeddingsrm   embedding_dimgetattrr'  r   ru   	embeddingrs   quant_state_dimsrx   s     r?   rk   z"JanusVQVAEVectorQuantizer.__init__  sn    $33#--FFD1	d&9&94;M;MN!'!3!3 4q 8r@   hidden_statec           
      L   |j                  dddd      j                         }|j                  d| j                        }t	        j
                  |dz  dd      t	        j
                  | j                  j                  dz  d      z   dt	        j                  d	|| j                  j                  j                  dd            z  z
  }t	        j                  |d      }| j                  |      j                  |j                        }t	        j                  |j                         |z
  dz        | j                  t	        j                  ||j                         z
  dz        z  z   }|||z
  j                         z   }|j                  dddd      j                         }|||fS )
Nr   r   r   r   r/   T)r   keepdimr   z	bd,dn->bn)r   r   r   r)  r8   sumr+  r   einsumr   argminr:   meandetachr'  )r<   r-  hidden_state_flattened	distancesmin_encoding_indiceshidden_state_quantr`   s          r?   r   z!JanusVQVAEVectorQuantizer.forward  s   #++Aq!Q7BBD!-!2!22t7I7I!J II,a/QEii--q0a89%,,{,BDNNDYDYDcDcdeghDijjk 	  %||I1=!^^,@AFF|GYGYZ zz-446E!KLtyy[`[e[e,"5"5"77A=\
 P
 

 *-?,-N,V,V,XX 0771aCNNP!4)===r@   image_tokensr|   c                 B   |j                   d   }| j                  j                  j                   d   }| j                  |      }t        j                  |dd      }|j                  |g| j                  |      }|j                  dddd      j                         }|S )Nr   r/   r   )r   r   r   r   )	r:   r+  r   F	normalizer   r,  r   r   )r<   r:  r   emb_dimr9  s        r?   get_codebook_entryz,JanusVQVAEVectorQuantizer.get_codebook_entry)  s    !''*
~~,,2226 "^^L9[[);qbI 044j5b4CXCX5bZa5bc/771aCNNP!!r@   )rA   rB   rC   rT   r"   rk   r8   r   r   
LongTensorrU   r?  rM   rN   s   @r?   r%  r%    sD    9/ 9>ELL >6"u/?/? "EDUDU "r@   r%  c                   *     e Zd Z	 	 d fd	Zd Z xZS )JanusVQVAEResnetBlockc                    t         |           || _        ||n|| _        || _        t
        j                  j                  d|dd      | _        t
        j                  j                  ||ddd      | _
        t
        j                  j                  d|dd      | _        t
        j                  j                  |j                        | _        t
        j                  j                  ||ddd      | _        | j                  | j                  k7  r`| j                  r*t
        j                  j                  ||ddd      | _        y t
        j                  j                  ||ddd      | _        y y )	N    ư>T
num_groupsrq   r   affiner   r   rg   rh   ri   r   )r1   rk   re   rf   use_conv_shortcutr8   r   	GroupNormnorm1rp   conv1norm2r   r   conv2conv_shortcutnin_shortcut)r<   r%   re   rf   rP  r>   s        r?   rk   zJanusVQVAEResnetBlock.__init__:  s1    	&+7+?K\!.XX''2KUYbf'g
XX__[,AVWab_c
XX''2LVZcg'h
xx''7XX__\<QWXbc_d
t000%%%*XX__[,\]fgqr_%s"$)HHOOK[\efpqO$r!	 1r@   c                    |}| j                  |      }|t        j                  |      z  }| j                  |      }| j	                  |      }|t        j                  |      z  }| j                  |      }| j                  |      }| j                  | j                  k7  r3| j                  r| j                  |      }||z   S | j                  |      }||z   S r   )rL  r8   sigmoidrM  rN  r   rO  re   rf   rJ  rP  rQ  )r<   rZ   r   s      r?   r   zJanusVQVAEResnetBlock.forwardQ  s     

=1}55

=1

=1}55]3

=1t000%%--h7 -''  ,,X6-''r@   r   rA   rB   rC   rk   r   rM   rN   s   @r?   rB  rB  9  s    
 s.(r@   rB  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEAttnBlockc                    t         |           || _        t        j                  j                  d|dd      | _        t        j                  j                  ||ddd      | _        t        j                  j                  ||ddd      | _	        t        j                  j                  ||ddd      | _
        t        j                  j                  ||ddd      | _        y )NrD  rE  TrF  r   r   rI  )r1   rk   re   r8   r   rK  normrp   qkvproj_outr<   re   r>   s     r?   rk   zJanusVQVAEAttnBlock.__init__f  s    &HH&&";TXae&f	kqQR\]^kqQR\]^kqQR\]^[aXYcder@   c                 t   |}| j                  |      }| j                  |      }| j                  |      }| j                  |      }|j                  \  }}}}	|j                  ||||	z        j                  ddd      }|j                  ||||	z        }t        j                  ||      }
|
t        |      dz  z  }
t        j                  |
d      }
|j                  ||||	z        }|
j                  ddd      }
t        j                  ||
      j                  ||||	      }| j                  |      }||z   S )Nr   r   r   r   r0  )rX  rY  rZ  r[  r:   r   r   r8   bmmr   r<  r   r\  )r<   rZ   r   r   r   r   r   channelsrz   r{   r   r   s               r?   r   zJanusVQVAEAttnBlock.forwardp  s5    		-0vvm,VVM*
vvm, /;.@.@+
Hfe#++J&5.QYYZ[]^`ab''
HfunM
yyz:#s8}'>?yy15 $++J&5.Q#++Aq!4iil;CCJPXZ`bghmmK0+%%r@   rT  rN   s   @r?   rV  rV  e  s    f&r@   rV  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEConvDownsamplec                 `    t         |           t        j                  ||ddd      | _        y )Nr   r   r   rI  )r1   rk   r   rp   convr]  s     r?   rk   z!JanusVQVAEConvDownsample.__init__  s'    IIk;AaYZ[	r@   c                 Z    t        j                  |ddd      }| j                  |      }|S )N)r   r   r   r   constantr   )padr   r   )r<  rg  rd  r   s     r?   r   z JanusVQVAEConvDownsample.forward  s+    mJVWX		-0r@   rT  rN   s   @r?   rb  rb    s    \r@   rb  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEConvUpsamplec                 t    t         |           t        j                  j	                  ||ddd      | _        y )Nr   r   rI  )r1   rk   r8   r   rp   rd  r]  s     r?   rk   zJanusVQVAEConvUpsample.__init__  s.    HHOOK!TU_`Oa	r@   c                 X    t        j                  |dd      }| j                  |      }|S )Ng       @nearest)scale_factorr   )r<  r   rd  r   s     r?   r   zJanusVQVAEConvUpsample.forward  s(    m#IV		-0r@   rT  rN   s   @r?   ri  ri    s    br@   ri  c                   `     e Zd Zdedef fdZdej                  dej                  fdZ xZ	S )JanusVQVAEMidBlockr%   r`  c                     t         |           t        |||      | _        t	        |      | _        t        |||      | _        y )Nr%   re   rf   )r1   rk   rB  block_1rV  attn_1block_2)r<   r%   r`  r>   s      r?   rk   zJanusVQVAEMidBlock.__init__  sF    , !

 *(3, !
r@   rZ   r|   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )rr  rs  rt  r   s     r?   r   zJanusVQVAEMidBlock.forward  s2    ]3M2]3r@   )
rA   rB   rC   r"   r   rk   r8   r   r   rM   rN   s   @r?   ro  ro    s2    
/ 
3 
U\\ ell r@   ro  c                   >     e Zd Z fdZdej
                  fdZ xZS )JanusVQVAEEncoderc           	         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }|j                  }|j                  }|j                  }t        j                  j                  ||ddd      | _        dt        |      z   }|| _        t        j                          | _        t%        | j                        D ]   }t        j                          }	t        j                          }
|||   z  }|||   z  }t%        | j
                        D ]N  }|	j'                  t)        |||             |}|| j                  dz
  k(  s5|
j'                  t+        |             P t        j,                         }|	|_        |
|_        || j                  dz
  k7  rt3        |      |_        | j"                  j'                  |        t7        |      | _        t        j                  j;                  d|dd	      | _        t        j                  j                  ||rd
|z  n|ddd      | _        y )Nr   r   rI  )r   rq  rD  rE  TrF  r   ) r1   rk   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsre   double_latentlatent_channelsr8   r   rp   conv_inr]   in_channel_multiplierr   downr   appendrB  rV  Moduleblockattnrb  
downsamplero  midrK  norm_outconv_out)r<   r%   r}  re   r~  r  rz  r  i_levelr  r  block_in	block_outi_blockr  r>   s                  r?   rk   zJanusVQVAEEncoder.__init__  s   "6#<#<=$33,,((,, 00#66xx{MqYZdef $u-?'@ @%:"MMO	T112 	#GMMOE==?D$'<W'EEH%(:7(CCI !4!45 
?)%$,%. %d22Q66KK 3H =>
? 99;DDJDI$..22":8"DIIT"-	#0 &fh7**bxUYbf*g#0Ao ( 
r@   r   c                    | j                  |      g}t        | j                        D ]  }t        | j                        D ]  } | j                  |   j
                  |   |d         }t        | j                  |   j                        dkD  r" | j                  |   j                  |   |      }|j                  |        || j                  dz
  k7  s|j                  | j                  |   j                  |d                 |d   }| j                  |      }| j                  |      }|t        j                  |      z  }| j                  |      }|S )Nr/   r   r   )r  r   r{  r|  r  r  ry  r  r  r  r  r  r8   rS  r  )r<   r   rZ   r  r  r-  rY   s          r?   r   zJanusVQVAEEncoder.forward  sT   l34T112 		WG !4!45 3@tyy177@!"%  tyy)../!3#C499W#5#:#:7#CL#QL$$\23 $..22$$TYYw%7%B%B=QSCT%UV		W *"- HH%67 !MM*;<U]]+<== MM*;<  r@   )rA   rB   rC   rk   r8   r@  r   rM   rN   s   @r?   rw  rw    s    1
f!E$4$4 !r@   rw  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )JanusVQVAEDecoderc           	      v   t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }|j                  }||j                  | j                  dz
     z  }t        j                  j                  ||ddd      | _        t        ||      | _        t        j                         | _        t#        t%        | j                              D ]  }t        j                         }t        j                         }||j                  |   z  }	t%        | j
                  dz         D ]N  }
|j'                  t)        |||	             |	}|| j                  dz
  k(  s5|j'                  t+        |             P t        j,                         }||_        ||_        |dk7  rt3        |      |_        | j                   j'                  |        t        j                  j7                  d|dd	      | _        t        j                  j                  ||ddd      | _        y )
Nr   r   rI  rq  r   rD  rE  TrF  )r1   rk   ry  rz  r{  r|  r}  r  rf   r8   r   rp   r  ro  r  r   upreversedr   r  rB  rV  r  r  r  ri  upsamplerK  r  r  )r<   r%   r}  r  rf   r  r  r  r  r  r  r  r>   s               r?   rk   zJanusVQVAEDecoder.__init__  s   "6#<#<=$33,, 00** !6#<#<T=Q=QTU=U#VV xxaXYcde &fh7 --/d&:&: ;< 	GMMOE==?D%(A(A'(JJI !4!4q!89 
?)%$,%. %d22Q66KK 3H =>
? BBHBG!|4X>GGNN2)	. **bxUYbf*g,AVWabcr@   r-  r|   c                 b   | j                  |      }| j                  |      }t        | j                        D ]  }t        | j                  dz         D ]l  } | j
                  |   j                  |   |      }t        | j
                  |   j                        dkD  sK | j
                  |   j                  |   |      }n || j                  dz
  k7  s| j
                  |   j                  |      } | j                  |      }|t        j                  |      z  }| j                  |      }|S )Nr   r   )r  r  r   r{  r|  r  r  ry  r  r  r  r8   rS  r  )r<   r-  r  r  s       r?   r   zJanusVQVAEDecoder.forward1  s    ||L1 xx- T112 	GG !4!4q!89 P>twww/55g>|Ltwww',,-1#A4777#3#8#8#A,#OLP $..22#www/88F	G }}\2l33}}\2r@   )rA   rB   rC   rk   r8   rU   r   rM   rN   s   @r?   r  r    s)    ,d\E$5$5 %:K:K r@   r  c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   y)JanusVQVAEModelOutputa  
    quantized_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Quantized last hidden state from the VQ-VAE model.
    image_tokens (`torch.FloatTensor` of shape `(batch_size, config.vocab_size`):
        Indices of the image tokens predicted by the VQ-VAE model.
    embedding_loss (`torch.FloatTensor`):
        The embedding loss computed during quantization.
    Nquantized_last_hidden_stater:  rS   )
rA   rB   rC   rT   r  r8   rU   rD   r:  rS   rV   r@   r?   r  r  F  sJ     =A!2!2T!9@-1L%##d*1/3NE%%,3r@   r  aS  
    The VQ-VAE model used in Janus for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    c                        e Zd ZU eed<   g dZeedZdZ	def fdZ
edej                  dee   defd       Zd	ej                  dej$                  fd
Zeedej$                  deej$                  ej$                  f   fd              Z xZS )
JanusVQVAEr%   )rV  rB  r%  r  r   c                    t         |   |       t        |      | _        t	        |      | _        t        j                  j                  |j                  |j                  d      | _        t        j                  j                  |j                  |j                  d      | _        | j                          t        |      | _        d| _        | j#                          y )Nr   F)r1   rk   rw  r
  r%  quantizer8   r   rp   r  rm   
quant_convpost_quant_convevalr  decoderr  r  rx   s     r?   rk   zJanusVQVAE.__init__l  s     (01&9((//&*@*@&BRBRTUV$xxv/?/?AWAWYZ[		(0&+#r@   r   r|   c                     | j                  |      }| j                  |      }| j                  |      \  }}}t        ||||      S )N)rY   r  r:  rS   )r
  r  r  r  )r<   r   r   rZ   conv_hidden_statesr  emb_lossindicess           r?   encodezJanusVQVAE.encodex  sO    \2!__];9=GY9Z6#Xw$+(C #	
 	
r@   r:  c                    |j                   d   | j                  j                  d   | j                  j                  d   z  k7  rMt        d| j                  j                  d   | j                  j                  d   z   d|j                    d      | j                  j	                  |      }| j                  |      }| j                  |      }|S )aG  
        Decodes quantized token IDs into pixel values.
        Args:
            image_tokens (torch.LongTensor): Batch of token IDs.
        Returns:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                Pixel values decoded from the token IDs.
        r   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r:   r  r,  r   r?  r  r  )r<   r:  codebook_entryrZ   r   s        r?   decodezJanusVQVAE.decode  s     a DMM$B$B1$EHfHfghHi$iiFt}}GeGefgGhkokxkx  lJ  lJ  KL  lM  HM  GN N""."4"4!5R9  99,G,,^<||M2r@   c                     |j                   d   } | j                  |fddi|}| j                  |j                  j	                  |d            }t        ||j                        S )Nr   return_dictTr/   )r:   r  r  r:  r   rQ   rS   )r<   r   r   r   encode_outputsrR   s         r?   r   zJanusVQVAE.forward  se     "''*
$\NtNvN#{{>+F+F+K+KJXZ+[\ 4n6S6STTr@   )rA   rB   rC   r"   rD   rH   rB  rV  r  r  rk   r   r8   r@  r   r   r  r  rU   r  r   r   r]   r   rM   rN   s   @r?   r  r  W  s      /) %O
/ 
 	
5#3#3 	
vFX?Y 	
^s 	
 	
5#3#3 8I8I & 	U''	U 
u  %"3"33	4		U  	Ur@   r  c                   *     e Zd Zdef fdZd Z xZS )JanusVQVAEAlignerMLPr%   c           	         t         |           t        j                  |j                  |j
                        | _        t        j                  t        d|j                        D cg c],  }t        j                  |j
                  |j
                        . c}      | _
        t        |j                     | _        y c c}w r  )r1   rk   r   r   rm   r  r   r   r   r   r  r   r   r   r  s      r?   rk   zJanusVQVAEAlignerMLP.__init__  s    99V--v/D/DE]]NSTUW]WoWoNpqRYYv,,f.C.CDq
 $F$5$56 rr  c                 |    | j                  |      }| j                  D ]  }| j                  |      } ||      } |S r   r   r!  s      r?   r   zJanusVQVAEAlignerMLP.forward  r#  r@   )rA   rB   rC   r"   rk   r   rM   rN   s   @r?   r  r    s    7/ 7r@   r  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ	 xZ
S )JanusVQVAEHeadzOHead used for sampling tokens in image generation, replacing the usual lm head.r%   c                    t         |           t        j                  |j                  |j
                        | _        t        |j                     | _	        t        j                  |j
                  |j                        | _        y r   )r1   rk   r   r   image_token_embed_dimr  r\  r   r   r   r(  vision_headrx   s     r?   rk   zJanusVQVAEHead.__init__  s^    		&">">@U@UV#F$5$5699V%:%:F<Q<QRr@   rZ   r|   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r\  r   r  r   s     r?   r   zJanusVQVAEHead.forward  s6    m4**=9((7r@   )rA   rB   rC   rT   r"   rk   r8   r   tensorr   rM   rN   s   @r?   r  r    s0    YS/ SU\\ ell r@   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                       e Zd Zdef fdZd Zd Zeede	j                  dee   deez  fd              Zd	e	j                   d
e	j                  de	j                  fdZee	 	 	 	 	 	 	 	 	 dd	e	j                   dz  de	j                  dz  de	j$                  dz  de	j                   dz  dedz  de	j                   dz  d
e	j                  dz  dedz  dee	j$                  z  defd              Z xZS )
JanusModelr%   c                    t         |   |       || _        t        j	                  |j
                        | _        t        | j                  j                        | _        t        j	                  |j                        | _        t        j                  | j                  j                  j                  | j                  j                  j                        | _        t#        | j                  j                        | _        t'        | j                  j                        | _        t+        j,                  |j.                        | _        d| _        | j5                          y )N)r%   F)r1   rk   r%   r  _from_configvision_configvision_modelr  alignerr  	vq_configvqmodelr   ru   r(  rm   generation_embeddingsr  generation_alignerr  generation_headr   from_configtext_configlanguage_modelr  r  rx   s     r?   rk   zJanusModel.__init__  s     ,99&:N:NO,T->->-E-EF!..v/?/?@ &(\\$,,2E2E2T2TVZVbVbViViVsVs%t""6t||7J7J"K-dll.A.AB'336;M;MN&+#r@   c                 6    | j                   j                         S r   )r  r  r  s    r?   r  zJanusModel.get_input_embeddings  s    ""7799r@   c                 :    | j                   j                  |       y r   )r  set_input_embeddingsr<   r   s     r?   r  zJanusModel.set_input_embeddings  s    007r@   r   r   r|   c                 p     | j                   |fddi|}| j                  |j                        |_        |S )Nr  T)r  r  rY   r  )r<   r   r   vision_outputss       r?   get_image_featureszJanusModel.get_image_features  s=    
 +**<TTTVT'+||N4T4T'U$r@   	input_idsr  image_featuresc                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r   devicer/   r   r   z6Image features and image tokens do not match, tokens: z, features: )r  r8   r  r%   image_token_idlongr  allr1  r:   r   	expand_asr   r   numel)r<   r  r  r  special_image_maskn_image_tokensn_image_featuress          r?   get_placeholder_maskzJanusModel.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r@   Nr   r7   r,   cache_position	use_cachelogits_to_keepc
                 $   |d u |d uz  rt        d      | | j                         |      }|| j                  |d      j                  }|j	                  d|j
                  d         }|j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d|||||||	d|
}t        |j                  |j                  |j                  |j                   |      S d       S )	NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oneT)r  r/   )r  r  )r  r   r7   r,   r  r  r  )rY   r,   rZ   r[   r\   rV   )r   r  r  r  r   r:   r   r  r   r  masked_scatterr  rX   rY   r,   rZ   r[   )r<   r  r   r   r7   r,   r  r  r  r  r   image_embedsr  image_attention_mask	lm_outputs                  r?   r   zJanusModel.forward
  sS    -t";<s   7D557	BM#22<T2R``L)11"m6I6I"6MNN+..}/C/C]EXEXYN#'#<#<~ $= $  *889M~^M'D'' 	
')%+))	
 	
	 ,'99%55#11 ++0<0H
 	

 OS
 	
r@   )	NNNNNNNNr   )rA   rB   rC   r    rk   r  r  r   r   r8   rU   r   r   r]   r   r  r@  r  r   r	   r   r   rX   r   rM   rN   s   @r?   r  r    sz   { *:8 !--9?@R9S	+	+  "))":?:K:K"]b]n]n"0  .215.204(,2626!%-..
##d*.
 ''$..
 t+	.

 &&-.
 .
 ((4/.
 ((4/.
 $;.
 ell*.
 
&.
  .
r@   r  c                   x    e Zd ZddiZdZdZdef fdZd Zd Z	d	e
j                  d
e
j                  fdZee	 	 	 	 	 	 	 	 	 	 dde
j                  dz  de
j                   dz  de
j                  dz  de
j                  dz  dedz  de
j                  dz  de
j                   dz  de
j                  dz  dedz  dee
j                  z  dee   d
efd              Z	 	 	 	 	 	 	 d fd	Zde
j                  fdZ e
j4                         	 	 	 d d	e
j                  dz  de
j                  dz  dedz  f fd       Z xZS )!JanusForConditionalGenerationzlm_head.weightz(model.language_model.embed_tokens.weightr'   Tr%   c                     t         |   |       || _        t        |      | _        t        j                  |j                  j                  |j                  j                  d      | _
        | j                          y )NFr   )r1   rk   r%   r  r&   r   r   r  rl   
vocab_sizelm_headr  rx   s     r?   rk   z&JanusForConditionalGeneration.__init__B  s\     '
yy!3!3!?!?ASASA^A^ejk 	r@   c                 J    | j                   j                  j                         S r   )r&   r  r  r  s    r?   r  z2JanusForConditionalGeneration.get_input_embeddingsK  s    zz((==??r@   c                 N    | j                   j                  j                  |       y r   )r&   r  r  r  s     r?   r  z2JanusForConditionalGeneration.set_input_embeddingsN  s    

!!66u=r@   inputsr|   c                 r    | j                   j                  |      }| j                   j                  |      }|S r   )r&   r  r  )r<   r  r-  s      r?   'prepare_embeddings_for_image_generationzEJanusForConditionalGeneration.prepare_embeddings_for_image_generationQ  s0    zz77?zz44\Br@   Nr  r   r   r7   r,   r  r  labelsr  r  r   c                     | j                   d|||||||	|d|}|j                  }t        |
t              rt	        |
 d      n|
}| j                  |dd|ddf         }d}|4 | j                  d||| j                  j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        )r  r   r   r7   r,   r  r  r  N)ra   r  r  )r`   ra   r,   rZ   r[   r\   rV   )r&   rY   r3   r   slicer  loss_functionr%   r  r  r_   r,   rZ   r[   r\   )r<   r  r   r   r7   r,   r  r  r  r  r  r   outputsrZ   slice_indicesra   r`   s                    r?   r   z%JanusForConditionalGeneration.forwardV  s    , $** 

%)%+')

 

  118B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD +#33!//)) ' ; ;
 	
r@   c	           
      h    t        |   |f||||||d|	}
|s|	j                  dd      s||
d<   |
S )N)r,   r  r   r  r  is_first_iterationr  Tr   )r1   prepare_inputs_for_generationget)r<   r  r   r,   r   r  r  r  r  r   model_inputsr>   s              r?   r  z;JanusForConditionalGeneration.prepare_inputs_for_generation  sZ     w<	
+')))1	
 	
 VZZT%B+7L(r@   r:  c                 x    | j                   j                  j                  |      }|j                  dddd      }|S )a,  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.
        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
        r   r   r   r   )r&   r  r  r   )r<   r:  decoded_images      r?   decode_image_tokensz1JanusForConditionalGeneration.decode_image_tokens  s:     

**11,?%--aAq9r@   logits_processorc           	         |j                  d| j                        }t        j                  |      }|j                  dd      }|dk(  rt	        %|   d|||d d|S  |j                  di |}|j                         t        j                  t        j                  fvrt        d      |j                          | j                  |j                                ||n	t               }d|d<   |j                  t         j#                  d       d	|_        |j                  |d
<   | j%                  ||j&                  |      \  }}	}|j(                  |j*                  }}
t-        |j.                        dk7  rt        d|j.                   d      |d u}| j1                  |||j*                         |j                  r:|j                  dkD  r+|j3                  t5        |j                               d |_        | j7                  ||j.                  d   |d ||      } | j8                  d|||j:                  d|\  }}| j<                  j>                  j@                  jB                  }|j.                  \  }}|jE                  dd      }|j                  dd       }|jE                  dd      }||d<   ||d d d f   |j&                  k7  ||d d d f   |jF                  d   k7  z  }||d d d f   jI                  ||jJ                          | jM                         |      }| jO                  |||      }|jQ                  dd       @| jS                  |jT                  xs d|dz  tW        |jX                  ||z         |      |d<   t[        j\                  ||f|
|      }|j^                  }|j`                  }|jb                  }|jd                  }|jf                  }|r|rdnd }|r|rdnd }|r|rdnd }|r|rdnd }ti        |      D ]|  } | jj                  d||d|}d|v r!|d   jm                  |j*                        |d<   |d   jm                  |j*                        |d<    | j<                  jn                  di |||d}| jq                  ||      }|jr                  d d dd d f   ju                         } | j<                  jw                  |       }! |||!      }"|jx                  r>t[        jz                  |"d      }#t[        j|                  |#d      j                  d      }$nt[        j                  |"d      }$|$|d d |f<   t[        j                  |$|$g      }$|$j                  d      }$| j                  |$      } |r@|r|!fz  }|r| j                         fz  }|r|j                  z  }|r|j                  z  }|rt        |!|||j                        S |S ) Ngeneration_configgeneration_moder)   )r  r   r  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r   r   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  r   )r  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr  r  )r  r   expand_sizer   boi_token_idr,   static)cache_implementationr   max_cache_lenmodel_kwargsr  rV   )r  r  r  )output_attentionsoutput_hidden_statesr/   r0  )num_samples)	sequencesscoresra   r[   rZ   r,   )Ipopr  copydeepcopyr1   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   r   loggerwarning_prepare_model_inputsbos_token_idr   r  ry  r:   _prepare_special_tokensr  r
   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr&   r  r%   num_image_tokensrepeatgeneration_kwargsmasked_fill_pad_token_idr  _get_initial_cache_positionr  _prepare_static_cacher  max
max_lengthr8   zerosr  r  output_scoresoutput_logitsreturn_dict_in_generater   r  r   r  #_update_model_kwargs_for_generationrY   cloner  	do_sampler   multinomialsqueezeargmaxcatr   r  floatr[   rZ   r   r,   )&r<   r  r   r  r   r  r  r
  r  model_input_namer   r  kwargs_has_attention_maskr"  r   r   input_tokensmaskr  generated_tokensr  r  r,  r-  r.  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r  r-  r  next_token_scoresprobs
next_tokenr>   s&                                        r?   r  z&JanusForConditionalGeneration.generate  s    #JJ':D<R<RS MM*;< !**%6?f$7# -"3#	
   0(//9&9 002>;P;PR`RnRn:ooT  	""$##L$5$5$78 0@/K+QdQf %)[!++3NNrs/0,):)I)I%& 594N4N%22L5
1	#\ ")9)9vy1$MiooM^EF  %3$$>!$$%68QZcZjZj$k ++0A0P0PST0T##$IJ[JjJj$kl/3,  55/!*!3'%)- 6 
 #E$"D"D #
))>>#
 	#
	<  ::2299JJ'oo
G ''1-%))*:DA'..q!4)7%& Z[!^,0A0N0NNa(,=,O,OP^,__
 	Z[!^$11$8I8V8VW3113LA77V-t4<.2.H.H%6%K%K%Wx%>!"3">">@PSZ@Z[) /I /L*+ !;;
4D'EU[ab .??0EE)77)77"3"K"K3RD
3RD
'>CW^b$;@QRX\'( #	UA=4== +|GSL  </1=>N1O1R1RS`SgSg1h-.-9:J-K-N-N}OcOc-dL)*/djj// "3%9G  CCG\ZL"44QAX>DDFL ZZ//=F 0F C !**&7R@"..u!DLLRP
"\\*;D
%/QT" J
#;<J#--b1J HHTMG#	UJ #vi'
|11355
 "g&8&88"#%)>)>>%",*!-3 ' 7 7  $#r@   )
NNNNNNNNNr   )NNNNNNF)NNN)rA   rB   rC   _tied_weights_keysoutput_modalitiesrL   r    rk   r  r  r8   r   r  r   r   r@  rU   r	   r   r   r   r   r_   r   r  r  no_gradr   r  rM   rN   s   @r?   r  r  =  s   *,VW)!{ @>ell u|| 
  .215.204(,2626*.!%-.1
##d*1
 ''$.1
 t+	1

 &&-1
 1
 ((4/1
 ((4/1
   4'1
 $;1
 ell*1
 +,1
 
%1
  1
l   D
 
 U]]_ '+267;	|$t#|$ ((4/|$ .4	|$ |$r@   r  )r$   r  r  r  r  )r   )Ur  collections.abcr   dataclassesr   r8   torch.nn.functionalr   r   r<   r   r5   activationsr   cache_utilsr	   
generationr
   r   r   r   generation.utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   autor   configuration_janusr    r!   r"   
get_loggerrA   r  r$   rQ   rX   r_   r  r4   r   r   r   r6  r   r   r   r+   r   r  r  r%  rB  rV  rb  ri  ro  rw  r  r  r  r  r  r  r  __all__rV   r@   r?   <module>rY     sq  *  $ !     & !   u u 9 9 X X F &  0  Q Q 
		H	% i? i i$ 
	4{ 	4 	4 
@; @ @6 
@+ @ @4HBII HV	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%4I$299 I$XRYY ( 8  F@ @D 2+ 2 2jBII $<"		 <"~)(BII )(X &"))  &F	ryy 	RYY  ,J!		 J!ZA		 AH 46 4  4 CU% CUCUL299 $RYY   
n
% n

n
by$$8/ y$x	 tr@   