
    i                         d Z ddlZddlZddlZddlZddlZddlZddlZddlm	Z	m
Z
 ddlmZ ddlmZ ddlmZmZmZmZmZmZ ddlmZ dd	lmZmZmZmZ  e       rddlZ ej<                  e      Z  G d
 de!e      Z" e       rddl#m$Z$  G d dejJ                        Z&e	 G d d             Z'e	 G d de'             Z(e	 G d de'             Z)e	 G d de'             Z*e	 G d de'             Z+e	 G d de'             Z,y)z
Adapted from
https://github.com/huggingface/transformers/blob/52cb4034ada381fe1ffe8d428a1076e5411a8026/src/transformers/utils/quantization_config.py
    N)	dataclassis_dataclass)Enum)partial)AnyCallableDictListOptionalUnion)version   )is_torch_availableis_torchao_availableis_torchao_versionloggingc                        e Zd ZdZdZdZdZdZy)QuantizationMethodbitsandbytesgguftorchaoquantomodeloptN)__name__
__module____qualname__BITS_AND_BYTESGGUFTORCHAOQUANTOMODELOPT     r/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/diffusers/quantizers/quantization_config.pyr   r   .   s    #NDGFHr#   r   )MappingTypec                        e Zd Z fdZ xZS )TorchAoJSONEncoderc                 Z    t        |t              r|j                  S t        |   |      S N)
isinstancer%   namesuperdefault)selfobj	__class__s     r$   r-   zTorchAoJSONEncoder.default:   s%    #{+xx7?3''r#   )r   r   r   r-   __classcell__r0   s   @r$   r'   r'   9   s    	( 	(r#   r'   c                       e Zd ZU dZeed<   g Zedd       Zde	e
ej                  f   fdZdee
ef   fdZd Zd	 Zdd
ede
fdZd Zy)QuantizationConfigMixinz-
    Mixin class for quantization config
    quant_methodc                      | di |}g }|j                         D ]0  \  }}t        ||      st        |||       |j                  |       2 |D ]  }|j	                  |d        |r||fS |S )a  
        Instantiates a [`QuantizationConfigMixin`] from a Python dictionary of parameters.

        Args:
            config_dict (`Dict[str, Any]`):
                Dictionary that will be used to instantiate the configuration object.
            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
                Whether or not to return a list of unused keyword arguments. Used for `from_pretrained` method in
                `PreTrainedModel`.
            kwargs (`Dict[str, Any]`):
                Additional parameters from which to initialize the configuration object.

        Returns:
            [`QuantizationConfigMixin`]: The configuration object instantiated from those parameters.
        Nr"   )itemshasattrsetattrappendpop)clsconfig_dictreturn_unused_kwargskwargsconfig	to_removekeyvalues           r$   	from_dictz!QuantizationConfigMixin.from_dictI   s    $ #{#	 ,,. 	&JCvs#U+  %	&  	"CJJsD!	"  6>!Mr#   json_file_pathc                     t        |dd      5 }| j                         }t        j                  |dd      dz   }|j	                  |       ddd       y# 1 sw Y   yxY w)	a  
        Save this instance to a JSON file.

        Args:
            json_file_path (`str` or `os.PathLike`):
                Path to the JSON file in which this configuration instance's parameters will be saved.
            use_diff (`bool`, *optional*, defaults to `True`):
                If set to `True`, only the difference between the config instance and the default
                `QuantizationConfig()` is serialized to JSON file.
        wzutf-8)encodingr   Tindent	sort_keys
N)opento_dictjsondumpswrite)r.   rE   writerr=   json_strings        r$   to_json_filez$QuantizationConfigMixin.to_json_filej   sU     .#8 	&F,,.K**[dKdRKLL%		& 	& 	&s   =AAreturnc                 @    t        j                  | j                        S )
        Serializes this instance to a Python dictionary. Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
        )copydeepcopy__dict__r.   s    r$   rN   zQuantizationConfigMixin.to_dict{   s    
 }}T]]++r#   c              #      K   t        j                  | j                        j                         D ]  \  }}||f  yw)zTallows `dict(obj)` for situations where obj may be a dict or QuantizationConfigMixinN)rX   rY   rZ   r7   )r.   attrrC   s      r$   __iter__z QuantizationConfigMixin.__iter__   s9     ==7==? 	KD%+	s   =?c                 T    | j                   j                   d| j                          S )N )r0   r   to_json_stringr[   s    r$   __repr__z QuantizationConfigMixin.__repr__   s(    ..))*!D,?,?,A+BCCr#   use_diffc                     |du r| j                         }n| j                         }t        j                  |dd      dz   S )a  
        Serializes this instance to a JSON string.

        Args:
            use_diff (`bool`, *optional*, defaults to `True`):
                If set to `True`, only the difference between the config instance and the default `PretrainedConfig()`
                is serialized to JSON string.

        Returns:
            `str`: String containing all the attributes that make up this configuration instance in JSON format.
        Tr   rI   rL   )to_diff_dictrN   rO   rP   )r.   rc   r=   s      r$   ra   z&QuantizationConfigMixin.to_json_string   s=     t++-K,,.Kzz+a4@4GGr#   c                     g }|j                         D ]0  \  }}t        | |      st        | ||       |j                  |       2 |j                         D ci c]  \  }}||vs|| }}}|S c c}}w )a  
        Updates attributes of this class instance with attributes from `kwargs` if they match existing attributes,
        returning all the unused kwargs.

        Args:
            kwargs (`Dict[str, Any]`):
                Dictionary of attributes to tentatively update this class.

        Returns:
            `Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
        )r7   r8   r9   r:   )r.   r?   rA   rB   rC   unused_kwargss         r$   updatezQuantizationConfigMixin.update   s}     	 ,,. 	&JCtS!c5)  %	& 7=lln]
US\H\e]] ^s   A0&A0NF)T)r   r   r   __doc__r   __annotations___exclude_attributes_at_initclassmethodrD   r   strosPathLikerT   r	   r   rN   r^   rb   boolra   rh   r"   r#   r$   r4   r4   @   s|     %$"$ @&5bkk1A+B &",c3h ,
DHt Hs H$r#   r4   c                       e Zd ZdZg dZ	 	 	 	 	 	 	 	 	 	 ddZed        Zej                  de	fd       Zed        Z
e
j                  de	fd	       Z
d
 Zd Zd Zdeeef   fdZd Zdeeef   fdZy)BitsAndBytesConfiga  
    This is a wrapper class about all possible attributes and features that you can play with a model that has been
    loaded using `bitsandbytes`.

    This replaces `load_in_8bit` or `load_in_4bit` therefore both options are mutually exclusive.

    Currently only supports `LLM.int8()`, `FP4`, and `NF4` quantization. If more methods are added to `bitsandbytes`,
    then more arguments will be added to this class.

    Args:
        load_in_8bit (`bool`, *optional*, defaults to `False`):
            This flag is used to enable 8-bit quantization with LLM.int8().
        load_in_4bit (`bool`, *optional*, defaults to `False`):
            This flag is used to enable 4-bit quantization by replacing the Linear layers with FP4/NF4 layers from
            `bitsandbytes`.
        llm_int8_threshold (`float`, *optional*, defaults to 6.0):
            This corresponds to the outlier threshold for outlier detection as described in `LLM.int8() : 8-bit Matrix
            Multiplication for Transformers at Scale` paper: https://huggingface.co/papers/2208.07339 Any hidden states
            value that is above this threshold will be considered an outlier and the operation on those values will be
            done in fp16. Values are usually normally distributed, that is, most values are in the range [-3.5, 3.5],
            but there are some exceptional systematic outliers that are very differently distributed for large models.
            These outliers are often in the interval [-60, -6] or [6, 60]. Int8 quantization works well for values of
            magnitude ~5, but beyond that, there is a significant performance penalty. A good default threshold is 6,
            but a lower threshold might be needed for more unstable models (small models, fine-tuning).
        llm_int8_skip_modules (`List[str]`, *optional*):
            An explicit list of the modules that we do not want to convert in 8-bit. This is useful for models such as
            Jukebox that has several heads in different places and not necessarily at the last position. For example
            for `CausalLM` models, the last `lm_head` is typically kept in its original `dtype`.
        llm_int8_enable_fp32_cpu_offload (`bool`, *optional*, defaults to `False`):
            This flag is used for advanced use cases and users that are aware of this feature. If you want to split
            your model in different parts and run some parts in int8 on GPU and some parts in fp32 on CPU, you can use
            this flag. This is useful for offloading large models such as `google/flan-t5-xxl`. Note that the int8
            operations will not be run on CPU.
        llm_int8_has_fp16_weight (`bool`, *optional*, defaults to `False`):
            This flag runs LLM.int8() with 16-bit main weights. This is useful for fine-tuning as the weights do not
            have to be converted back and forth for the backward pass.
        bnb_4bit_compute_dtype (`torch.dtype` or str, *optional*, defaults to `torch.float32`):
            This sets the computational type which might be different than the input type. For example, inputs might be
            fp32, but computation can be set to bf16 for speedups.
        bnb_4bit_quant_type (`str`,  *optional*, defaults to `"fp4"`):
            This sets the quantization data type in the bnb.nn.Linear4Bit layers. Options are FP4 and NF4 data types
            which are specified by `fp4` or `nf4`.
        bnb_4bit_use_double_quant (`bool`, *optional*, defaults to `False`):
            This flag is used for nested quantization where the quantization constants from the first quantization are
            quantized again.
        bnb_4bit_quant_storage (`torch.dtype` or str, *optional*, defaults to `torch.uint8`):
            This sets the storage type to pack the quanitzed 4-bit prarams.
        kwargs (`Dict[str, Any]`, *optional*):
            Additional parameters from which to initialize the configuration object.
    )_load_in_4bit_load_in_8bitr5   Nc                 N    t         j                   _        |r|rt        d      | _        | _        | _        | _        | _        | _	        | _
        |	 _        |t        j                   _        nSt        |t               rt#        t        |       _        n-t        |t        j$                        r| _        nt        d      |
t        j&                   _        nbt        |
t               r%|
dvrt        d      t#        t        |
       _        n-t        |
t        j$                        r|
 _        nt        d      |rQt+         fd|D              s=t,        j/                  dt1        |j3                                d j4                   d	        j7                          y )
NVload_in_4bit and load_in_8bit are both True, but only one can be used at the same timez8bnb_4bit_compute_dtype must be a string or a torch.dtype)float16float32int8uint8float64bfloat16zv`bnb_4bit_quant_storage` must be a valid string (one of 'float16', 'float32', 'int8', 'uint8', 'float64', 'bfloat16') z8bnb_4bit_quant_storage must be a string or a torch.dtypec              3   :   K   | ]  }|j                   v   y wr)   )rl   ).0kr.   s     r$   	<genexpr>z.BitsAndBytesConfig.__init__.<locals>.<genexpr>#  s     Ta4#C#CCTs   zUnused kwargs: z. These kwargs are not used in .)r   r   r5   
ValueErrorru   rt   llm_int8_thresholdllm_int8_skip_modules llm_int8_enable_fp32_cpu_offloadllm_int8_has_fp16_weightbnb_4bit_quant_typebnb_4bit_use_double_quanttorchry   bnb_4bit_compute_dtyper*   rn   getattrdtyper{   bnb_4bit_quant_storageallloggerwarninglistkeysr0   	post_init)r.   load_in_8bitload_in_4bitr   r   r   r   r   r   r   r   r?   s   `           r$   __init__zBitsAndBytesConfig.__init__   ss    /==Luvv))"4%:"0P-(@%#6 )B&!)*/--D'.4*1%9O*PD'.<*@D'WXX!)*/++D'.4% .  ! M  +2%9O*PD'.<*@D'WXX#TVTTNN_T&++--@,AA`aeaoao`ppqrsr#   c                     | j                   S r)   )rt   r[   s    r$   r   zBitsAndBytesConfig.load_in_4bit(      !!!r#   rC   c                 z    t        |t              st        d      | j                  r|rt	        d      || _        y )Nload_in_4bit must be a booleanrw   )r*   rq   	TypeErrorr   r   rt   r.   rC   s     r$   r   zBitsAndBytesConfig.load_in_4bit,  7    %&<==uvv"r#   c                     | j                   S r)   )ru   r[   s    r$   r   zBitsAndBytesConfig.load_in_8bit5  r   r#   c                 z    t        |t              st        d      | j                  r|rt	        d      || _        y )Nload_in_8bit must be a booleanrw   )r*   rq   r   r   r   ru   r   s     r$   r   zBitsAndBytesConfig.load_in_8bit9  r   r#   c                    t        | j                  t              st        d      t        | j                  t              st        d      t        | j
                  t              st        d      | j                  %t        | j                  t              st        d      t        | j                  t              st        d      t        | j                  t              st        d      | j                  /t        | j                  t        j                        st        d      t        | j                  t              st        d	      t        | j                   t              st        d
      | j                  rTt#        j$                  t&        j(                  j#                  d            t#        j$                  d      k\  st+        d      yy)z~
        Safety checker that arguments are correct - also replaces some NoneType arguments with their default values.
        r   r   z"llm_int8_threshold must be a floatNz/llm_int8_skip_modules must be a list of stringsz2llm_int8_enable_fp32_cpu_offload must be a booleanz*llm_int8_has_fp16_weight must be a booleanz*bnb_4bit_compute_dtype must be torch.dtypez$bnb_4bit_quant_type must be a stringz+bnb_4bit_use_double_quant must be a booleanr   z0.39.0z[4 bit quantization requires bitsandbytes>=0.39.0 - please upgrade your bitsandbytes version)r*   r   rq   r   r   r   floatr   r   r   r   r   r   r   r   rn   r   r   parse	importlibmetadatar   r[   s    r$   r   zBitsAndBytesConfig.post_initB  st    $++T2<==$++T2<==$1159@AA%%1*TE_E_ae:fMNN$??FPQQ$77>HII&&2:dFaFachcncn;oHII$22C8BCC$88$?IJJW]]93E3E3M3Mn3]%^biboboc
 &
 m &
r#   c                 6    | j                   xs | j                  S )zP
        Returns `True` if the model is quantizable, `False` otherwise.
        )r   r   r[   s    r$   is_quantizablez!BitsAndBytesConfig.is_quantizableg  s       5D$5$55r#   c                     | j                   ry| j                  r| j                  dk(  ry| j                  r| j                  dk(  ryy)z
        This method returns the quantization method used for the model. If the model is not quantizable, it returns
        `None`.
        llm_int8fp4nf4N)r   r   r   r[   s    r$   quantization_methodz&BitsAndBytesConfig.quantization_methodm  sE    
 4#;#;u#D4#;#;u#Dr#   rU   c                    t        j                  | j                        }t        |d         j	                  d      d   |d<   t        |d         j	                  d      d   |d<   | j
                  |d<   | j                  |d<   |S )rW   r   r      r   r   r   )rX   rY   rZ   rn   splitr   r   )r.   outputs     r$   rN   zBitsAndBytesConfig.to_dict{  s    
 t}}-+.v6N/O+P+V+VWZ+[\]+^'(+.v6N/O+P+V+VWZ+[\]+^'(!%!2!2~!%!2!2~r#   c                     | j                         }| j                  j                   dt        j                  |dd       dS )Nr`   r   TrI   rL   )rN   r0   r   rO   rP   r.   r=   s     r$   rb   zBitsAndBytesConfig.__repr__  s;    lln..))*!DJJ{1X\,]+^^`aar#   c                     | j                         }t               j                         }i }|j                         D ]  \  }}|||   k7  s|||<    |S )a'  
        Removes all attributes from config which correspond to the default config attributes for better readability and
        serializes to a Python dictionary.

        Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance,
        )rN   rs   r7   )r.   r=   default_config_dictserializable_config_dictrB   rC   s         r$   re   zBitsAndBytesConfig.to_diff_dict  sh     lln 12::<#%  &++- 	6JC+C0005(-	6 ('r#   )
FFg      @NFFNr   FN)r   r   r   rj   rl   r   propertyr   setterrq   r   r   r   r   r	   rn   r   rN   rb   re   r"   r#   r$   rs   rs      s    1f #U ").!&#!"'#<| " " #$ # # " " #$ # ##J6c3h b(d38n (r#   rs   c                   $    e Zd ZdZdded   fdZy)GGUFQuantizationConfigaI  This is a config class for GGUF Quantization techniques.

    Args:
        compute_dtype: (`torch.dtype`, defaults to `torch.float32`):
            This sets the computational type which might be different than the input type. For example, inputs might be
            fp32, but computation can be set to bf16 for speedups.

    Ncompute_dtypeztorch.dtypec                     t         j                  | _        || _        d| _        d | _        | j                  t        j                  | _        y y )NT)r   r   r5   r   pre_quantizedmodules_to_not_convertr   ry   )r.   r   s     r$   r   zGGUFQuantizationConfig.__init__  sG    .33*! '+#%!&D &r#   r)   )r   r   r   rj   r   r   r"   r#   r$   r   r     s    	/h}&= 	/r#   r   c                        e Zd ZdZ	 ddeedf   deee      ddfdZd Z	 fd	Z
edd
       Zed        Zedefd       Zd Zd Z xZS )TorchAoConfiga  This is a config class for torchao quantization/sparsity techniques.

    Args:
        quant_type (Union[`str`, AOBaseConfig]):
            The type of quantization we want to use, currently supporting:
                - **Integer quantization:**
                    - Full function names: `int4_weight_only`, `int8_dynamic_activation_int4_weight`,
                      `int8_weight_only`, `int8_dynamic_activation_int8_weight`
                    - Shorthands: `int4wo`, `int4dq`, `int8wo`, `int8dq`

                - **Floating point 8-bit quantization:**
                    - Full function names: `float8_weight_only`, `float8_dynamic_activation_float8_weight`,
                      `float8_static_activation_float8_weight`
                    - Shorthands: `float8wo`, `float8wo_e5m2`, `float8wo_e4m3`, `float8dq`, `float8dq_e4m3`,
                      `float8_e4m3_tensor`, `float8_e4m3_row`,

                - **Floating point X-bit quantization:**
                    - Full function names: `fpx_weight_only`
                    - Shorthands: `fpX_eAwB`, where `X` is the number of bits (between `1` to `7`), `A` is the number
                      of exponent bits and `B` is the number of mantissa bits. The constraint of `X == A + B + 1` must
                      be satisfied for a given shorthand notation.

                - **Unsigned Integer quantization:**
                    - Full function names: `uintx_weight_only`
                    - Shorthands: `uint1wo`, `uint2wo`, `uint3wo`, `uint4wo`, `uint5wo`, `uint6wo`, `uint7wo`
                - An AOBaseConfig instance: for more advanced configuration options.
        modules_to_not_convert (`List[str]`, *optional*, default to `None`):
            The list of modules to not quantize, useful for quantizing models that explicitly require to have some
            modules left in their original precision.
        kwargs (`Dict[str, Any]`, *optional*):
            The keyword arguments for the chosen type of quantization, for example, int4_weight_only quantization
            supports two keyword arguments `group_size` and `inner_k_tiles` currently. More API examples and
            documentation of arguments can be found in
            https://github.com/pytorch/ao/tree/main/torchao/quantization#other-available-quantization-techniques

    Example:
        ```python
        from diffusers import FluxTransformer2DModel, TorchAoConfig

        # AOBaseConfig-based configuration
        from torchao.quantization import Int8WeightOnlyConfig

        quantization_config = TorchAoConfig(Int8WeightOnlyConfig())

        # String-based config
        quantization_config = TorchAoConfig("int8wo")
        transformer = FluxTransformer2DModel.from_pretrained(
            "black-forest-labs/Flux.1-Dev",
            subfolder="transformer",
            quantization_config=quantization_config,
            torch_dtype=torch.bfloat16,
        )
        ```
    N
quant_typeAOBaseConfigr   rU   c                     t         j                  | _        || _        || _        d|v r|d   | _        n|| _        | j                          y )Nquant_type_kwargs)r   r   r5   r   r   r   r   )r.   r   r   r?   s       r$   r   zTorchAoConfig.__init__  sK     /66$&<# &(%+,?%@D"%+D"r#   c           	         t        | j                  t              st        dd      r,t	        dt        | j                        j                   d      ddlm} t        | j                  |      s+t        dt        | j                        j                         y t        | j                  t              r| j                         }| j                  |j                         vr|| j                  j                  d      xs | j                  j                  d	      }|r)| j                         st	        d
| j                   d      t	        d
| j                   d      || j                     }t        j                  |      }|j                   j#                         D ch c]N  }|j$                  t        j&                  j(                  t        j&                  j*                  fv r|j,                  P }}t/        | j0                  j                         |z
        }t3        |      dkD  rt	        d| j                   d| d| d      y y c c}w )Nz<=0.9.0z6torchao <= 0.9.0 only supports string quant_type, got z1. Upgrade to torchao > 0.9.0 to use AOBaseConfig.r   )r   z0quant_type must be a AOBaseConfig instance, got r   fpzRequested quantization type: z is not supported on GPUs with CUDA capability <= 8.9. You can check the CUDA capability of your GPU using `torch.cuda.get_device_capability()`.z is not supported or is an incorrect `quant_type` name. If you think the provided quantization type should be supported, please open an issue at https://github.com/huggingface/diffusers/issues.zThe quantization method "z4" does not support the following keyword arguments: z2. The following keywords arguments are supported: r   )r*   r   rn   r   r   typer   torchao.quantization.quant_apir   r   !_get_torchao_quant_type_to_methodr   
startswith&_is_xpu_or_cuda_capability_atleast_8_9inspect	signature
parametersvalueskind	ParameterKEYWORD_ONLYPOSITIONAL_OR_KEYWORDr+   r   r   len)	r.   r   TORCHAO_QUANT_TYPE_METHODSis_floating_quant_typemethodr   param
all_kwargsunsupported_kwargss	            r$   r   zTorchAoConfig.post_init  s9   $//3/!$0 LTRVRaRaMbMkMkLl mF G 
 Ddoo|<"RSWX\XgXgShSqSqRr stt = -)-)O)O)Q&&@&E&E&GG)-)C)CG)L)pPTP_P_PjPjkoPp&)$2]2]2_$77H Ip q 
 !3DOO3D EO P 
 0@F))&1I '1188:::'"3"3"@"@'BSBSBiBi!jj 

J 
 "&d&<&<&A&A&Cj&P!Q%&* //@@t)**\]g\hhik  +3 .$s   AIc                 @   t         |          }t        | j                  t              rd|v rd|d   v rt        |d   d         r=|d   d   j                  j                  t        j                  |d   d         g|d   d<   t        |d   d   t              r[t        |d   d         dk(  sJ d       t        |d   d   d   t              sJ d       t        |d   d   d   t              sJ d       t        d	      |S dd
lm} d || j                        i|d<   |S )z&Convert configuration to a dictionary.r   layoutr   z*layout saves layout name and layout kwargsr   zlayout name must be a stringr   zlayout kwargs must be a dictzlayout must be a list)config_to_dictr-   r   )r,   rN   r*   r   rn   r   r0   r   dataclassesasdictr   r   dictr   torchao.core.configr   )r.   dr   r0   s      r$   rN   zTorchAoConfig.to_dict1  sG   GOdoos+"a'H:M8N,N"5 6x @A-.x8BBKK#**1-@+A(+KL8A)*84 a 34X>Eq!45h?@AEsGssE%a(;&<X&Fq&I3OoQooO%a(;&<X&Fq&I4PpRppP$%<==  ;  ).*IJAlOr#   c                    t        dd      st        d      |j                         }|j                  d      }t	        |t
              r
 | d
d|i|S t        |      dk(  rd|v sJ d       |d   }dd	lm}  ||      } | d
d|i|S )z'Create configuration from a dictionary.>r   zATorchAoConfig requires torchao > 0.9.0 for construction from dictr   r   r-   z8Expected only one key 'default' in quant_type dictionaryr   )config_from_dictr"   )	r   NotImplementedErrorrX   r;   r*   rn   r   r   r   )r<   r=   r>   r?   r   r   s         r$   rD   zTorchAoConfig.from_dictM  s     "#w/%&ijj!&&( __\2
j#&<*<<< :!#	Z(? 	
F	
?  	*
 	9%j1
8j8K88r#   c                 x   t               r!ddlmm}m}mm}m}m}m	}m
} ddlmm dt        j                  ffd}dt         ffd}	||||d}
||||d	}t#        |t        j$                  
      |t#        |t        j$                  
      t#        |t        j&                  
      t#        t        j&                  t        j&                        d |t        j&                        d|i |	d       |	d       |	d       |	d       |	d      }|t#        |t        j(                        t#        |t        j*                        t#        |t        j,                        t#        |t        j.                        t#        |t        j0                        t#        |t        j2                        t#        |t        j4                        d}i }|j7                  |
       |j7                  |       |j7                  |       | j9                         r|j7                  |       |S t;        d      )z`
        Returns supported torchao quantization types with all commonly used notations.
        r   )	'float8_dynamic_activation_float8_weight&float8_static_activation_float8_weightfloat8_weight_onlyfpx_weight_onlyint4_weight_only#int8_dynamic_activation_int4_weight#int8_dynamic_activation_int8_weightint8_weight_onlyuintx_weight_only)PerRow	PerTensorr   c           	          | t         j                  k(  rdnd}i }fD ].  }|u rdnd}t        | |  |        |       f      |d| d| <   0 |S )Ne5m2e4m3tensorrow)activation_dtypeweight_dtypegranularity	float8dq__)r   float8_e5m2r   )r   r+   typesgranularity_clsgranularity_namer   r   r   s        r$   generate_float8dq_typeszPTorchAoConfig._get_torchao_quant_type_to_method.<locals>.generate_float8dq_types|  s}    !&%*;*;!;v(16': O3Bi3OxUZ$CJ?).%*%4%68I$J	DEIdV1-=,>?@ r#   bitsc           	          i }t        d|       D ]$  }| |z
  dz
  }t        ||      |d|  d| d| <   & | dz
  }|dz   dz  }||z
  }t        ||      |d|  <   |S )Nr   )ebitsmbitsr   _emr   )ranger   )r  r   r  r  non_sign_bitsdefault_ebitsdefault_mbitsr   s          r$   generate_fpx_quantization_typeszXTorchAoConfig._get_torchao_quant_type_to_method.<locals>.generate_fpx_quantization_types  s    "1d^ mE 5L1,E:A/Y^fk:lEBtfBugQug67m !%q!.!2q 8 - =%,_MYf%g4&k"r#   )int4wor   int4dqr   )int8wor   int8dqr   )r   )r   r   )float8wor   float8wo_e5m2float8wo_e4m3float8dqr   float8dq_e4m3r                  )r   )r   uint1wouint2wouint3wouint4wouint5wouint6wouint7wozYTorchAoConfig requires torchao to be installed, please install with `pip install torchao`)r   torchao.quantizationr   r   r   r   r   r   r   r   r   torchao.quantization.observerr   r   r   r   intr   r   float8_e4m3fnuint1uint2uint3uint4uint5uint6uint7rh   r   r   )r<   r   r   r   r   r   r   r   r  r  INT4_QUANTIZATION_TYPESINT8_QUANTIZATION_TYPESFLOATX_QUANTIZATION_TYPESUINTX_QUANTIZATION_DTYPESQUANTIZATION_TYPESr   r   r   r   s                  @@@@r$   r   z/TorchAoConfig._get_torchao_quant_type_to_methode  s     !
 
 
 Hu{{  c   +$4=7Z'# +$4=7Z'# $$6UEVEVW&8!();%J[J[!\!();%J]J]!^C;b ");%*%8%8!&!4!4")#)2 *%*=*=>3#)6 9:`7#)< 2!4=#)> 2!4?#)@ 2!4A#)B 2!4C#)D 2!4E#)%L &7"#4EKKH"#4EKKH"#4EKKH"#4EKKH"#4EKKH"#4EKKH"#4EKKH
)% "$%%&=>%%&=>%%&?@99;"))*CD%%k r#   c                      t         j                  j                         r0t         j                  j                         \  } }| dk(  r|dk\  S | dk\  S t         j                  j                         ryt        d      )N   	   TzPTorchAO requires a CUDA compatible GPU or Intel XPU and installation of PyTorch.)r   cudais_availableget_device_capabilityxpuRuntimeError)majorminors     r$   r   z4TorchAoConfig._is_xpu_or_cuda_capability_atleast_8_9  sb    ::""$ ::;;=LE5zz!A:YY##%qrrr#   c                    t        | j                  t              s| j                  S | j                         }| j                  j                         }t        j                  j                         set               rZ| j                  dk(  rJt        j                  t        j                  j                  d            t        j                  d      k\  r|j                  dd      t        j                  j                         rt        j                  t        j                  j                  d            t        j                  d      k\  rnt        j                  t        j                  j                  d            t        j                  d      kD  r&d	d
lm} d	dlm}  |       |d<   |j(                  |d<   nt+        d      d	dlm}  |       |d<    || j                     di |S )zBCreate the appropriate quantization method based on configuration.r   r   z0.8.0r   Nz0.11.0r   z2.7.9r   )Int4XPULayout)ZeroPointDomainzero_point_domainzTorchAoConfig requires torchao >= 0.11.0 and torch >= 2.8.0 for XPU support. Please upgrade the version or use run on CPU with the cpu version pytorch.)Int4CPULayoutr"   )r*   r   rn   r   r   rX   r   r6  r7  r   r   r   r   r   getr9  torchao.dtypesr>  %torchao.quantization.quant_primitivesr?  INTr   rA  )r.   methodsr   r>  r?  rA  s         r$   get_apply_tensor_subclassz'TorchAoConfig.get_apply_tensor_subclass  sp   $//3/??"<<>G $ 6 6 ; ; =JJ++-(*OO'99MM)"4"4"<"<Y"GHGMMZaLbb%))(D9A99))+}}Y%7%7%?%?	%JKw}} P !--	(:(:(B(B7(KLw}}]dOee@Y6Co)(3APATAT)*=>( v  =2?/%h/+74??+@.?@@r#   c                     | j                         }| j                  j                   dt        j                  |ddt
               dS )a_  
        Example of how this looks for `TorchAoConfig("uint4wo", group_size=32)`:

        ```
        TorchAoConfig {
            "modules_to_not_convert": null,
            "quant_method": "torchao",
            "quant_type": "uint4wo",
            "quant_type_kwargs": {
                "group_size": 32
            }
        }
        ```
        r`   r   T)rJ   rK   r<   rL   )rN   r0   r   rO   rP   r'   r   s     r$   rb   zTorchAoConfig.__repr__  sB     lln~~&&'qKUY_q)r(ssuv	
r#   r)   ri   )r   r   r   rj   r   rn   r   r
   r   r   rN   rm   rD   r   staticmethodrq   r   rG  rb   r1   r2   s   @r$   r   r     s    5t 7;#~-. !)c 3
 
$*X8 9 9. D DL 	sD 	s 	s AD
r#   r   c                   8    e Zd ZdZ	 	 ddedeee      fdZd Zy)QuantoConfiga  
    This is a wrapper class about all possible attributes and features that you can play with a model that has been
    loaded using `quanto`.

    Args:
        weights_dtype (`str`, *optional*, defaults to `"int8"`):
            The target dtype for the weights after quantization. Supported values are ("float8","int8","int4","int2")
       modules_to_not_convert (`list`, *optional*, default to `None`):
            The list of modules to not quantize, useful for quantizing models that explicitly require to have some
            modules left in their original precision (e.g. Whisper encoder, Llava encoder, Mixtral gate layers).
    Nweights_dtyper   c                 j    t         j                  | _        || _        || _        | j                          y r)   )r   r    r5   rL  r   r   )r.   rL  r   r?   s       r$   r   zQuantoConfig.__init__=  s,     /55*&<#r#   c                 ^    g d}| j                   |vrt        d| d| j                          y)z;
        Safety checker that arguments are correct
        )float8rz   int4int2zOnly support weights in z but found N)rL  r   )r.   accepted_weightss     r$   r   zQuantoConfig.post_initI  sA     >%5578H7IUYUgUgThijj 6r#   )rz   N)	r   r   r   rj   rn   r   r
   r   r   r"   r#   r$   rK  rK  /  s5    
 $6:

 !)c 3
kr#   rK  c                       e Zd ZdZddddddZdddZ	 	 	 	 	 	 	 	 	 	 dd	ed
eee      de	dee
   dee
   dee
   dee
   dedee   dee   de	ddfdZddefdZd	edefdZdeeef   fdZy)NVIDIAModelOptConfiga  This is a config class to use nvidia modelopt for quantization.

    Args:
        quant_type (`str`):
            The type of quantization we want to use, following is how to use:
                **weightquant_activationquant ==> FP8_FP8** In the above example we have use FP8 for both weight and
                activation quantization. Following are the all the options:
                    - FP8
                    - INT8
                    - INT4
                    - NF4
                    - NVFP4
        modules_to_not_convert (`List[str]`, *optional*, default to `None`):
            The list of modules to not quantize, useful for quantizing models that explicitly require to have some
        weight_only (`bool`, *optional*, default to `False`):
            If set to `True`, the quantization will be applied only to the weights of the model.
        channel_quantize (`int`, *optional*, default to `None`):
            The channel quantization axis, useful for quantizing models across different axes.
        block_quantize (`int`, *optional*, default to `None`):
            The block size, useful to further quantize each channel/axes into blocks.
        scale_channel_quantize (`int`, *optional*, default to `None`):
            The scale channel quantization axis, useful for quantizing calculated scale across different axes.
        scale_block_quantize (`int`, *optional*, default to `None`):
            The scale block size, useful for quantizing each scale channel/axes into blocks.
        algorithm (`str`, *optional*, default to `"max"`):
            The algorithm to use for quantization, currently only supports `"max"`.
        forward_loop (`Callable`, *optional*, default to `None`):
            The forward loop function to use for calibration during quantization.
        modelopt_config (`dict`, *optional*, default to `None`):
            The modelopt config, useful for passing custom configs to modelopt.
        disable_conv_quantization (`bool`, *optional*, default to `False`):
            If set to `True`, the quantization will be disabled for convolutional layers.
        kwargs (`Dict[str, Any]`, *optional*):
            Additional parameters which are to be used for calibration.
    )r  r  r4  r  )r   r   )FP8INT8INT4NF4NVFP4)rX  rY  Nr   r   weight_onlychannel_quantizeblock_quantizescale_channel_quantizescale_block_quantize	algorithmforward_loopmodelopt_configdisable_conv_quantizationrU   c                    t         j                  | _        | j                  |       || _        || _        || _        || _        d|i| _        |	| _	        || _
        || _        |
s| j                         n|
| _        || _        y )Nr   )r   r!   r5   _normalize_quant_typer   rZ  r[  r\  	calib_cfgr`  r]  r^  get_config_from_quant_typera  rb  )r.   r   r   rZ  r[  r\  r]  r^  r_  r`  ra  rb  r?   s                r$   r   zNVIDIAModelOptConfig.__init__  s     /77"":.&<#& 0,i
 )&<#$8!HWt>>@]l)B&r#   	operationc                 d    ddl m} t        |      dk(  rd| d}t        j                  |       y y )Nr   )_PATCHED_CLASSESzNot z weights in modelopt format. This might cause unreliable behavior.Please make sure to run the following code before loading/saving model weights:

    from modelopt.torch.opt import enable_huggingface_checkpointing
    enable_huggingface_checkpointing()
)&modelopt.torch.opt.plugins.huggingfaceri  r   warningswarn)r.   rg  ri  warning_msgs       r$   check_model_patchingz)NVIDIAModelOptConfig.check_model_patching  s<    K A%yk "; ;  MM+& &r#   c                    |j                  d      }|d   }t        |      dkD  r|d   nd}t        |      dkD  rt        j                  d| d       d}d}n\|t        j
                  vrt        j                  d	| d
       d}|-|t        j
                  vrt        j                  d| d       d}||d|z   z   | _        ydz   | _        y)a  
        Validates and normalizes the quantization type string.

        Splits the quant_type into weight and activation components, verifies them against supported types, and
        replaces unsupported values with safe defaults.

        Args:
            quant_type (str): The input quantization type string (e.g., 'FP8_INT8').

        Returns:
            str: A valid quantization type string (e.g., 'FP8_INT8' or 'FP8').
        r   r   r   Nr   zQuantization type z. is not supported. Picking FP8_INT8 as defaultrU  zWeight Quantization type z) is not supported. Picking FP8 as defaultzActivation Quantization type z* is not supported. Picking INT8 as default )r   r   r   r   rT  quanttype_to_numbitsr   )r.   r   partsw_typeact_types        r$   rd  z*NVIDIAModelOptConfig._normalize_quant_type  s       %q"5zA~584u:>NN/
|;ijkFH1FFF!:6(Bklm#8L8a8a(a!>xjHrst h6JC(NSPRSr#   c                    ddl mc m} ddii ddii i i i d|j                  j                  | j
                  d}|d   }| j                  r|D ]  }d	|vs||   rd||   d<    | j                  j                  d
      }|d   }t        |      dkD  r|d   j                  dd      nd}|D ]]  }||j                  j                  vsd||   vs$|dk(  r|t        j                  |   ||   d<   Et        j                  |   ||   d<   _ | j                  K| j                  ?| j                  | j                  i|d	   d<   | j                  | j                  ddi|d   d<   n8| j                  ,| j                  |d	   d<   | j                  |d   d<   d|d   d<   | j                  | j                   |t        j"                  v rA|d	   d   j%                  t        j"                  |   | j                  | j                   id       |rS|t        j"                  v rA|d   d   j%                  t        j"                  |   | j                  | j                   id       |S )z<
        Get the config from the quantization type.
        r   N
fake_quantFenable)*weight_quantizer*input_quantizerz*output_quantizerz*q_bmm_quantizerz*k_bmm_quantizerz*v_bmm_quantizerz*softmax_quantizer)	quant_cfgr_  rz  rx  r   r   Arp  ry  num_bitsblock_sizesr   dynamicaxis)
scale_bitsscale_block_sizes)modelopt.torch.quantizationr   quantizationr@   _default_disabled_quantizer_cfgre  rZ  r   r   r   replacerT  rq  r\  r[  r]  r^  quanttype_to_scalingbitsrh   )r.   mtqBASE_CONFIGrz  r   rr  rs  rt  s           r$   rf  z/NVIDIAModelOptConfig.get_config_from_quant_type  s    	21 '3E%:$&&.%6$&$&$&&(	 **<<	 
  ,	 3&a/	!-2IaL*3 %%c*q03E
Q58##C,D 	]A

BBBxW`abWcGc**+3G3\3\]e3f	!Z0+?+T+TU[+\	!Z(	] *t/D/D/P=A=R=RTXTgTg<hI)*=9%%t':':	<I()-8 "".595J5JI)*62484I4II()&14=I()&1 &&2t7P7P7\-FFF-.}=DD&:&S&STZ&[.2.I.I4KdKd-e H(<(U(UU,-m<CC&:&S&ST\&].2.I.I4KdKd-e r#   )
NTNNNNmaxNNF)loading)r   r   r   rj   rq  r  rn   r   r
   rq   r%  r   r   r   rn  rd  r	   r   rf  r"   r#   r$   rT  rT  R  s3   "J    7; *.(,04.2+/*.*/CC !)c 3C 	C
 #3-C !C !)C 'smC C x(C "$C $(C 
C>'c 'T T T:ADcN Ar#   rT  )-rj   rX   r   importlib.metadatar   r   rO   ro   rk  r   r   enumr   	functoolsr   typingr   r   r	   r
   r   r   	packagingr   utilsr   r   r   r   r   
get_loggerr   r   rn   r   rD  r%   JSONEncoderr'   r4   rs   r   r   rK  rT  r"   r#   r$   <module>r     s9  $
      	  /   = =  Y Y 			H	%d  A(T-- ( o o od l(0 l( l(^ /4 / /, q
+ q
 q
h k* k kD {2 { {r#   