
    i">                         d dl Z d dlZd dlmZ d dlmZmZmZmZ d dl	m
Z
mZmZmZ d dlmZ erd dlmZ e G d d             Zy)	    N)	dataclass)TYPE_CHECKINGLiteralOptionalUnion)DeepSpeedSequenceParallelConfigDistributedTypeTorchContextParallelConfigTorchTensorParallelConfig)is_torch_version)Acceleratorc                   >   e Zd ZU dZdZee   ed<   dZee   ed<   dZ	ee   ed<   dZ
ee   ed<   dZed   ed<   dZee   ed	<   dZed
   ed<   dZedef   ed<   dZedef   ed<   dZedef   ed<   dZd Zd Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed        Z"ed        Z#ed        Z$ed        Z%ed        Z&ed        Z'de(fd Z)d*dee(   fd!Z*d"e+e+ed#f   e+e(d#f   f   fd$Z,d% Z-d&e(d'efd(Z.d+d)Z/y),ParallelismConfiga  
    A dataclass to configure parallelisms applied to the model. Inspired by torchtitan's `ParallelDims`
    https://github.com/pytorch/torchtitan/blob/main/torchtitan/distributed/parallel_dims.py

    Args:
        dp_replicate_size (`int`, defaults to `1`):
            The size of the data parallel group. If `dp_replicate_size` is set to 1, the data parallel replication
            group will not be used.
        dp_shard_size (`int`, defaults to `1`):
            The size of the model shard group. If `dp_replicate_size > 1` and `tp_size > 1`, `dp_shard_size` must also
            be greater than 1, as composing DDP + TP is currently not supported.
        tp_size (`int`, defaults to `1`):
            The size of the tensor parallel group. If `tp_size` is set to `1`, the tensor parallel group will not be
            used.
        tp_handler (`~utils.TorchTensorParallelConfig`, defaults to `None`):
            The handler for the tensor parallel group.
        cp_size (`int`, defaults to `1`):
            The size of the context parallel group. Currently not supported, but reserved for future use and enabled
            for downstream libraries.
        cp_backend (`str`, defaults to `torch`):
            Which CP backend to use: `torch` (FSDP2).
        sp_size (`int`, defaults to `1`):
            The size of the sequence parallel group.
        sp_backend (`str`, defaults to `deepspeed`):
            Which SP backend to use: `deepspeed` (ALST/Ulysses).

    You may obtain different distributed data parallel paradigms by configuring `dp_replicate_size` and `dp_shard_size`
    together:
        - If `dp_replicate_size == 1` and `dp_shard_size > 1`, we obtain Fully Sharded Data Parallel (FSDP).
        - If `dp_replicate_size > 1` and `dp_shard_size > 1`, we obtain Hybrid Sharded Data Parallel (HSDP).
        - `dp_replicate_size > 1` and `dp_shard_size == 1` is an invalid configuration; to use pure DP, use
          `DistributedDataParallelKwargs` instead.

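    Example (an illustrative sketch, not part of the original docstring; assumes an 8-process launch and the
    `Accelerator(parallelism_config=...)` wiring that `_validate_accelerator` below is built for):

    ```python
    from accelerate import Accelerator
    from accelerate.parallelism_config import ParallelismConfig

    # 2-way replication x 4-way sharding composes HSDP across 8 processes
    pc = ParallelismConfig(dp_replicate_size=2, dp_shard_size=4)
    accelerator = Accelerator(parallelism_config=pc)
    ```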
    Ndp_replicate_sizedp_shard_sizetp_sizecp_sizetorch
cp_backendsp_size	deepspeed
sp_backend
tp_handler
cp_handler
sp_handlerc                    d| j                    d| j                   d| j                   d| j                   d| j                   d| j
                   d| j                   d| j                   d	| j                   d
| j                   dS )Nz'ParallelismConfig(
 	dp_replicate_size=z,
	dp_shard_size=z,
	tp_size=z,
	cp_size=z,
	cp_backend=z,
	sp_size=z,
	sp_backend=z,
	total_size=z
	tp_handler=z,
	cp_handler=z)
)
r   r   r   r   r   r   r   
total_sizer   r   selfs    g/home/obispo/Crisostomo_bridge/mision_env/lib/python3.12/site-packages/accelerate/parallelism_config.py__repr__zParallelismConfig.__repr__U   s    ##'#9#9": ;#112 3 ' ' OO, - ' OO, - OO, - OO, - OO,C
1	
    c                     dd l }dg}|j                  | j                  j                         D ci c]3  \  }}||vr*|t	        |d      r|j                  |j                        n|5 c}}       y c c}}w )Nr   device_mesh__dict__)copydeepcopyr%   itemshasattr)r   r&   _non_serializable_fieldskvs        r    to_jsonzParallelismConfig.to_jsond   sm    $1?  !MM//1Aq44 :0F4==,AM	
s   8A1
c                 R    g }| j                   r|dgz  }| j                  r|dgz  }|S )zENames of enabled dimensions across which data parallelism is applied.dp_replicatedp_shard)dp_replicate_enableddp_shard_enabledr   dimss     r    dp_dim_nameszParallelismConfig.dp_dim_namesq   s9     $$^$$D  ZL Dr"   c                 v    g }| j                   r|dgz  }| j                  r|dgz  }| j                  r|dgz  }|S )z]Names of enabled dimensions which will receive the same batch (non-data parallel dimensions).tpcpsp)
tp_enabled
cp_enabled
sp_enabledr3   s     r    non_dp_dim_namesz"ParallelismConfig.non_dp_dim_names{   sD     ??TFND??TFND??TFNDr"   c                 R    g }| j                   r|dgz  }| j                  r|dgz  }|S )zlNames of enabled dimensions which will be flattened into a joint mesh across which is model sharded in FSDP.r0   r8   )r2   r;   r3   s     r    dp_shard_cp_dim_namesz'ParallelismConfig.dp_shard_cp_dim_names   s5       ZL D??TFNDr"   c                 v    g }| j                   r|dgz  }| j                  r|dgz  }| j                  r|dgz  }|S )z@Names of enabled dimensions across which loss should be averagedr/   r0   r8   )r1   r2   r;   r3   s     r    dp_cp_dim_namesz!ParallelismConfig.dp_cp_dim_names   sK     $$^$$D  ZL D??TFNDr"   c                 :    g }| j                   r|dgz  }|dgz  }|S )z^Names of enabled dimensions across which FSDP is applied, including data parallel replication.r/   dp_shard_cp)r1   r3   s     r    fsdp_dim_namesz ParallelismConfig.fsdp_dim_names   s0     $$^$$Dr"   c                     | j                   | j                  z  | j                  z  | j                  z  | j                  z  S )zSThe total size of the parallelism configuration, which is the product of all sizes.)r   r   r   r   r   r   s    r    r   zParallelismConfig.total_size   s9     %%(:(::T\\IDLLX[_[g[gggr"   c                 N    | j                   | j                  z  | j                  z  S )zhThe size of the non-data parallel dimensions, which is the product of tensor and context parallel sizes.)r   r   r   r   s    r    non_data_parallel_sizez(ParallelismConfig.non_data_parallel_size   s      ||dll*T\\99r"   c                 4    | j                   | j                  z  S )z_The size of the data parallel dimensions, which is the product of data parallel replication and)r   r   r   s    r    data_parallel_sizez$ParallelismConfig.data_parallel_size   s     %%(:(:::r"   c                      | j                   dkD  S )zKTrue if data parallel replication is enabled, i.e. `dp_replicate_size > 1`.   )r   r   s    r    r1   z&ParallelismConfig.dp_replicate_enabled   s     %%))r"   c                      | j                   dkD  S )zDTrue if data parallel sharding is enabled, i.e. `dp_shard_size > 1`.rK   )r   r   s    r    r2   z"ParallelismConfig.dp_shard_enabled   s     !!A%%r"   c                      | j                   dkD  S )z:True if tensor parallelism is enabled, i.e. `tp_size > 1`.rK   )r   r   s    r    r:   zParallelismConfig.tp_enabled        ||ar"   c                      | j                   dkD  S )z;True if context parallelism is enabled, i.e. `cp_size > 1`.rK   )r   r   s    r    r;   zParallelismConfig.cp_enabled   rN   r"   c                      | j                   dkD  S )z;True if context parallelism is enabled, i.e. 
`sp_size > 1`.rK   )r   r   s    r    r<   zParallelismConfig.sp_enabled   rN   r"   c                 4    | j                   | j                  z   S )z$Names of all active mesh dimensions.)r5   r=   r   s    r    active_mesh_dimsz"ParallelismConfig.active_mesh_dims   s       4#8#888r"   device_typec                    t        dd      rddlm} nt        d      | j	                         }t        |      dk(  ry|\  }} ||||      }| j                  r|| j                     j                  d       | j                  r|| j                     j                  d	       | j                  r|| j                     j                  d
       |S )a!  Builds a device mesh for the given device type based on the parallelism configuration.
        This method will also create required joint meshes (e.g. `dp_shard_cp`, `dp_cp`, `dp`).

        Args:
            device_type (`str`): The type of device for which to build the mesh, e.g. `cuda`.
        """
        if is_torch_version(">=", "2.2.0"):
            from torch.distributed.device_mesh import init_device_mesh
        else:
            raise RuntimeError("Building a device_mesh requires to have torch>=2.2.0")

        mesh = self._get_mesh()
        if len(mesh) == 0:
            return None
        mesh_shape, mesh_dim_names = mesh
        device_mesh = init_device_mesh(device_type, mesh_shape, mesh_dim_names=mesh_dim_names)
        if self.dp_dim_names:
            device_mesh[tuple(self.dp_dim_names)]._flatten("dp")
        if self.dp_shard_cp_dim_names:
            device_mesh[tuple(self.dp_shard_cp_dim_names)]._flatten("dp_shard_cp")
        if self.dp_cp_dim_names:
            device_mesh[tuple(self.dp_cp_dim_names)]._flatten("dp_cp")
        return device_mesh

    def get_device_mesh(self, device_type: Optional[str] = None):
        if self.device_mesh is None:
            if device_type is None:
                raise ValueError("You need to pass a device_type e.g cuda to build the device mesh")
            self.device_mesh = self.build_device_mesh(device_type)
        elif device_type is not None and self.device_mesh.device_type != device_type:
            raise ValueError(
                f"The device_mesh is already created with device type {self.device_mesh.device_type}. "
                f"However, you are trying to get a device mesh with device_type {device_type}. "
                "Please check if you correctly initialized your device_mesh"
            )
        return self.device_mesh

    def _get_mesh(self) -> tuple[tuple[int, ...], tuple[str, ...]]:
        """Generate mesh shape and dimension names for torch.distributed.init_device_mesh()."""
        # Collect the enabled dimensions, then apply the canonical ordering below
        mesh_dims = {parallelism: self._sizes[parallelism] for parallelism in self.active_mesh_dims}
        mesh_order = ["dp_replicate", "dp_shard", "cp", "sp", "tp"]
        sorted_items = sorted(mesh_dims.items(), key=lambda x: mesh_order.index(x[0]))
        if not sorted_items:
            return ()
        mesh_dim_names, mesh_shape = tuple(zip(*sorted_items))
        return mesh_shape, mesh_dim_names
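    # A minimal sketch of the mesh layout produced above (hypothetical sizes, not part
    # of the original file): with dp_shard_size=2 and tp_size=4,
    #
    #   pc = ParallelismConfig(dp_shard_size=2, tp_size=4)
    #   pc.active_mesh_dims  # ["dp_shard", "tp"]
    #   pc._get_mesh()       # ((2, 4), ("dp_shard", "tp"))
    #
    # `build_device_mesh("cuda")` would then flatten ("dp_shard",) into the joint
    # "dp_shard_cp" mesh that FSDP shards across.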
      .   | j                   .t        t        j                  j	                  dd            | _         | j
                  .t        t        j                  j	                  dd            | _        | j                  .t        t        j                  j	                  dd            | _        | j                  .t        t        j                  j	                  dd            | _        | j                  %t        j                  j	                  dd      | _        | j                  .t        t        j                  j	                  dd            | _	        | j                  %t        j                  j	                  d	d
      | _
        | j                  dkD  r| j                  t               | _        | j                  dkD  r| j                  t               | _        nqt        t              }t!        | j                  || j                           s>t#        d| j                   d|| j                      dt%        | j                               | j                  dkD  r| j&                  t)               | _        | j                   dk  rt#        d| j                          | j
                  dk  rt#        d| j
                         | j                  dk  rt#        d| j                         | j                  dk  rt#        d| j                         dg}| j                  |vrt#        d| d| j                         | j                  dk  rt#        d| j                         d
g}| j                  |vrt#        d| d| j                         | j                  dkD  s| j                  dkD  r)| j                   dkD  r| j
                  dk(  rt#        d      | j                   | j
                  | j                  | j                  | j                  d| _        y )N$PARALLELISM_CONFIG_DP_REPLICATE_SIZE1 PARALLELISM_CONFIG_DP_SHARD_SIZEPARALLELISM_CONFIG_TP_SIZEPARALLELISM_CONFIG_CP_SIZEPARALLELISM_CONFIG_CP_BACKENDr   PARALLELISM_CONFIG_SP_SIZEPARALLELISM_CONFIG_SP_BACKENDr   rK   )r   zParallelismConfig's cp_backend=z
 requires z, but cp_handler was set to z.dp_replicate_size must be at least 1, but got z*dp_shard_size must be at least 1, but got z$tp_size must be at least 1, but got z$cp_size must be at least 1, but got zcp_backend must be one of z
, but got z$sp_size must be at least 1, but got zsp_backend must be one of aC  Tensor/Context parallelism (tp/cp_size > 1) cannot be used with pure data parallelism (dp_replicate_size > 1 and dp_shard_size == 1). Please set dp_shard_size > 1 and dp_replicate_size == 1 to compose FSDP + TP/CP for 2D parallel, or set dp_replicate_size == 1 and dp_shard_size > 1 to compose HSDP + TP/CP for 3D parallel.)r/   r0   r7   r8   r9   )r   intosenvirongetr   r   r   r   r   r   r   r   r   r
   dict
isinstancerb   typer   r   rl   )r   cp_backends_config_mapvalid_cp_backendsvalid_sp_backendss       r    __post_init__zParallelismConfig.__post_init__  s   !!)%(8^`c)d%eD"%!$RZZ^^4VX[%\!]D<<rzz~~.JCPQDL<<rzz~~.JCPQDL??" jjnn-LgVDO<<rzz~~.JCPQDL??" jjnn-LkZDO<<!&";"=<<!&"<">)-4*& "$//3I$//3Z[$9$//9J*Uklpl{l{U|T}  ~Z  [_  `d  `o  `o  [p  Zq  r  <<!&"A"C!!A%MdNdNdMefgg!I$J\J\I]^__<<!CDLL>RSS<<!CDLL>RSS$I??"339:K9LJW[WfWfVghii<<!CDLL>RSS(M??"339:K9LJW[WfWfVghiiLL1q 0d6L6Lq6PUYUgUgklUlo  !22**,,,,,,
r"   rp   sizec                     || j                   j                         v s"J d| j                   j                                 || j                   |<   t        | | d|       y )NzParallelism must be one of _size)rl   keyssetattr)r   rp   r   s      r    	_set_sizezParallelismConfig._set_sizeQ  s\    dkk..00d4OPTP[P[P`P`PbOc2dd0#'K U+T2r"   c                    t               }|j                  s| j                  dk(  ry | j                  dk(  r| j                  d|j                         | j                  |j                  k7  r&t        d| j                   d|j                   d      | j                  dkD  rN|j                  sB|j                  s6|j                  t        j                  k(  st        d|j                   d      | j                  j                         D ]4  \  }}|dk(  st        | | dd       |j                  d	| d
| d       6 |r:|j                  r-t        j                   ddj#                  |      z   t$               y y y )NrK   r/   zParallelismConfig total_size (z ) does not match num_processes (zJ). Please adjust dp_replicate_size/ dp_shard_size/tp_size/cp_size/sp_size.zParallelismConfig is only compatible DistributedType.FSDP (version 2) or DistributedType.Multi{Device} or DistributedType.DEEPSPEED, but got ._handlerzParallelismConfig.z_handler is set, but z0_size is set to 1. This handler will be ignored.z.ParallelismConfig has the following warnings:

)setmulti_devicer   r   num_processesrb   is_fsdp2distributed_typer	   	DEEPSPEEDrl   r(   getattraddis_main_processwarningswarnjoinUserWarning)r   accelerator	_warningsrp   r   s        r    _validate_acceleratorz'ParallelismConfig._validate_acceleratorV  s   E	''DOOq,@ ??aNN>;+D+DE??k77700A B""-";";!< =9:  ??Q  ''++/H/HH b  cn  c  c  b@  @A  B  "&!2!2!4 	KqyWTk](+CTJV(5J;-  XH  I	 44MMADIIiDXX 59r"   )N)r   r   )0__name__
__module____qualname____doc__r   r   r|   __annotations__r   r   r   r   r   r   r   r   r   r   r   r
   r   r   r$   r!   r-   propertyr5   r=   r?   rA   rD   r   rG   rI   r1   r2   r:   r;   r<   rR   strr`   rc   rn   r[   r   r   r    r"   r    r   r   !   s:   !F (,x}+#'M8C='!GXc]!!GXc]!#'J '!GXc]!'+J$+ :>Jd556=:>Jd667>?CJd;;<CK

   	 	   	 	   h h : : ; ; * * & &             9 9S < 8C=  )5sCx%S/!AB )B
H3S 3 3
$r"   r   )r}   r   dataclassesr   typingr   r   r   r   accelerate.utils.dataclassesr   r	   r
   r   accelerate.utils.versionsr   
accelerater   r   r   r"   r    <module>r      sI    
  ! : :  7 & X X Xr"   