-
Notifications
You must be signed in to change notification settings - Fork 314
[CLI][GB200] Add ultraserver instance(p6e-gb200) capacity block support #6928
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3b95735
9248780
ba7f627
a19dac4
5acfbff
9ceb5ad
c58365d
253780e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -63,9 +63,14 @@ | |
| NODE_BOOTSTRAP_TIMEOUT, | ||
| ONTAP, | ||
| OPENZFS, | ||
| ULTRASERVER_INSTANCE_PREFIX_LIST, | ||
| Feature, | ||
| ) | ||
| from pcluster.utils import get_partition, get_resource_name_from_resource_arn, to_snake_case | ||
| from pcluster.utils import ( | ||
| get_partition, | ||
| get_resource_name_from_resource_arn, | ||
| to_snake_case, | ||
| ) | ||
| from pcluster.validators.awsbatch_validators import ( | ||
| AwsBatchComputeInstanceTypeValidator, | ||
| AwsBatchComputeResourceSizeValidator, | ||
|
|
@@ -141,6 +146,7 @@ | |
| ) | ||
| from pcluster.validators.ec2_validators import ( | ||
| AmiOsCompatibleValidator, | ||
| CapacityBlockHealthStatusValidator, | ||
| CapacityReservationResourceGroupValidator, | ||
| CapacityReservationSizeValidator, | ||
| CapacityReservationValidator, | ||
|
|
@@ -2409,7 +2415,16 @@ def instance_types(self) -> List[str]: | |
| def instance_type(self): | ||
| """Instance type of this compute resource.""" | ||
| if not self._instance_type: | ||
| self._instance_type = Resource.init_param(self._instance_type_from_capacity_reservation()) | ||
| capacity_reservation_id = ( | ||
| self.capacity_reservation_target.capacity_reservation_id if self.capacity_reservation_target else None | ||
| ) | ||
| ( | ||
| instance_type_from_capacity_reservation, | ||
| _, | ||
| ) = AWSApi.instance().ec2.get_instance_type_and_reservation_type_from_capacity_reservation( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What are the benefits of creating an entirely new function rather than just using/building upon the existing
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| capacity_reservation_id | ||
| ) | ||
| self._instance_type = Resource.init_param(instance_type_from_capacity_reservation) | ||
gmarciani marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return self._instance_type | ||
|
|
||
| def _register_validators(self, context: ValidatorContext = None): | ||
|
|
@@ -2453,18 +2468,6 @@ def disable_simultaneous_multithreading_manually(self) -> bool: | |
| """Return true if simultaneous multithreading must be disabled with a cookbook script.""" | ||
| return self.disable_simultaneous_multithreading and self._instance_type_info.default_threads_per_core() > 1 | ||
|
|
||
| def _instance_type_from_capacity_reservation(self): | ||
| """Return the instance type from the configured CapacityReservationId, if any.""" | ||
| instance_type = None | ||
| capacity_reservation_id = ( | ||
| self.capacity_reservation_target.capacity_reservation_id if self.capacity_reservation_target else None | ||
| ) | ||
| if capacity_reservation_id: | ||
| capacity_reservations = AWSApi.instance().ec2.describe_capacity_reservations([capacity_reservation_id]) | ||
| if capacity_reservations: | ||
| instance_type = capacity_reservations[0].instance_type() | ||
| return instance_type | ||
|
|
||
|
|
||
| class _CommonQueue(BaseQueue): | ||
| """Represent the Common Queue resource between Slurm and future scheduler implementation.""" | ||
|
|
@@ -2931,6 +2934,7 @@ def __init__( | |
| pool.ssh.allowed_ips = self.head_node.ssh.allowed_ips | ||
|
|
||
| self.__image_dict = None | ||
| self.__ultraserver_capacity_block_dict = None | ||
| # Cache capacity reservations information together to reduce number of boto3 calls. | ||
| # Since this cache is only used for validation, if AWSClientError happens | ||
| # (e.g insufficient IAM permissions to describe the capacity reservations), we catch the exception to avoid | ||
|
|
@@ -2986,6 +2990,53 @@ def login_nodes_subnet_ids(self): | |
| subnet_ids_set.add(subnet_id) | ||
| return list(subnet_ids_set) | ||
|
|
||
| @property | ||
| def ultraserver_capacity_block_dict(self): | ||
| """ | ||
| Return a dictionary mapping ultraserver instance prefixes to their capacity block reservation IDs. | ||
|
|
||
| This property collects all capacity block reservations used by ultraserver instances | ||
| (e.g., p6e-gb200) across all queues and compute resources in the cluster configuration. | ||
|
|
||
| Returns: | ||
| dict: A dictionary where keys are ultraserver instance prefixes (e.g., 'p6e-gb200') | ||
| and values are lists of capacity reservation IDs for that instance type. | ||
|
|
||
| Example: | ||
| { | ||
| 'p6e-gb200': ['cr-123456', 'cr-789012'] | ||
| } | ||
| """ | ||
| if self.__ultraserver_capacity_block_dict: | ||
| return self.__ultraserver_capacity_block_dict | ||
|
|
||
| self.__ultraserver_capacity_block_dict = {} | ||
|
|
||
| # Initialize empty lists for each supported ultraserver instance prefix | ||
| for ultraserver_instance_prefix in ULTRASERVER_INSTANCE_PREFIX_LIST: | ||
| self.__ultraserver_capacity_block_dict[ultraserver_instance_prefix] = [] | ||
|
|
||
| # Iterate through all queues and compute resources to find ultraserver capacity blocks | ||
| for queue in self.scheduling.queues: | ||
| for compute_resource in queue.compute_resources: | ||
| cr_target = compute_resource.capacity_reservation_target or queue.capacity_reservation_target | ||
| if cr_target and cr_target.capacity_reservation_id: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What if the config specifies a resource group ARN rather than a reservation ID?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We only accept capacity blocks for ultraserver instances. ResourceGroupArn is not considered here.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this limitation captured in our code, i.e., does validation fail if we specify a capacity block together with CapacityReservationResourceGroupArn? At least according to the documentation, it does not seem so: https://docs.aws.amazon.com/parallelcluster/latest/ug/Scheduling-v3.html#yaml-Scheduling-SlurmQueues-CapacityReservationTarget
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you mean specifying CapacityBlock and CapacityReservationResourceGroupArn at the same time? But if it's a CapacityBlock, you have to use the InstanceType section or leave it empty (optional). https://docs.aws.amazon.com/parallelcluster/latest/ug/launch-instances-capacity-blocks.html But even if it doesn't fail, why does it matter in our case? This function is to collect
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The use case we are talking about is: a compute resource with a capacity block, an instance type specified, and a resource group ARN rather than a reservation ID. I think we should support this scenario. If we do not support it, let's verify that there exists a validator to prevent it. If it does not exist, the next steps are:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I had a sync with Giacomo. We are not going to support it because:
> Capacity Reservation Groups are designed to work with standard Capacity Reservations, not with Capacity Blocks. They allow you to group and manage multiple Capacity Reservations together, but this functionality does not extend to Capacity Blocks. |
||
| # Get instance type and reservation type from the capacity reservation | ||
| ( | ||
| instance_type, | ||
| reservation_type, | ||
| ) = AWSApi.instance().ec2.get_instance_type_and_reservation_type_from_capacity_reservation( | ||
| cr_target.capacity_reservation_id | ||
| ) | ||
| # Extract instance prefix (e.g., 'p6e-gb200' from 'p6e-gb200.36xlarge') | ||
| instance_prefix = instance_type.split(".")[0] | ||
| # Only collect capacity blocks for ultraserver instances | ||
| if reservation_type == "capacity-block" and instance_prefix in ULTRASERVER_INSTANCE_PREFIX_LIST: | ||
| self.__ultraserver_capacity_block_dict.get(instance_prefix).append( | ||
| cr_target.capacity_reservation_id | ||
| ) | ||
| return self.__ultraserver_capacity_block_dict | ||
|
|
||
| def _register_login_node_validators(self): | ||
| """Register all login node validators to ensure that the resource parameters are valid.""" | ||
| # Check if all subnets(head node, Login nodes, compute nodes) are in the same VPC and support DNS. | ||
|
|
@@ -3223,6 +3274,13 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901 | |
| num_of_instances=num_of_instances, | ||
| ) | ||
|
|
||
| for ultraserver_instance_prefix in ULTRASERVER_INSTANCE_PREFIX_LIST: | ||
| if self.ultraserver_capacity_block_dict.get(ultraserver_instance_prefix): | ||
| self._register_validator( | ||
| CapacityBlockHealthStatusValidator, | ||
| capacity_reservation_ids=self.ultraserver_capacity_block_dict.get(ultraserver_instance_prefix), | ||
| ) | ||
|
|
||
| @property | ||
| def image_dict(self): | ||
| """Return image dict of queues, key is queue name, value is image id.""" | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.