From 7c7c4b6de5c10edba751fad65ffb6a6b859d453b Mon Sep 17 00:00:00 2001 From: Victor Barr Date: Tue, 5 Dec 2023 15:44:18 -0800 Subject: [PATCH] Verify reservation exists and print describe details if it does (#37) * Verify reservation exists and print describe details if it does * more readme details --- README.md | 37 ++++++++++++++++++++++++++++++++++++- xpk.py | 29 ++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8f232db..2ae5660 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,19 @@ cleanup with a `Cluster Delete`. ## Cluster Create +First set the project and zone through gcloud config or xpk arguments. + +```shell +PROJECT_ID=my-project-id +ZONE=us-east5-b +# gcloud config: +gcloud config set project $PROJECT_ID +gcloud config set compute/zone $ZONE +# xpk arguments +xpk .. --zone $ZONE --project $PROJECT_ID +``` + + The cluster created is a regional cluster to enable the GKE control plane across all zones. @@ -71,7 +84,7 @@ all zones. ```shell # Find your reservations gcloud compute reservations list --project=$PROJECT_ID - # Run cluster create with reservation + # Run cluster create with reservation. python3 xpk.py cluster create \ --cluster xpk-test --tpu-type=v5litepod-256 \ --num-slices=2 \ @@ -86,6 +99,14 @@ all zones. --num-slices=4 --on-demand ``` +* Cluster Create (provision spot / preemptible capacity): + + ```shell + python3 xpk.py cluster create \ + --cluster xpk-test --tpu-type=v5litepod-16 \ + --num-slices=4 --spot + ``` + * Cluster Create can be called again with the same `--cluster name` to modify the number of slices or retry failed steps. @@ -375,3 +396,17 @@ python3 xpk.py cluster create --cluster-cpu-machine-type=CPU_TYPE ... * `requires one of ["container.*"] permission(s)` Add [Kubernetes Engine Admin](https://cloud.google.com/iam/docs/understanding-roles#kubernetes-engine-roles) to your user. 
def verify_reservation_exists(args) -> int:
    """Check that the requested reservation can be described via gcloud.

    Builds a `gcloud beta compute reservations describe` command for
    `args.reservation` scoped to `args.project` and `args.zone`, and runs it
    through `run_command_with_updates`.

    Args:
      args: user provided arguments for running the command. The
        `reservation`, `project` and `zone` attributes are read here, and the
        whole object is forwarded to `run_command_with_updates`.

    Returns:
      0 if the describe command reports a zero return code and 1 otherwise.
    """
    # Assemble the command from its parts; joining with a single space yields
    # exactly the string gcloud is expected to receive.
    describe_cmd = ' '.join([
        f'gcloud beta compute reservations describe {args.reservation}',
        f'--project={args.project}',
        f'--zone={args.zone}',
    ])
    status = run_command_with_updates(
        describe_cmd, 'Describe reservation', args)

    # Success path first; everything below is the failure report.
    if status == 0:
        return 0

    xpk_print(f'Describe reservation returned ERROR {status}')
    xpk_print('Please confirm that your reservation name is correct.')
    return 1