Skip to content

Commit

Permalink
Add A3/H100 to the nvml test
Browse files Browse the repository at this point in the history
  • Loading branch information
LujieDuan committed Jun 27, 2024
1 parent e80385f commit aa93ee4
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ gpu_platforms:
- suse-cloud:sles-15
- ubuntu-os-cloud:ubuntu-2004-lts
- ubuntu-os-cloud:ubuntu-2204-lts
- model: h100
platforms:
- ubuntu-os-cloud:ubuntu-2004-lts
expected_metrics:
- type: agent.googleapis.com/gpu/utilization
value_type: DOUBLE
Expand Down
72 changes: 43 additions & 29 deletions integration_test/third_party_apps_test/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -806,10 +806,11 @@ func determineImpactedApps(modifiedFiles []string, allApps map[string]metadata.I
}

type accelerator struct {
model string
fullName string
machineType string
availableZone string
model string
fullName string
machineType string
availableZone string
acceleratorCount int
}

type test struct {
Expand Down Expand Up @@ -837,40 +838,53 @@ var defaultApps = map[string]bool{
var gpuModels = map[string]accelerator{
// This is the A100 40GB model; the A100 80GB model is similar, so it is skipped.
"a100": {
model: "a100",
fullName: "nvidia-tesla-a100",
machineType: "a2-highgpu-1g",
availableZone: "us-central1-a",
model: "a100",
fullName: "nvidia-tesla-a100",
machineType: "a2-highgpu-1g",
availableZone: "us-central1-a",
acceleratorCount: 1,
},
"v100": {
model: "v100",
fullName: "nvidia-tesla-v100",
machineType: "n1-standard-2",
availableZone: "us-central1-a",
model: "v100",
fullName: "nvidia-tesla-v100",
machineType: "n1-standard-2",
availableZone: "us-central1-a",
acceleratorCount: 1,
},
"t4": {
model: "t4",
fullName: "nvidia-tesla-t4",
machineType: "n1-standard-2",
availableZone: "us-central1-a",
model: "t4",
fullName: "nvidia-tesla-t4",
machineType: "n1-standard-2",
availableZone: "us-central1-a",
acceleratorCount: 1,
},
"p4": {
model: "p4",
fullName: "nvidia-tesla-p4",
machineType: "n1-standard-2",
availableZone: "us-central1-a",
model: "p4",
fullName: "nvidia-tesla-p4",
machineType: "n1-standard-2",
availableZone: "us-central1-a",
acceleratorCount: 1,
},
"p100": {
model: "p100",
fullName: "nvidia-tesla-p100",
machineType: "n1-standard-2",
availableZone: "us-central1-c",
model: "p100",
fullName: "nvidia-tesla-p100",
machineType: "n1-standard-2",
availableZone: "us-central1-c",
acceleratorCount: 1,
},
"l4": {
model: "l4",
fullName: "nvidia-l4",
machineType: "g2-standard-4",
availableZone: "us-central1-a",
model: "l4",
fullName: "nvidia-l4",
machineType: "g2-standard-4",
availableZone: "us-central1-a",
acceleratorCount: 1,
},
"h100": {
model: "h100",
fullName: "nvidia-h100-80gb",
machineType: "a3-highgpu-8g",
availableZone: "us-west1-a",
acceleratorCount: 8,
},
}

Expand Down Expand Up @@ -997,7 +1011,7 @@ func TestThirdPartyApps(t *testing.T) {
if tc.gpu != nil {
options.ExtraCreateArguments = append(
options.ExtraCreateArguments,
fmt.Sprintf("--accelerator=count=1,type=%s", tc.gpu.fullName),
fmt.Sprintf("--accelerator=count=%d,type=%s", tc.gpu.acceleratorCount, tc.gpu.fullName),
"--maintenance-policy=TERMINATE")
options.ExtraCreateArguments = append(options.ExtraCreateArguments, "--boot-disk-size=100GB")
options.MachineType = tc.gpu.machineType
Expand Down

0 comments on commit aa93ee4

Please sign in to comment.