Skip to content

Commit

Permalink
lookup the nvidia/nvidia-frontend device by device major number
Browse files Browse the repository at this point in the history
Signed-off-by: Tariq Ibrahim <[email protected]>
  • Loading branch information
tariq1890 committed Feb 2, 2024
1 parent 2f3600a commit 0f27d53
Show file tree
Hide file tree
Showing 7 changed files with 195 additions and 77 deletions.
54 changes: 36 additions & 18 deletions cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package devchar

import (
"errors"
"fmt"
"path/filepath"

Expand All @@ -28,18 +29,23 @@ import (
)

type allPossible struct {
logger logger.Interface
devRoot string
deviceMajors devices.Devices
migCaps nvcaps.MigCaps
logger logger.Interface
devRoot string
devices devices.Devices[devices.Name, devices.Major]
migCaps nvcaps.MigCaps
}

// newAllPossible returns a new allPossible device node lister.
// This lister lists all possible device nodes for NVIDIA GPUs, control devices, and capability devices.
func newAllPossible(logger logger.Interface, devRoot string) (nodeLister, error) {
deviceMajors, err := devices.GetNVIDIADevices()
nvdevices, err := devices.GetNVIDIADevices()
if err != nil {
return nil, fmt.Errorf("failed reading device majors: %v", err)
return nil, fmt.Errorf("failed reading devices: %w", err)
}

nvdeviceMajors, err := devices.GetNVIDIADeviceMajors()
if err != nil {
return nil, fmt.Errorf("failed reading device major map: %w", err)
}

var requiredMajors []devices.Name
Expand All @@ -53,18 +59,22 @@ func newAllPossible(logger logger.Interface, devRoot string) (nodeLister, error)
requiredMajors = append(requiredMajors, devices.NVIDIACaps)
}

requiredMajors = append(requiredMajors, devices.NVIDIAGPU, devices.NVIDIAUVM)
requiredMajors = append(requiredMajors, devices.NVIDIAUVM)
for _, name := range requiredMajors {
if !deviceMajors.Exists(name) {
return nil, fmt.Errorf("missing required device major %s", name)
if !nvdevices.Exists(name) {
return nil, fmt.Errorf("missing required device %s", name)
}
}

if !nvdeviceMajors.Exists(devices.Major(devices.NVIDIAGPUMajor)) {
return nil, errors.New("missing required device nvidia/nvidia-frontend")
}

l := allPossible{
logger: logger,
devRoot: devRoot,
deviceMajors: deviceMajors,
migCaps: migCaps,
logger: logger,
devRoot: devRoot,
devices: nvdevices,
migCaps: migCaps,
}

return l, nil
Expand Down Expand Up @@ -105,8 +115,8 @@ func (m allPossible) getControlDeviceNodes() ([]deviceNode, error) {

// Define the control devices for standard GPUs.
controlDevices := []deviceNode{
m.newDeviceNode(devices.NVIDIAGPU, "/dev/nvidia-modeset", devices.NVIDIAModesetMinor),
m.newDeviceNode(devices.NVIDIAGPU, "/dev/nvidiactl", devices.NVIDIACTLMinor),
m.newDeviceNodeFromMajor(devices.NVIDIAGPUMajor, "/dev/nvidia-modeset", devices.NVIDIAModesetMinor),
m.newDeviceNodeFromMajor(devices.NVIDIAGPUMajor, "/dev/nvidiactl", devices.NVIDIACTLMinor),
m.newDeviceNode(devices.NVIDIAUVM, "/dev/nvidia-uvm", devices.NVIDIAUVMMinor),
m.newDeviceNode(devices.NVIDIAUVM, "/dev/nvidia-uvm-tools", devices.NVIDIAUVMToolsMinor),
}
Expand All @@ -133,8 +143,8 @@ func (m allPossible) getControlDeviceNodes() ([]deviceNode, error) {

// getGPUDeviceNodes generates a list of device nodes for a given GPU.
func (m allPossible) getGPUDeviceNodes(gpu int) []deviceNode {
d := m.newDeviceNode(
devices.NVIDIAGPU,
d := m.newDeviceNodeFromMajor(
devices.NVIDIAGPUMajor,
fmt.Sprintf("/dev/nvidia%d", gpu),
gpu,
)
Expand Down Expand Up @@ -178,11 +188,19 @@ func (m allPossible) getNVCapDeviceNodes(gpu int) []deviceNode {
// newDeviceNode creates a new device node with the specified path and major/minor numbers.
// The path is adjusted for the specified driver root.
func (m allPossible) newDeviceNode(deviceName devices.Name, path string, minor int) deviceNode {
major, _ := m.deviceMajors.Get(deviceName)
major, _ := m.devices.Get(deviceName)

return deviceNode{
path: filepath.Join(m.devRoot, path),
major: uint32(major),
minor: uint32(minor),
}
}

// newDeviceNodeFromMajor creates a new device node from an explicitly supplied
// major number, together with the given path and minor number. No lookup by
// device name is performed. The path is joined onto the lister's devRoot.
func (m allPossible) newDeviceNodeFromMajor(deviceMajor devices.Major, path string, minor int) deviceNode {
	var node deviceNode
	node.path = filepath.Join(m.devRoot, path)
	node.major = uint32(deviceMajor)
	node.minor = uint32(minor)
	return node
}
72 changes: 62 additions & 10 deletions internal/info/proc/devices/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ const (
NVIDIACTLMinor = 255
NVIDIAModesetMinor = 254

NVIDIAFrontend = Name("nvidia-frontend")
NVIDIAGPU = NVIDIAFrontend
NVIDIACaps = Name("nvidia-caps")
NVIDIAUVM = Name("nvidia-uvm")
NVIDIAGPUMajor = 195

NVIDIACaps = Name("nvidia-caps")
NVIDIAUVM = Name("nvidia-uvm")

procDevicesPath = "/proc/devices"
nvidiaDevicePrefix = "nvidia"
Expand All @@ -50,14 +50,14 @@ type Major int
// Devices represents the set of devices under /proc/devices
//
//go:generate moq -stub -out devices_mock.go . Devices
type Devices interface {
Exists(Name) bool
Get(Name) (Major, bool)
type Devices[A, B any] interface {
Exists(A) bool
Get(A) (B, bool)
}

type devices map[Name]Major

var _ Devices = devices(nil)
var _ Devices[Name, Major] = devices(nil)

// Exists checks if a Device with a given name exists or not
func (d devices) Exists(name Name) bool {
Expand All @@ -72,14 +72,19 @@ func (d devices) Get(name Name) (Major, bool) {
}

// GetNVIDIADevices returns the set of NVIDIA Devices on the machine
func GetNVIDIADevices() (Devices, error) {
func GetNVIDIADevices() (Devices[Name, Major], error) {
return nvidiaDevices(procDevicesPath)
}

// GetNVIDIADeviceMajors returns the NVIDIA devices on the machine indexed by
// device major number (major -> device name), as read from procDevicesPath.
// Both the result and the error are nil when that file does not exist.
func GetNVIDIADeviceMajors() (Devices[Major, Name], error) {
return nvidiaDeviceMajors(procDevicesPath)
}

// nvidiaDevices returns the set of NVIDIA Devices from the specified devices file.
// This is useful for testing since we may be testing on a system where `/proc/devices` does
// not contain a reference to NVIDIA devices.
func nvidiaDevices(devicesPath string) (Devices, error) {
func nvidiaDevices(devicesPath string) (Devices[Name, Major], error) {
devicesFile, err := os.Open(devicesPath)
if os.IsNotExist(err) {
return nil, nil
Expand All @@ -92,6 +97,20 @@ func nvidiaDevices(devicesPath string) (Devices, error) {
return nvidiaDeviceFrom(devicesFile)
}

// nvidiaDeviceMajors returns the map of NVIDIA device major numbers and their
// corresponding device names, read from the specified devices file.
//
// If devicesPath does not exist, both the result and the error are nil, so
// callers must check for a nil result before calling methods on it. Any other
// failure to open the file is returned wrapped with %w, so callers can inspect
// the underlying cause with errors.Is / errors.As.
func nvidiaDeviceMajors(devicesPath string) (Devices[Major, Name], error) {
	devicesFile, err := os.Open(devicesPath)
	if os.IsNotExist(err) {
		return nil, nil
	}
	if err != nil {
		return nil, fmt.Errorf("error opening devices file: %w", err)
	}
	defer devicesFile.Close()

	majors, err := nvidiaDeviceMajorFrom(devicesFile)
	if err != nil {
		// Return a literal nil rather than a nil deviceMajors wrapped in a
		// non-nil interface value.
		return nil, err
	}
	return majors, nil
}

var errNoNvidiaDevices = errors.New("no NVIDIA devices found")

func nvidiaDeviceFrom(reader io.Reader) (devices, error) {
Expand Down Expand Up @@ -139,3 +158,36 @@ func processProcDeviceLine(line string) (Name, Major, error) {

return "", 0, fmt.Errorf("unparsable line: %v", line)
}

// deviceMajors maps a device major number to the corresponding device name.
type deviceMajors map[Major]Name

// Exists reports whether an entry for the given device major number is present.
func (d deviceMajors) Exists(m Major) bool {
	_, found := d[m]
	return found
}

// Get returns the device name stored for the given major number. The boolean
// is false when there is no entry for that major.
func (d deviceMajors) Get(m Major) (Name, bool) {
	name, found := d[m]
	return name, found
}

// nvidiaDeviceMajorFrom takes the devices that devicesFrom obtains from reader
// and returns the major-number-to-name mapping for those whose name starts
// with nvidiaDevicePrefix. It returns errNoNvidiaDevices when none match.
func nvidiaDeviceMajorFrom(reader io.Reader) (deviceMajors, error) {
	majors := make(deviceMajors)
	for name, major := range devicesFrom(reader) {
		if strings.HasPrefix(string(name), nvidiaDevicePrefix) {
			majors[major] = name
		}
	}

	// An empty map means no device name carried the NVIDIA prefix.
	if len(majors) == 0 {
		return nil, errNoNvidiaDevices
	}
	return majors, nil
}
54 changes: 27 additions & 27 deletions internal/info/proc/devices/devices_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion internal/info/proc/devices/devices_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,6 @@ func TestProcessDeviceFileLine(t *testing.T) {
}

// testDevices creates a set of test NVIDIA devices
func testDevices(d map[Name]Major) Devices {
func testDevices(d map[Name]Major) Devices[Name, Major] {
return devices(d)
}
Loading

0 comments on commit 0f27d53

Please sign in to comment.