Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions assets/gpu-feature-discovery/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ spec:
labels:
app: gpu-feature-discovery
app.kubernetes.io/part-of: nvidia-gpu
annotations:
nvidia.cdi.k8s.io/container.gpu-feature-discovery: management.nvidia.com/gpu=all
spec:
nodeSelector:
nvidia.com/gpu.deploy.gpu-feature-discovery: "true"
Expand Down
2 changes: 2 additions & 0 deletions assets/state-dcgm-exporter/0800_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ spec:
metadata:
labels:
app: nvidia-dcgm-exporter
annotations:
nvidia.cdi.k8s.io/container.nvidia-dcgm-exporter: management.nvidia.com/gpu=all
spec:
nodeSelector:
nvidia.com/gpu.deploy.dcgm-exporter: "true"
Expand Down
2 changes: 2 additions & 0 deletions assets/state-dcgm/0400_dcgm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ spec:
metadata:
labels:
app: nvidia-dcgm
annotations:
nvidia.cdi.k8s.io/container.nvidia-dcgm-ctr: management.nvidia.com/gpu=all
spec:
nodeSelector:
nvidia.com/gpu.deploy.dcgm: "true"
Expand Down
2 changes: 2 additions & 0 deletions assets/state-device-plugin/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ spec:
metadata:
labels:
app: nvidia-device-plugin-daemonset
annotations:
nvidia.cdi.k8s.io/container.nvidia-device-plugin: management.nvidia.com/gpu=all
spec:
nodeSelector:
nvidia.com/gpu.deploy.device-plugin: "true"
Expand Down
2 changes: 2 additions & 0 deletions assets/state-kata-manager/0600_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ spec:
metadata:
labels:
name: nvidia-kata-manager
annotations:
nvidia.cdi.k8s.io/container.nvidia-kata-manager: management.nvidia.com/gpu=all
spec:
tolerations:
- key: nvidia.com/gpu
Expand Down
2 changes: 2 additions & 0 deletions assets/state-mig-manager/0600_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ spec:
metadata:
labels:
app: nvidia-mig-manager
annotations:
nvidia.cdi.k8s.io/container.nvidia-mig-manager: management.nvidia.com/gpu=all
spec:
nodeSelector:
nvidia.com/gpu.deploy.mig-manager: "true"
Expand Down
2 changes: 2 additions & 0 deletions assets/state-mps-control-daemon/0400_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ spec:
metadata:
labels:
app: nvidia-device-plugin-mps-control-daemon
annotations:
nvidia.cdi.k8s.io/container.mps-control-daemon-ctr: management.nvidia.com/gpu=all
spec:
nodeSelector:
nvidia.com/gpu.deploy.device-plugin: "true"
Expand Down
2 changes: 2 additions & 0 deletions assets/state-operator-validation/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ spec:
labels:
app: nvidia-operator-validator
app.kubernetes.io/part-of: gpu-operator
annotations:
nvidia.cdi.k8s.io/container.toolkit-validation: management.nvidia.com/gpu=all
spec:
nodeSelector:
nvidia.com/gpu.deploy.operator-validator: "true"
Expand Down
2 changes: 2 additions & 0 deletions assets/state-sandbox-device-plugin/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ spec:
metadata:
labels:
app: nvidia-sandbox-device-plugin-daemonset
annotations:
nvidia.cdi.k8s.io/container.nvidia-sandbox-device-plugin-ctr: management.nvidia.com/gpu=all
spec:
nodeSelector:
nvidia.com/gpu.deploy.sandbox-device-plugin: "true"
Expand Down
2 changes: 2 additions & 0 deletions assets/state-sandbox-validation/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ spec:
labels:
app: nvidia-sandbox-validator
app.kubernetes.io/part-of: gpu-operator
annotations:
nvidia.cdi.k8s.io/container.nvidia-sandbox-validator: management.nvidia.com/gpu=all
spec:
nodeSelector:
nvidia.com/gpu.deploy.sandbox-validator: "true"
Expand Down
2 changes: 2 additions & 0 deletions assets/state-vgpu-device-manager/0600_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ spec:
metadata:
labels:
app: nvidia-vgpu-device-manager
annotations:
nvidia.cdi.k8s.io/container.nvidia-vgpu-device-manager: management.nvidia.com/gpu=all
spec:
nodeSelector:
nvidia.com/gpu.deploy.vgpu-device-manager: "true"
Expand Down
52 changes: 36 additions & 16 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ const (
DefaultDockerConfigFile = "/etc/docker/daemon.json"
// DefaultDockerSocketFile indicates default docker socket file
DefaultDockerSocketFile = "/var/run/docker.sock"
// DefaultRuntimeNRISocketFile indicates the default container runtime NRI socket file
DefaultRuntimeNRISocketFile = "/var/run/nri/nri.sock"
// DefaultCRIOConfigFile indicates default config file path for cri-o.
DefaultCRIOConfigFile = "/etc/crio/config.toml"
// DefaultCRIODropInConfigFile indicates the default path to the drop-in config file for cri-o
Expand All @@ -82,9 +84,11 @@ const (
DefaultRuntimeClass = "nvidia"
// DriverInstallPathVolName represents volume name for driver install path provided to toolkit
DriverInstallPathVolName = "driver-install-path"
// DefaultRuntimeSocketTargetDir represents target directory where runtime socket directory will be mounted
// DefaultRuntimeNRISocketTargetDir represents target directory where runtime NRI socket directory will be mounted
DefaultRuntimeNRISocketTargetDir = "/runtime/nri-sock-dir/"
// DefaultRuntimeSocketTargetDir represents target directory where runtime socket directory will be mounted
DefaultRuntimeSocketTargetDir = "/runtime/sock-dir/"
// DefaultRuntimeConfigTargetDir represents target directory where runtime config directory will be mounted
// DefaultRuntimeConfigTargetDir represents target directory where runtime config directory will be mounted
DefaultRuntimeConfigTargetDir = "/runtime/config-dir/"
// DefaultRuntimeDropInConfigTargetDir represents target directory where drop-in config directory will be mounted
DefaultRuntimeDropInConfigTargetDir = "/runtime/config-dir.d/"
Expand Down Expand Up @@ -941,7 +945,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
return err
}

setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
// setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

// update env required for MIG support
applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)
Expand Down Expand Up @@ -1440,6 +1444,22 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
socketVol := corev1.Volume{Name: volMountSocketName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(runtimeSocketFile)}}}
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, socketVol)
}

// setup mounts for the runtime NRI socket file
nriSocketFile := getContainerEnv(container, "RUNTIME_NRI_SOCKET")
if nriSocketFile == "" {
nriSocketFile = DefaultRuntimeNRISocketFile
}

setContainerEnv(container, "RUNTIME_NRI_SOCKET", DefaultRuntimeNRISocketTargetDir+path.Base(nriSocketFile))

nriVolMountSocketName := "nri-socket"
nriVolMountSocket := corev1.VolumeMount{Name: nriVolMountSocketName, MountPath: DefaultRuntimeNRISocketTargetDir}
container.VolumeMounts = append(container.VolumeMounts, nriVolMountSocket)

nriSocketVol := corev1.Volume{Name: nriVolMountSocketName, VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: path.Dir(nriSocketFile), Type: ptr.To(corev1.HostPathDirectoryOrCreate)}}}
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, nriSocketVol)

return nil
}

Expand Down Expand Up @@ -1511,7 +1531,7 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
return err
}

setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
// setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

// update env required for MIG support
applyMIGConfiguration(devicePluginMainContainer, config.MIG.Strategy)
Expand Down Expand Up @@ -1591,7 +1611,7 @@ func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic
return err
}

setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
// setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

// update env required for MIG support
applyMIGConfiguration(mpsControlMainContainer, config.MIG.Strategy)
Expand Down Expand Up @@ -1699,7 +1719,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
}
}

setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
// setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

// set hostPID if specified for DCGM Exporter
if config.DCGMExporter.IsHostPIDEnabled() {
Expand Down Expand Up @@ -1849,7 +1869,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu
}
}

setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
// setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

return nil
}
Expand Down Expand Up @@ -1891,7 +1911,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
obj.Spec.Template.Spec.Containers[0].Args = config.MIGManager.Args
}

setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
// setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

// set ConfigMap name for "mig-parted-config" Volume
for i, vol := range obj.Spec.Template.Spec.Volumes {
Expand Down Expand Up @@ -2185,7 +2205,7 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
return fmt.Errorf("%v", err)
}

setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)
// setRuntimeClassName(&obj.Spec.Template.Spec, config, n.runtime)

var validatorErr error
// apply changes for individual component validators(initContainers)
Expand Down Expand Up @@ -2559,13 +2579,13 @@ func getRuntimeClassName(config *gpuv1.ClusterPolicySpec) string {
return DefaultRuntimeClass
}

// setRuntimeClassName assigns the runtime class name returned by
// getRuntimeClassName to podSpec.RuntimeClassName. The pod spec is left
// unchanged when CDI is disabled in config and the runtime is CRI-O.
func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec, runtime gpuv1.Runtime) {
	// Equivalent to bailing out on (!CDI enabled && runtime == CRIO); the CDI
	// check is still evaluated first, so short-circuit behavior is unchanged.
	if config.CDI.IsEnabled() || runtime != gpuv1.CRIO {
		className := getRuntimeClassName(config)
		podSpec.RuntimeClassName = &className
	}
}
// func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec, runtime gpuv1.Runtime) {
// if !config.CDI.IsEnabled() && runtime == gpuv1.CRIO {
// return
// }
// runtimeClassName := getRuntimeClassName(config)
// podSpec.RuntimeClassName = &runtimeClassName
//}

func setContainerProbe(container *corev1.Container, probe *gpuv1.ContainerProbeSpec, probeType ContainerProbe) {
var containerProbe *corev1.Probe
Expand Down
Loading
Loading