
Commit 55f7dfd

UPSTREAM: <carry>: Fix: Prevent Available=False during single-replica deployment upgrades in SNO
1 parent 7d2dd62 commit 55f7dfd

5 files changed: +451 −27 lines

staging/operator-lifecycle-manager/pkg/controller/operators/olm/apiservices.go

Lines changed: 129 additions & 13 deletions
@@ -148,7 +148,9 @@ func (a *Operator) checkAPIServiceResources(csv *v1alpha1.ClusterServiceVersion,
 		if serviceAccountName == "" {
 			serviceAccountName = "default"
 		}
-		_, err = a.opClient.KubernetesInterface().CoreV1().ServiceAccounts(deployment.GetNamespace()).Get(context.TODO(), serviceAccountName, metav1.GetOptions{})
+		// Use lister instead of direct API call to avoid timeout issues during SNO upgrades
+		// when the kube-apiserver is temporarily unavailable
+		_, err = a.lister.CoreV1().ServiceAccountLister().ServiceAccounts(deployment.GetNamespace()).Get(serviceAccountName)
 		if err != nil {
 			logger.WithError(err).WithField("serviceaccount", serviceAccountName).Warnf("could not retrieve ServiceAccount")
 			errs = append(errs, err)
@@ -213,9 +215,23 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 		}
 
 		// Check if deployment is being updated or rolling out
-		if deployment.Status.UnavailableReplicas > 0 ||
-			deployment.Status.UpdatedReplicas < deployment.Status.Replicas {
-			a.logger.Debugf("Deployment %s has unavailable replicas, likely due to pod disruption", deploymentSpec.Name)
+		// This includes several scenarios:
+		// 1. UnavailableReplicas > 0: Some replicas are not ready
+		// 2. UpdatedReplicas < Replicas: Rollout in progress
+		// 3. Generation != ObservedGeneration: Spec changed but not yet observed
+		// 4. AvailableReplicas < desired: Not all replicas are available yet
+		isRollingOut := deployment.Status.UnavailableReplicas > 0 ||
+			deployment.Status.UpdatedReplicas < deployment.Status.Replicas ||
+			deployment.Generation != deployment.Status.ObservedGeneration ||
+			deployment.Status.AvailableReplicas < *deployment.Spec.Replicas
+
+		if isRollingOut {
+			a.logger.Debugf("Deployment %s is rolling out or has unavailable replicas (unavailable=%d, updated=%d/%d, available=%d/%d, generation=%d/%d), likely due to pod disruption",
+				deploymentSpec.Name,
+				deployment.Status.UnavailableReplicas,
+				deployment.Status.UpdatedReplicas, deployment.Status.Replicas,
+				deployment.Status.AvailableReplicas, *deployment.Spec.Replicas,
+				deployment.Status.ObservedGeneration, deployment.Generation)
 
 			// Check pod status to confirm disruption
 			selector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector)
@@ -230,6 +246,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				continue
 			}
 
+			// For single-replica deployments during rollout, if no pods exist yet,
+			// this is likely the brief window where the old pod is gone and new pod
+			// hasn't been created yet. This is expected disruption during upgrade.
+			// According to the OpenShift contract: "A component must not report Available=False
+			// during the course of a normal upgrade."
+			if len(pods) == 0 && *deployment.Spec.Replicas == 1 && isRollingOut {
+				a.logger.Infof("Single-replica deployment %s is rolling out with no pods yet - expected disruption during upgrade, will not mark CSV as Failed", deploymentSpec.Name)
+				return true
+			}
+
+			// Track if we found any real failures or expected disruptions
+			foundExpectedDisruption := false
+			foundRealFailure := false
+
 			// Check if any pod is in expected disruption state
 			for _, pod := range pods {
 				// Check how long the pod has been in disrupted state
@@ -244,7 +274,8 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				// Pod is terminating (DeletionTimestamp is set)
 				if pod.DeletionTimestamp != nil {
 					a.logger.Debugf("Pod %s is terminating - expected disruption", pod.Name)
-					return true
+					foundExpectedDisruption = true
+					continue
 				}
 
 				// For pending pods, we need to distinguish between expected disruption
@@ -257,11 +288,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				// Check pod conditions for scheduling issues
 				for _, condition := range pod.Status.Conditions {
 					if condition.Type == corev1.PodScheduled && condition.Status == corev1.ConditionFalse {
-						// If pod has been unschedulable for a while, it's likely a real issue
-						// not a temporary disruption from cluster upgrade
 						if condition.Reason == "Unschedulable" {
-							isRealFailure = true
-							a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
+							// CRITICAL: In single-replica deployments during rollout, Unschedulable is EXPECTED
+							// due to PodAntiAffinity preventing new pod from scheduling while old pod is terminating.
+							// This is especially common in single-node clusters or control plane scenarios.
+							// Per OpenShift contract: "A component must not report Available=False during normal upgrade."
+							if *deployment.Spec.Replicas == 1 && isRollingOut {
+								a.logger.Infof("Pod %s is unschedulable during single-replica rollout - likely PodAntiAffinity conflict, treating as expected disruption", pod.Name)
+								isExpectedDisruption = true
+							} else {
+								// Multi-replica or non-rollout Unschedulable is a real issue
+								isRealFailure = true
+								foundRealFailure = true
+								a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
+							}
 							break
 						}
 					}
@@ -278,6 +318,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 						case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
 							// These are real failures, not temporary disruptions
 							isRealFailure = true
+							foundRealFailure = true
 							a.logger.Debugf("Pod %s has container error %s - real failure, not disruption", pod.Name, reason)
 						}
 					}
@@ -292,6 +333,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 							isExpectedDisruption = true
 						case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
 							isRealFailure = true
+							foundRealFailure = true
 							a.logger.Debugf("Pod %s has init container error %s - real failure, not disruption", pod.Name, reason)
 						}
 					}
@@ -302,17 +344,19 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 					continue
 				}
 
-				// If it's in expected disruption state, return true
+				// If it's in expected disruption state, mark it
 				if isExpectedDisruption {
 					a.logger.Debugf("Pod %s is in expected disruption state", pod.Name)
-					return true
+					foundExpectedDisruption = true
+					continue
 				}
 
 				// If pending without clear container status, check if it's just being scheduled
 				// This could be normal pod creation during node drain
 				if len(pod.Status.ContainerStatuses) == 0 && len(pod.Status.InitContainerStatuses) == 0 {
 					a.logger.Debugf("Pod %s is pending without container statuses - likely being scheduled", pod.Name)
-					return true
+					foundExpectedDisruption = true
+					continue
 				}
 			}
 
@@ -323,14 +367,86 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 						switch reason {
 						case "ContainerCreating", "PodInitializing":
 							a.logger.Debugf("Pod %s container is starting - expected disruption", pod.Name)
-							return true
+							foundExpectedDisruption = true
 						case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff":
 							// Real failures - don't treat as disruption
 							a.logger.Debugf("Pod %s has container error %s - not treating as disruption", pod.Name, reason)
+							foundRealFailure = true
 						}
 					}
 				}
 			}
+
+			// After checking all pods, make a decision
+			// If we found expected disruption and no real failures, treat as disruption
+			if foundExpectedDisruption && !foundRealFailure {
+				a.logger.Infof("Deployment %s has pods in expected disruption state - will not mark CSV as Failed per Available contract", deploymentSpec.Name)
+				return true
+			}
+
+			// For single-replica deployments during rollout, if we found no real failures,
+			// treat as expected disruption to comply with the OpenShift contract:
+			// "A component must not report Available=False during the course of a normal upgrade."
+			// Single-replica deployments inherently have unavailability during rollout,
+			// but this is acceptable and should not trigger Available=False.
+			if !foundRealFailure && *deployment.Spec.Replicas == 1 && isRollingOut {
+				a.logger.Infof("Single-replica deployment %s is rolling out - treating as expected disruption per Available contract", deploymentSpec.Name)
+				return true
+			}
+		}
+	}
+
+	return false
+}
+
+// hasAnyDeploymentInRollout checks if any deployment in the CSV is currently rolling out
+// or has unavailable replicas. This is used to detect potential lister cache sync issues
+// during SNO upgrades where the informer cache may not have synced yet after node reboot.
+func (a *Operator) hasAnyDeploymentInRollout(csv *v1alpha1.ClusterServiceVersion) bool {
+	strategy, err := a.resolver.UnmarshalStrategy(csv.Spec.InstallStrategy)
+	if err != nil {
+		a.logger.Debugf("Unable to unmarshal strategy for CSV %s: %v", csv.Name, err)
+		return false
+	}
+
+	strategyDetailsDeployment, ok := strategy.(*v1alpha1.StrategyDetailsDeployment)
+	if !ok {
+		a.logger.Debugf("CSV %s does not use deployment strategy", csv.Name)
+		return false
+	}
+
+	for _, deploymentSpec := range strategyDetailsDeployment.DeploymentSpecs {
+		deployment, err := a.lister.AppsV1().DeploymentLister().Deployments(csv.Namespace).Get(deploymentSpec.Name)
+		if err != nil {
+			// If we can't get the deployment, it might be a cache sync issue itself
+			// In this case, we should be conservative and assume rollout is happening
+			if apierrors.IsNotFound(err) {
+				a.logger.Debugf("Deployment %s not found in cache, assuming rollout in progress", deploymentSpec.Name)
+				return true
+			}
+			a.logger.Debugf("Error getting deployment %s: %v", deploymentSpec.Name, err)
+			continue
+		}
+
+		// Check if deployment is rolling out or has unavailable replicas
+		// This covers various scenarios during cluster upgrades
+		isRollingOut := deployment.Status.UnavailableReplicas > 0 ||
+			deployment.Status.UpdatedReplicas < deployment.Status.Replicas ||
+			deployment.Generation != deployment.Status.ObservedGeneration
+
+		// Also check if available replicas are less than desired (for single-replica case)
+		if deployment.Spec.Replicas != nil {
+			isRollingOut = isRollingOut || deployment.Status.AvailableReplicas < *deployment.Spec.Replicas
+		}
+
+		if isRollingOut {
+			a.logger.Debugf("Deployment %s is rolling out (unavailable=%d, updated=%d/%d, available=%d, generation=%d/%d)",
+				deploymentSpec.Name,
+				deployment.Status.UnavailableReplicas,
+				deployment.Status.UpdatedReplicas, deployment.Status.Replicas,
+				deployment.Status.AvailableReplicas,
+				deployment.Status.ObservedGeneration, deployment.Generation)
+			return true
 		}
 	}
 
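For quick reference, the core of the change is the rollout-detection predicate used in both functions above. Below is a minimal, self-contained sketch (not part of the commit) that mirrors that four-condition check so it can be exercised in isolation; the standalone isRollingOut helper, the package main wrapper, and the example values are illustrative only, and the nil guard on Spec.Replicas follows the more defensive form used in hasAnyDeploymentInRollout.

package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
)

// isRollingOut mirrors the commit's check: unavailable replicas, an
// in-progress rollout, an unobserved spec change, or fewer available
// replicas than desired all count as "rolling out".
func isRollingOut(d *appsv1.Deployment) bool {
	return d.Status.UnavailableReplicas > 0 ||
		d.Status.UpdatedReplicas < d.Status.Replicas ||
		d.Generation != d.Status.ObservedGeneration ||
		(d.Spec.Replicas != nil && d.Status.AvailableReplicas < *d.Spec.Replicas)
}

func main() {
	one := int32(1)

	// A single-replica deployment in the window the commit targets: the old
	// pod is gone and the replacement pod is not yet available.
	d := &appsv1.Deployment{}
	d.Generation = 2
	d.Spec.Replicas = &one
	d.Status.ObservedGeneration = 2
	d.Status.Replicas = 1
	d.Status.UpdatedReplicas = 1
	d.Status.AvailableReplicas = 0
	d.Status.UnavailableReplicas = 1

	// Prints "true": the commit treats this state as expected disruption
	// rather than grounds for reporting Available=False.
	fmt.Println(isRollingOut(d))
}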