
Commit 55f7dfd

UPSTREAM: <carry>: Fix: Prevent Available=False during single-replica deployment upgrades in SNO
1 parent 7d2dd62 commit 55f7dfd

5 files changed: +451 −27 lines

staging/operator-lifecycle-manager/pkg/controller/operators/olm/apiservices.go

Lines changed: 129 additions & 13 deletions
@@ -148,7 +148,9 @@ func (a *Operator) checkAPIServiceResources(csv *v1alpha1.ClusterServiceVersion,
 		if serviceAccountName == "" {
 			serviceAccountName = "default"
 		}
-		_, err = a.opClient.KubernetesInterface().CoreV1().ServiceAccounts(deployment.GetNamespace()).Get(context.TODO(), serviceAccountName, metav1.GetOptions{})
+		// Use lister instead of direct API call to avoid timeout issues during SNO upgrades
+		// when the kube-apiserver is temporarily unavailable
+		_, err = a.lister.CoreV1().ServiceAccountLister().ServiceAccounts(deployment.GetNamespace()).Get(serviceAccountName)
 		if err != nil {
 			logger.WithError(err).WithField("serviceaccount", serviceAccountName).Warnf("could not retrieve ServiceAccount")
 			errs = append(errs, err)
@@ -213,9 +215,23 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 		}
 
 		// Check if deployment is being updated or rolling out
-		if deployment.Status.UnavailableReplicas > 0 ||
-			deployment.Status.UpdatedReplicas < deployment.Status.Replicas {
-			a.logger.Debugf("Deployment %s has unavailable replicas, likely due to pod disruption", deploymentSpec.Name)
+		// This includes several scenarios:
+		// 1. UnavailableReplicas > 0: Some replicas are not ready
+		// 2. UpdatedReplicas < Replicas: Rollout in progress
+		// 3. Generation != ObservedGeneration: Spec changed but not yet observed
+		// 4. AvailableReplicas < desired: Not all replicas are available yet
+		isRollingOut := deployment.Status.UnavailableReplicas > 0 ||
+			deployment.Status.UpdatedReplicas < deployment.Status.Replicas ||
+			deployment.Generation != deployment.Status.ObservedGeneration ||
+			deployment.Status.AvailableReplicas < *deployment.Spec.Replicas
+
+		if isRollingOut {
+			a.logger.Debugf("Deployment %s is rolling out or has unavailable replicas (unavailable=%d, updated=%d/%d, available=%d/%d, generation=%d/%d), likely due to pod disruption",
+				deploymentSpec.Name,
+				deployment.Status.UnavailableReplicas,
+				deployment.Status.UpdatedReplicas, deployment.Status.Replicas,
+				deployment.Status.AvailableReplicas, *deployment.Spec.Replicas,
+				deployment.Status.ObservedGeneration, deployment.Generation)
 
 			// Check pod status to confirm disruption
 			selector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector)
@@ -230,6 +246,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				continue
 			}
 
+			// For single-replica deployments during rollout, if no pods exist yet,
+			// this is likely the brief window where the old pod is gone and new pod
+			// hasn't been created yet. This is expected disruption during upgrade.
+			// According to the OpenShift contract: "A component must not report Available=False
+			// during the course of a normal upgrade."
+			if len(pods) == 0 && *deployment.Spec.Replicas == 1 && isRollingOut {
+				a.logger.Infof("Single-replica deployment %s is rolling out with no pods yet - expected disruption during upgrade, will not mark CSV as Failed", deploymentSpec.Name)
+				return true
+			}
+
+			// Track if we found any real failures or expected disruptions
+			foundExpectedDisruption := false
+			foundRealFailure := false
+
 			// Check if any pod is in expected disruption state
 			for _, pod := range pods {
 				// Check how long the pod has been in disrupted state
@@ -244,7 +274,8 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				// Pod is terminating (DeletionTimestamp is set)
 				if pod.DeletionTimestamp != nil {
 					a.logger.Debugf("Pod %s is terminating - expected disruption", pod.Name)
-					return true
+					foundExpectedDisruption = true
+					continue
 				}
 
 				// For pending pods, we need to distinguish between expected disruption
@@ -257,11 +288,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				// Check pod conditions for scheduling issues
 				for _, condition := range pod.Status.Conditions {
 					if condition.Type == corev1.PodScheduled && condition.Status == corev1.ConditionFalse {
-						// If pod has been unschedulable for a while, it's likely a real issue
-						// not a temporary disruption from cluster upgrade
 						if condition.Reason == "Unschedulable" {
-							isRealFailure = true
-							a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
+							// CRITICAL: In single-replica deployments during rollout, Unschedulable is EXPECTED
+							// due to PodAntiAffinity preventing new pod from scheduling while old pod is terminating.
+							// This is especially common in single-node clusters or control plane scenarios.
+							// Per OpenShift contract: "A component must not report Available=False during normal upgrade."
+							if *deployment.Spec.Replicas == 1 && isRollingOut {
+								a.logger.Infof("Pod %s is unschedulable during single-replica rollout - likely PodAntiAffinity conflict, treating as expected disruption", pod.Name)
+								isExpectedDisruption = true
+							} else {
+								// Multi-replica or non-rollout Unschedulable is a real issue
+								isRealFailure = true
+								foundRealFailure = true
+								a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
+							}
 							break
 						}
 					}
@@ -278,6 +318,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 						case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
 							// These are real failures, not temporary disruptions
 							isRealFailure = true
+							foundRealFailure = true
 							a.logger.Debugf("Pod %s has container error %s - real failure, not disruption", pod.Name, reason)
 						}
 					}
@@ -292,6 +333,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 							isExpectedDisruption = true
 						case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
 							isRealFailure = true
+							foundRealFailure = true
 							a.logger.Debugf("Pod %s has init container error %s - real failure, not disruption", pod.Name, reason)
 						}
 					}
@@ -302,17 +344,19 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 					continue
 				}
 
-				// If it's in expected disruption state, return true
+				// If it's in expected disruption state, mark it
 				if isExpectedDisruption {
 					a.logger.Debugf("Pod %s is in expected disruption state", pod.Name)
-					return true
+					foundExpectedDisruption = true
+					continue
 				}
 
 				// If pending without clear container status, check if it's just being scheduled
 				// This could be normal pod creation during node drain
 				if len(pod.Status.ContainerStatuses) == 0 && len(pod.Status.InitContainerStatuses) == 0 {
 					a.logger.Debugf("Pod %s is pending without container statuses - likely being scheduled", pod.Name)
-					return true
+					foundExpectedDisruption = true
+					continue
 				}
 			}
 
@@ -323,14 +367,86 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 						switch reason {
 						case "ContainerCreating", "PodInitializing":
 							a.logger.Debugf("Pod %s container is starting - expected disruption", pod.Name)
-							return true
+							foundExpectedDisruption = true
 						case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff":
 							// Real failures - don't treat as disruption
 							a.logger.Debugf("Pod %s has container error %s - not treating as disruption", pod.Name, reason)
+							foundRealFailure = true
 						}
 					}
 				}
 			}
+
+			// After checking all pods, make a decision
+			// If we found expected disruption and no real failures, treat as disruption
+			if foundExpectedDisruption && !foundRealFailure {
+				a.logger.Infof("Deployment %s has pods in expected disruption state - will not mark CSV as Failed per Available contract", deploymentSpec.Name)
+				return true
+			}
+
+			// For single-replica deployments during rollout, if we found no real failures,
+			// treat as expected disruption to comply with the OpenShift contract:
+			// "A component must not report Available=False during the course of a normal upgrade."
+			// Single-replica deployments inherently have unavailability during rollout,
+			// but this is acceptable and should not trigger Available=False.
+			if !foundRealFailure && *deployment.Spec.Replicas == 1 && isRollingOut {
+				a.logger.Infof("Single-replica deployment %s is rolling out - treating as expected disruption per Available contract", deploymentSpec.Name)
+				return true
+			}
+		}
+	}
+
+	return false
+}
+
+// hasAnyDeploymentInRollout checks if any deployment in the CSV is currently rolling out
+// or has unavailable replicas. This is used to detect potential lister cache sync issues
+// during SNO upgrades where the informer cache may not have synced yet after node reboot.
+func (a *Operator) hasAnyDeploymentInRollout(csv *v1alpha1.ClusterServiceVersion) bool {
+	strategy, err := a.resolver.UnmarshalStrategy(csv.Spec.InstallStrategy)
+	if err != nil {
+		a.logger.Debugf("Unable to unmarshal strategy for CSV %s: %v", csv.Name, err)
+		return false
+	}
+
+	strategyDetailsDeployment, ok := strategy.(*v1alpha1.StrategyDetailsDeployment)
+	if !ok {
+		a.logger.Debugf("CSV %s does not use deployment strategy", csv.Name)
+		return false
+	}
+
+	for _, deploymentSpec := range strategyDetailsDeployment.DeploymentSpecs {
+		deployment, err := a.lister.AppsV1().DeploymentLister().Deployments(csv.Namespace).Get(deploymentSpec.Name)
+		if err != nil {
+			// If we can't get the deployment, it might be a cache sync issue itself
+			// In this case, we should be conservative and assume rollout is happening
+			if apierrors.IsNotFound(err) {
+				a.logger.Debugf("Deployment %s not found in cache, assuming rollout in progress", deploymentSpec.Name)
+				return true
+			}
+			a.logger.Debugf("Error getting deployment %s: %v", deploymentSpec.Name, err)
+			continue
+		}
+
+		// Check if deployment is rolling out or has unavailable replicas
+		// This covers various scenarios during cluster upgrades
+		isRollingOut := deployment.Status.UnavailableReplicas > 0 ||
+			deployment.Status.UpdatedReplicas < deployment.Status.Replicas ||
+			deployment.Generation != deployment.Status.ObservedGeneration
+
+		// Also check if available replicas are less than desired (for single-replica case)
+		if deployment.Spec.Replicas != nil {
+			isRollingOut = isRollingOut || deployment.Status.AvailableReplicas < *deployment.Spec.Replicas
+		}
+
+		if isRollingOut {
+			a.logger.Debugf("Deployment %s is rolling out (unavailable=%d, updated=%d/%d, available=%d, generation=%d/%d)",
+				deploymentSpec.Name,
+				deployment.Status.UnavailableReplicas,
+				deployment.Status.UpdatedReplicas, deployment.Status.Replicas,
+				deployment.Status.AvailableReplicas,
+				deployment.Status.ObservedGeneration, deployment.Generation)
+			return true
 		}
 	}
 
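For quick reference, the core of the change is the rollout-detection predicate used in both functions above. Below is a minimal, self-contained sketch (not part of the commit) that mirrors that four-condition check so it can be exercised in isolation; the standalone isRollingOut helper, the package main wrapper, and the example values are illustrative only, and the nil guard on Spec.Replicas follows the more defensive form used in hasAnyDeploymentInRollout.

package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
)

// isRollingOut mirrors the commit's check: unavailable replicas, an
// in-progress rollout, an unobserved spec change, or fewer available
// replicas than desired all count as "rolling out".
func isRollingOut(d *appsv1.Deployment) bool {
	return d.Status.UnavailableReplicas > 0 ||
		d.Status.UpdatedReplicas < d.Status.Replicas ||
		d.Generation != d.Status.ObservedGeneration ||
		(d.Spec.Replicas != nil && d.Status.AvailableReplicas < *d.Spec.Replicas)
}

func main() {
	one := int32(1)

	// A single-replica deployment in the window the commit targets: the old
	// pod is gone and the replacement pod is not yet available.
	d := &appsv1.Deployment{}
	d.Generation = 2
	d.Spec.Replicas = &one
	d.Status.ObservedGeneration = 2
	d.Status.Replicas = 1
	d.Status.UpdatedReplicas = 1
	d.Status.AvailableReplicas = 0
	d.Status.UnavailableReplicas = 1

	// Prints "true": the commit treats this state as expected disruption
	// rather than grounds for reporting Available=False.
	fmt.Println(isRollingOut(d))
}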