@@ -148,7 +148,9 @@ func (a *Operator) checkAPIServiceResources(csv *v1alpha1.ClusterServiceVersion,
 		if serviceAccountName == "" {
 			serviceAccountName = "default"
 		}
-		_, err = a.opClient.KubernetesInterface().CoreV1().ServiceAccounts(deployment.GetNamespace()).Get(context.TODO(), serviceAccountName, metav1.GetOptions{})
+		// Use lister instead of direct API call to avoid timeout issues during SNO upgrades
+		// when the kube-apiserver is temporarily unavailable
+		_, err = a.lister.CoreV1().ServiceAccountLister().ServiceAccounts(deployment.GetNamespace()).Get(serviceAccountName)
 		if err != nil {
 			logger.WithError(err).WithField("serviceaccount", serviceAccountName).Warnf("could not retrieve ServiceAccount")
 			errs = append(errs, err)
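
The lister swap above is easier to follow with a minimal, self-contained sketch of how an informer-backed lister is wired with client-go; this is not the OLM wiring, and the helper name newServiceAccountLister is illustrative:

package sketch

import (
	"time"

	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	corelisters "k8s.io/client-go/listers/core/v1"
)

// newServiceAccountLister starts a shared informer and returns a lister whose
// Get reads from the synced local cache rather than calling the apiserver.
// Illustrative only; OLM wires its listers differently.
func newServiceAccountLister(client kubernetes.Interface, stopCh <-chan struct{}) corelisters.ServiceAccountLister {
	factory := informers.NewSharedInformerFactory(client, 10*time.Minute)
	lister := factory.Core().V1().ServiceAccounts().Lister()
	factory.Start(stopCh)
	factory.WaitForCacheSync(stopCh)
	return lister
}

// Usage: sa, err := lister.ServiceAccounts(namespace).Get(name)

Because the lister answers Get from the synced local cache, the lookup keeps working while the kube-apiserver is briefly unreachable, which is the SNO-upgrade failure mode the new comment describes.
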
@@ -213,9 +215,23 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 		}

 		// Check if deployment is being updated or rolling out
-		if deployment.Status.UnavailableReplicas > 0 ||
-			deployment.Status.UpdatedReplicas < deployment.Status.Replicas {
-			a.logger.Debugf("Deployment %s has unavailable replicas, likely due to pod disruption", deploymentSpec.Name)
+		// This includes several scenarios:
+		// 1. UnavailableReplicas > 0: Some replicas are not ready
+		// 2. UpdatedReplicas < Replicas: Rollout in progress
+		// 3. Generation != ObservedGeneration: Spec changed but not yet observed
+		// 4. AvailableReplicas < desired: Not all replicas are available yet
+		isRollingOut := deployment.Status.UnavailableReplicas > 0 ||
+			deployment.Status.UpdatedReplicas < deployment.Status.Replicas ||
+			deployment.Generation != deployment.Status.ObservedGeneration ||
+			deployment.Status.AvailableReplicas < *deployment.Spec.Replicas
+
+		if isRollingOut {
+			a.logger.Debugf("Deployment %s is rolling out or has unavailable replicas (unavailable=%d, updated=%d/%d, available=%d/%d, generation=%d/%d), likely due to pod disruption",
+				deploymentSpec.Name,
+				deployment.Status.UnavailableReplicas,
+				deployment.Status.UpdatedReplicas, deployment.Status.Replicas,
+				deployment.Status.AvailableReplicas, *deployment.Spec.Replicas,
+				deployment.Status.ObservedGeneration, deployment.Generation)

 			// Check pod status to confirm disruption
 			selector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector)
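
Read on its own, the new condition amounts to a small predicate; the sketch below is an illustrative restatement (the helper name and the nil guard on Spec.Replicas are additions, the hunk above dereferences the pointer directly):

package sketch

import appsv1 "k8s.io/api/apps/v1"

// deploymentIsRollingOut mirrors the rollout check above: any unavailable
// replica, an in-progress update, an unobserved spec change, or fewer
// available replicas than desired counts as "rolling out".
func deploymentIsRollingOut(d *appsv1.Deployment) bool {
	desired := int32(1) // Kubernetes defaults Spec.Replicas to 1 when unset
	if d.Spec.Replicas != nil {
		desired = *d.Spec.Replicas
	}
	return d.Status.UnavailableReplicas > 0 ||
		d.Status.UpdatedReplicas < d.Status.Replicas ||
		d.Generation != d.Status.ObservedGeneration ||
		d.Status.AvailableReplicas < desired
}
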
@@ -230,6 +246,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				continue
 			}

+			// For single-replica deployments during rollout, if no pods exist yet,
+			// this is likely the brief window where the old pod is gone and new pod
+			// hasn't been created yet. This is expected disruption during upgrade.
+			// According to the OpenShift contract: "A component must not report Available=False
+			// during the course of a normal upgrade."
+			if len(pods) == 0 && *deployment.Spec.Replicas == 1 && isRollingOut {
+				a.logger.Infof("Single-replica deployment %s is rolling out with no pods yet - expected disruption during upgrade, will not mark CSV as Failed", deploymentSpec.Name)
+				return true
+			}
+
+			// Track if we found any real failures or expected disruptions
+			foundExpectedDisruption := false
+			foundRealFailure := false
+
 			// Check if any pod is in expected disruption state
 			for _, pod := range pods {
 				// Check how long the pod has been in disrupted state
@@ -244,7 +274,8 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				// Pod is terminating (DeletionTimestamp is set)
 				if pod.DeletionTimestamp != nil {
 					a.logger.Debugf("Pod %s is terminating - expected disruption", pod.Name)
-					return true
+					foundExpectedDisruption = true
+					continue
 				}

 				// For pending pods, we need to distinguish between expected disruption
@@ -257,11 +288,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 					// Check pod conditions for scheduling issues
 					for _, condition := range pod.Status.Conditions {
 						if condition.Type == corev1.PodScheduled && condition.Status == corev1.ConditionFalse {
-							// If pod has been unschedulable for a while, it's likely a real issue
-							// not a temporary disruption from cluster upgrade
 							if condition.Reason == "Unschedulable" {
-								isRealFailure = true
-								a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
+								// CRITICAL: In single-replica deployments during rollout, Unschedulable is EXPECTED
+								// due to PodAntiAffinity preventing new pod from scheduling while old pod is terminating.
+								// This is especially common in single-node clusters or control plane scenarios.
+								// Per OpenShift contract: "A component must not report Available=False during normal upgrade."
+								if *deployment.Spec.Replicas == 1 && isRollingOut {
+									a.logger.Infof("Pod %s is unschedulable during single-replica rollout - likely PodAntiAffinity conflict, treating as expected disruption", pod.Name)
+									isExpectedDisruption = true
+								} else {
+									// Multi-replica or non-rollout Unschedulable is a real issue
+									isRealFailure = true
+									foundRealFailure = true
+									a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
+								}
 								break
 							}
 						}
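
The PodAntiAffinity conflict mentioned in the comment comes from a rule shaped roughly like the following sketch (labels and values are illustrative, not taken from this patch): with required anti-affinity on the hostname topology key, the surge pod of a single-replica rolling update cannot schedule on a single-node cluster until the old pod is gone.

package sketch

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// exampleAntiAffinity returns a pod anti-affinity rule that forbids two pods
// with the same label from sharing a node. On a single-node cluster this
// keeps the replacement pod Unschedulable while the old pod still exists.
func exampleAntiAffinity() *corev1.Affinity {
	return &corev1.Affinity{
		PodAntiAffinity: &corev1.PodAntiAffinity{
			RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{{
				LabelSelector: &metav1.LabelSelector{
					MatchLabels: map[string]string{"app": "example-apiservice"}, // illustrative label
				},
				TopologyKey: "kubernetes.io/hostname",
			}},
		},
	}
}
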
@@ -278,6 +318,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 							case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
 								// These are real failures, not temporary disruptions
 								isRealFailure = true
+								foundRealFailure = true
 								a.logger.Debugf("Pod %s has container error %s - real failure, not disruption", pod.Name, reason)
 							}
 						}
@@ -292,6 +333,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 								isExpectedDisruption = true
 							case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
 								isRealFailure = true
+								foundRealFailure = true
 								a.logger.Debugf("Pod %s has init container error %s - real failure, not disruption", pod.Name, reason)
 							}
 						}
@@ -302,17 +344,19 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 						continue
 					}

-					// If it's in expected disruption state, return true
+					// If it's in expected disruption state, mark it
 					if isExpectedDisruption {
 						a.logger.Debugf("Pod %s is in expected disruption state", pod.Name)
-						return true
+						foundExpectedDisruption = true
+						continue
 					}

 					// If pending without clear container status, check if it's just being scheduled
 					// This could be normal pod creation during node drain
 					if len(pod.Status.ContainerStatuses) == 0 && len(pod.Status.InitContainerStatuses) == 0 {
 						a.logger.Debugf("Pod %s is pending without container statuses - likely being scheduled", pod.Name)
-						return true
+						foundExpectedDisruption = true
+						continue
 					}
 				}

@@ -323,14 +367,85 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 						switch reason {
 						case "ContainerCreating", "PodInitializing":
 							a.logger.Debugf("Pod %s container is starting - expected disruption", pod.Name)
-							return true
+							foundExpectedDisruption = true
 						case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff":
 							// Real failures - don't treat as disruption
 							a.logger.Debugf("Pod %s has container error %s - not treating as disruption", pod.Name, reason)
+							foundRealFailure = true
 						}
 					}
 				}
 			}
+
+			// After checking all pods, make a decision
+			// If we found expected disruption and no real failures, treat as disruption
+			if foundExpectedDisruption && !foundRealFailure {
+				a.logger.Infof("Deployment %s has pods in expected disruption state - will not mark CSV as Failed per Available contract", deploymentSpec.Name)
+				return true
+			}
+
+			// For single-replica deployments during rollout, if we found no real failures,
+			// treat as expected disruption to comply with the OpenShift contract:
+			// "A component must not report Available=False during the course of a normal upgrade."
+			// Single-replica deployments inherently have unavailability during rollout,
+			// but this is acceptable and should not trigger Available=False.
+			if !foundRealFailure && *deployment.Spec.Replicas == 1 && isRollingOut {
+				a.logger.Infof("Single-replica deployment %s is rolling out - treating as expected disruption per Available contract", deploymentSpec.Name)
+				return true
+			}
+		}
+	}
+
+	return false
+}
+
+// hasAnyDeploymentInRollout checks if any deployment in the CSV is currently rolling out
+// or has unavailable replicas. This is used to detect potential lister cache sync issues
+// during SNO upgrades where the informer cache may not have synced yet after node reboot.
+func (a *Operator) hasAnyDeploymentInRollout(csv *v1alpha1.ClusterServiceVersion) bool {
+	strategy, err := a.resolver.UnmarshalStrategy(csv.Spec.InstallStrategy)
+	if err != nil {
+		a.logger.Debugf("Unable to unmarshal strategy for CSV %s: %v", csv.Name, err)
+		return false
+	}
+
+	strategyDetailsDeployment, ok := strategy.(*v1alpha1.StrategyDetailsDeployment)
+	if !ok {
+		a.logger.Debugf("CSV %s does not use deployment strategy", csv.Name)
+		return false
+	}
+
+	for _, deploymentSpec := range strategyDetailsDeployment.DeploymentSpecs {
+		deployment, err := a.lister.AppsV1().DeploymentLister().Deployments(csv.Namespace).Get(deploymentSpec.Name)
+		if err != nil {
+			// If deployment is not found, it could be:
+			// 1. A real error (deployment never existed or was deleted)
+			// 2. Cache sync issue after node reboot
+			// We can't distinguish between these cases here, so we don't assume rollout.
+			// The caller should handle this case appropriately.
+			a.logger.Debugf("Error getting deployment %s: %v", deploymentSpec.Name, err)
+			continue
+		}
+
+		// Check if deployment is rolling out or has unavailable replicas
+		// This covers various scenarios during cluster upgrades
+		isRollingOut := deployment.Status.UnavailableReplicas > 0 ||
+			deployment.Status.UpdatedReplicas < deployment.Status.Replicas ||
+			deployment.Generation != deployment.Status.ObservedGeneration
+
+		// Also check if available replicas are less than desired (for single-replica case)
+		if deployment.Spec.Replicas != nil {
+			isRollingOut = isRollingOut || deployment.Status.AvailableReplicas < *deployment.Spec.Replicas
+		}
+
+		if isRollingOut {
+			a.logger.Debugf("Deployment %s is rolling out (unavailable=%d, updated=%d/%d, available=%d, generation=%d/%d)",
+				deploymentSpec.Name,
+				deployment.Status.UnavailableReplicas,
+				deployment.Status.UpdatedReplicas, deployment.Status.Replicas,
+				deployment.Status.AvailableReplicas,
+				deployment.Status.ObservedGeneration, deployment.Generation)
+			return true
 		}
 	}

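
The doc comment notes that the caller is expected to interpret a missing deployment itself; a hypothetical call-site, not shown in this diff, could combine a NotFound error from the lister with this helper along the following lines:

package sketch

import apierrors "k8s.io/apimachinery/pkg/api/errors"

// retryableLookupFailure is a hypothetical helper sketching the intended use:
// a NotFound from the lister while some CSV deployment is still rolling out is
// treated as a possibly stale cache after a node reboot (retry later) rather
// than a terminal failure. anyDeploymentRollingOut would be the result of
// hasAnyDeploymentInRollout for the same CSV.
func retryableLookupFailure(err error, anyDeploymentRollingOut bool) bool {
	return apierrors.IsNotFound(err) && anyDeploymentRollingOut
}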