diff --git a/api/v1alpha3/vspherevm_types.go b/api/v1alpha3/vspherevm_types.go index d28cb2361c..fd0e44fb6a 100644 --- a/api/v1alpha3/vspherevm_types.go +++ b/api/v1alpha3/vspherevm_types.go @@ -19,6 +19,7 @@ package v1alpha3 import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/cluster-api/errors" ) const ( @@ -82,6 +83,36 @@ type VSphereVMStatus struct { // network interfaces. // +optional Network []NetworkStatus `json:"network,omitempty"` + + // FailureReason will be set in the event that there is a terminal problem + // reconciling the vspherevm and will contain a succinct value suitable + // for vm interpretation. + // + // This field should not be set for transitive errors that a controller + // faces that are expected to be fixed automatically over + // time (like service outages), but instead indicate that something is + // fundamentally wrong with the vm. + // + // Any transient errors that occur during the reconciliation of vspherevms + // can be added as events to the vspherevm object and/or logged in the + // controller's output. + // +optional + FailureReason *errors.MachineStatusError `json:"failureReason,omitempty"` + + // FailureMessage will be set in the event that there is a terminal problem + // reconciling the vspherevm and will contain a more verbose string suitable + // for logging and human consumption. + // + // This field should not be set for transitive errors that a controller + // faces that are expected to be fixed automatically over + // time (like service outages), but instead indicate that something is + // fundamentally wrong with the vm. + // + // Any transient errors that occur during the reconciliation of vspherevms + // can be added as events to the vspherevm object and/or logged in the + // controller's output. + // +optional + FailureMessage *string `json:"failureMessage,omitempty"` } // +kubebuilder:object:root=true diff --git a/api/v1alpha3/zz_generated.deepcopy.go b/api/v1alpha3/zz_generated.deepcopy.go index df8f1cebbb..51f8af5c42 100644 --- a/api/v1alpha3/zz_generated.deepcopy.go +++ b/api/v1alpha3/zz_generated.deepcopy.go @@ -840,6 +840,16 @@ func (in *VSphereVMStatus) DeepCopyInto(out *VSphereVMStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.FailureReason != nil { + in, out := &in.FailureReason, &out.FailureReason + *out = new(errors.MachineStatusError) + **out = **in + } + if in.FailureMessage != nil { + in, out := &in.FailureMessage, &out.FailureMessage + *out = new(string) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VSphereVMStatus. diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml index c08ada61c5..7ef9875901 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_vspherevms.yaml @@ -286,6 +286,28 @@ spec: to determine the actual type of clone operation used to create this VM. type: string + failureMessage: + description: "FailureMessage will be set in the event that there is + a terminal problem reconciling the vspherevm and will contain a + more verbose string suitable for logging and human consumption. + \n This field should not be set for transitive errors that a controller + faces that are expected to be fixed automatically over time (like + service outages), but instead indicate that something is fundamentally + wrong with the vm. \n Any transient errors that occur during the + reconciliation of vspherevms can be added as events to the vspherevm + object and/or logged in the controller's output." + type: string + failureReason: + description: "FailureReason will be set in the event that there is + a terminal problem reconciling the vspherevm and will contain a + succinct value suitable for vm interpretation. \n This field should + not be set for transitive errors that a controller faces that are + expected to be fixed automatically over time (like service outages), + but instead indicate that something is fundamentally wrong with + the vm. \n Any transient errors that occur during the reconciliation + of vspherevms can be added as events to the vspherevm object and/or + logged in the controller's output." + type: string network: description: Network returns the network status for each of the machine's configured network interfaces. diff --git a/controllers/vspheremachine_controller.go b/controllers/vspheremachine_controller.go index d7a8ebff7f..20712774e9 100644 --- a/controllers/vspheremachine_controller.go +++ b/controllers/vspheremachine_controller.go @@ -243,21 +243,16 @@ func (r machineReconciler) reconcileDeleteVM(ctx *context.MachineContext) error func (r machineReconciler) reconcileDeleteVMPre7(ctx *context.MachineContext) error { // Get ready to find the associated VSphereVM resource. - vm := &infrav1.VSphereVM{} vmKey := apitypes.NamespacedName{ Namespace: ctx.VSphereMachine.Namespace, Name: ctx.Machine.Name, } - + vm, err := r.findVMPre7(ctx) // Attempt to find the associated VSphereVM resource. - if err := ctx.Client.Get(ctx, vmKey, vm); err != nil { - // If an error occurs finding the VSphereVM resource other than - // IsNotFound, then return the error. Otherwise it means the VSphereVM - // is already deleted, and that's okay. - if !apierrors.IsNotFound(err) { - return errors.Wrapf(err, "failed to get VSphereVM %s", vmKey) - } - } else if vm.GetDeletionTimestamp().IsZero() { + if err != nil { + return err + } + if vm.GetDeletionTimestamp().IsZero() { // If the VSphereVM was found and it's not already enqueued for // deletion, go ahead and attempt to delete it. if err := ctx.Client.Delete(ctx, vm); err != nil { @@ -274,7 +269,33 @@ func (r machineReconciler) reconcileDeleteVMPre7(ctx *context.MachineContext) er return nil } +func (r machineReconciler) findVMPre7(ctx *context.MachineContext) (*infrav1.VSphereVM, error) { + // Get ready to find the associated VSphereVM resource. + vm := &infrav1.VSphereVM{} + vmKey := apitypes.NamespacedName{ + Namespace: ctx.VSphereMachine.Namespace, + Name: ctx.Machine.Name, + } + // Attempt to find the associated VSphereVM resource. + if err := ctx.Client.Get(ctx, vmKey, vm); err != nil { + return nil, err + } + return vm, nil +} + func (r machineReconciler) reconcileNormal(ctx *context.MachineContext) (reconcile.Result, error) { + vsphereVM, err := r.findVMPre7(ctx) + if err != nil { + if !apierrors.IsNotFound(err) { + return reconcile.Result{}, err + } + } + if vsphereVM != nil { + // Reconcile VSphereMachine's failures + ctx.VSphereMachine.Status.FailureReason = vsphereVM.Status.FailureReason + ctx.VSphereMachine.Status.FailureMessage = vsphereVM.Status.FailureMessage + } + // If the VSphereMachine is in an error state, return early. if ctx.VSphereMachine.Status.FailureReason != nil || ctx.VSphereMachine.Status.FailureMessage != nil { ctx.Logger.Info("Error state detected, skipping reconciliation") diff --git a/controllers/vspherevm_controller.go b/controllers/vspherevm_controller.go index 25821608df..9e37f67066 100644 --- a/controllers/vspherevm_controller.go +++ b/controllers/vspherevm_controller.go @@ -88,8 +88,8 @@ type vmReconciler struct { } // Reconcile ensures the back-end state reflects the Kubernetes resource state intent. +// nolint:gocognit func (r vmReconciler) Reconcile(req ctrl.Request) (_ ctrl.Result, reterr error) { - // Get the VSphereVM resource for this request. vsphereVM := &infrav1.VSphereVM{} if err := r.Client.Get(r, req.NamespacedName, vsphereVM); err != nil { @@ -262,6 +262,11 @@ func (r vmReconciler) reconcileDelete(ctx *context.VMContext) (reconcile.Result, } func (r vmReconciler) reconcileNormal(ctx *context.VMContext) (reconcile.Result, error) { + + if ctx.VSphereVM.Status.FailureReason != nil || ctx.VSphereVM.Status.FailureMessage != nil { + r.Logger.Info("VM is failed, won't reconcile", "namespace", ctx.VSphereVM.Namespace, "name", ctx.VSphereVM.Name) + return reconcile.Result{}, nil + } // If the VSphereVM doesn't have our finalizer, add it. ctrlutil.AddFinalizer(ctx.VSphereVM, infrav1.VMFinalizer) diff --git a/pkg/services/govmomi/errors.go b/pkg/services/govmomi/errors.go index 07166dea25..df20dd98c4 100644 --- a/pkg/services/govmomi/errors.go +++ b/pkg/services/govmomi/errors.go @@ -39,3 +39,15 @@ func isNotFound(err error) bool { return false } } + +func wasNotFoundByBIOSUUID(err error) bool { + switch err.(type) { + case errNotFound, *errNotFound: + if err.(errNotFound).uuid != "" && !err.(errNotFound).instanceUUID { + return true + } + return false + default: + return false + } +} diff --git a/pkg/services/govmomi/service.go b/pkg/services/govmomi/service.go index 0dd57b7035..63f7e3adcd 100644 --- a/pkg/services/govmomi/service.go +++ b/pkg/services/govmomi/service.go @@ -18,6 +18,7 @@ package govmomi import ( "encoding/base64" + "fmt" "github.com/pkg/errors" @@ -27,6 +28,8 @@ import ( "github.com/vmware/govmomi/vim25/types" corev1 "k8s.io/api/core/v1" apitypes "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/pointer" + capierrors "sigs.k8s.io/cluster-api/errors" infrav1 "sigs.k8s.io/cluster-api-provider-vsphere/api/v1alpha3" "sigs.k8s.io/cluster-api-provider-vsphere/pkg/context" @@ -69,8 +72,15 @@ func (vms *VMService) ReconcileVM(ctx *context.VMContext) (vm infrav1.VirtualMac if !isNotFound(err) { return vm, err } - // If VM's MoRef could not be found then the VM does not exist, - // and the VM should be created. + + // If the machine was not found by BIOS UUID it means that it got deleted from vcenter directly + if wasNotFoundByBIOSUUID(err) { + ctx.VSphereVM.Status.FailureReason = capierrors.MachineStatusErrorPtr(capierrors.UpdateMachineError) + ctx.VSphereVM.Status.FailureMessage = pointer.StringPtr(fmt.Sprintf("Unable to find VM by BIOS UUID %s. The vm was removed from infra", ctx.VSphereVM.Spec.BiosUUID)) + return vm, err + } + + // Otherwise, this is a new machine and the the VM should be created. // Get the bootstrap data. bootstrapData, err := vms.getBootstrapData(ctx)