sled agent: split ensure into "register" and "ensure state" APIs
Split the sled agent's `/instances/{id}` PUT endpoint into two endpoints:

- A PUT to `/instances/{id}` "registers" an instance with a sled. This creates
  a record for the instance in the sled agent's instance manager, but does not
  start its Propolis and does not try to drive the instance to any particular
  state.
- A PUT to `/instances/{id}/state` attempts to change the state of a
  previously-registered instance's VM by starting it, stopping it, rebooting
  it, initializing it via live migration, or unceremoniously destroying it.
  (This last case is meant as a safety valve that lets Nexus get an
  unresponsive Propolis off a sled.)

This allows the instance create saga to avoid a class of problems in which an
instance starts, stops (due to user input to the VM), and then is errantly
restarted by a replayed saga step. Because sled agent only accepts requests to
run a registered instance, and stopping an instance unregisters it, a replayed
"run this VM" saga node won't restart the VM. The migration saga is vulnerable
to a similar class of problem, so this groundwork is necessary to write that
saga correctly.
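
A minimal sketch of the resulting call sequence, written against the
`sled_agent_client` types that appear in this diff. `instance_register` is a
hypothetical name for the generated client method behind the registration PUT
(the hunks below elide it), and `hardware` stands for the `InstanceHardware`
that `instance_ensure_registered` assembles:

use sled_agent_client::types::{
    InstanceEnsureBody, InstanceHardware, InstancePutStateBody,
    InstanceStateRequested,
};
use sled_agent_client::Client as SledAgentClient;
use uuid::Uuid;

async fn register_and_start(
    sa: &SledAgentClient,
    id: Uuid,
    hardware: InstanceHardware,
) -> Result<(), sled_agent_client::Error<sled_agent_client::types::Error>> {
    // Register only: the sled agent records the instance in its instance
    // manager but starts no Propolis.
    sa.instance_register(
        &id,
        &InstanceEnsureBody { initial: hardware, migrate: None },
    )
    .await?;

    // Separately drive the registered instance to Running.
    sa.instance_put_state(
        &id,
        &InstancePutStateBody { state: InstanceStateRequested::Running },
    )
    .await?;

    // If the guest stops itself after this point, the sled agent unregisters
    // the instance, so a replayed copy of the "run this VM" step above gets
    // an error instead of errantly restarting the VM.
    Ok(())
}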

A secondary benefit of this change is that operations on running instances (like
"stop" and "reboot") no longer need to construct an (unused) `InstanceHardware`
to pass to the sled agent's ensure endpoint.
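
Concretely, a stop reduces to a one-line call whose body carries nothing but
the desired state (again a hedged sketch; `Stopped` is assumed to be the
relevant `InstanceStateRequested` variant, which the hunks below elide):

use sled_agent_client::types::{InstancePutStateBody, InstanceStateRequested};
use sled_agent_client::Client as SledAgentClient;
use uuid::Uuid;

// Stop an instance without assembling an InstanceHardware.
async fn stop_instance(
    sa: &SledAgentClient,
    id: Uuid,
) -> Result<(), sled_agent_client::Error<sled_agent_client::types::Error>> {
    sa.instance_put_state(
        &id,
        &InstancePutStateBody { state: InstanceStateRequested::Stopped },
    )
    .await?;
    Ok(())
}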

Update the simulated sled agent to support these APIs, update callers in Nexus
to use them, and split the instance create saga's "instance ensure" step into
two steps as described above. This requires some extra affordances in simulated
collections to support simulated disks, since instance state changes no longer
go through a path where an instance's hardware manifest is available.

Finally, add some Nexus logging to record the CRDB updates that Nexus applies
when a call to sled agent produces a new `InstanceRuntimeState`, since this
information is handy for debugging.

Tested: cargo test; installed Omicron locally and played around with some
instances.
gjcolombo committed Apr 4, 2023
1 parent 4e172d0 commit ff46bd7
Showing 14 changed files with 1,001 additions and 417 deletions.
137 changes: 106 additions & 31 deletions nexus/src/app/instance.rs
@@ -37,6 +37,7 @@ use omicron_common::api::external::UpdateResult;
use omicron_common::api::external::Vni;
use omicron_common::api::internal::nexus;
use ref_cast::RefCast;
use sled_agent_client::types::InstancePutStateBody;
use sled_agent_client::types::InstanceStateRequested;
use sled_agent_client::types::SourceNatConfig;
use sled_agent_client::Client as SledAgentClient;
@@ -329,19 +330,8 @@ impl super::Nexus {
opctx: &OpContext,
instance_lookup: &lookup::Instance<'_>,
) -> UpdateResult<db::model::Instance> {
// To implement reboot, we issue a call to the sled agent to set a
// runtime state of "reboot". We cannot simply stop the Instance and
// start it again here because if we crash in the meantime, we might
// leave it stopped.
//
// When an instance is rebooted, the "rebooting" flag remains set on
// the runtime state as it transitions to "Stopping" and "Stopped".
// This flag is cleared when the state goes to "Starting". This way,
// even if the whole rack powered off while this was going on, we would
// never lose track of the fact that this Instance was supposed to be
// running.
let (.., authz_instance, db_instance) = instance_lookup.fetch().await?;
self.instance_set_runtime(
self.instance_request_state(
opctx,
&authz_instance,
&db_instance,
@@ -357,8 +347,26 @@
opctx: &OpContext,
instance_lookup: &lookup::Instance<'_>,
) -> UpdateResult<db::model::Instance> {
let (.., authz_instance, db_instance) = instance_lookup.fetch().await?;
self.instance_set_runtime(
// TODO-correctness: This needs to be a saga for crash resiliency
// purposes (otherwise the instance can be leaked if Nexus crashes
// between registration and instance start).
let (.., authz_instance, mut db_instance) =
instance_lookup.fetch().await?;

// The instance is not really being "created" (it already exists from
// the caller's perspective), but if it does not exist on its sled, the
// target sled agent will populate its instance manager with the
// contents of this modified record, and that record needs to allow a
// transition to the Starting state.
let new_runtime = nexus_db_model::InstanceRuntimeState {
state: nexus_db_model::InstanceState(InstanceState::Creating),
..db_instance.runtime_state
};
db_instance.runtime_state = new_runtime;
self.instance_ensure_registered(opctx, &authz_instance, &db_instance)
.await?;

self.instance_request_state(
opctx,
&authz_instance,
&db_instance,
@@ -375,7 +383,7 @@
instance_lookup: &lookup::Instance<'_>,
) -> UpdateResult<db::model::Instance> {
let (.., authz_instance, db_instance) = instance_lookup.fetch().await?;
self.instance_set_runtime(
self.instance_request_state(
opctx,
&authz_instance,
&db_instance,
@@ -397,21 +405,25 @@
fn check_runtime_change_allowed(
&self,
runtime: &nexus::InstanceRuntimeState,
requested: &InstanceStateRequested,
) -> Result<(), Error> {
// Users are allowed to request a start or stop even if the instance is
// already in the desired state (or moving to it), and we will issue a
// request to the SA to make the state change in these cases in case the
// runtime state we saw here was stale. However, users are not allowed
// to change the state of an instance that's migrating, failed or
// destroyed.
// destroyed, except that multiple requests to migrate in are allowed
// for idempotency.
let allowed = match runtime.run_state {
InstanceState::Creating => true,
InstanceState::Starting => true,
InstanceState::Running => true,
InstanceState::Stopping => true,
InstanceState::Stopped => true,
InstanceState::Rebooting => true,
InstanceState::Migrating => false,
InstanceState::Migrating => {
matches!(requested, InstanceStateRequested::MigrationTarget(_))
}
InstanceState::Repairing => false,
InstanceState::Failed => false,
InstanceState::Destroyed => false,
@@ -429,21 +441,43 @@
}
}

/// Modifies the runtime state of the Instance as requested. This generally
/// means booting or halting the Instance.
pub(crate) async fn instance_set_runtime(
pub(crate) async fn instance_request_state(
&self,
opctx: &OpContext,
authz_instance: &authz::Instance,
db_instance: &db::model::Instance,
requested: InstanceStateRequested,
) -> Result<(), Error> {
opctx.authorize(authz::Action::Modify, authz_instance).await?;

self.check_runtime_change_allowed(
&db_instance.runtime().clone().into(),
&requested,
)?;

let sa = self.instance_sled(&db_instance).await?;
let instance_put_result = sa
.instance_put_state(
&db_instance.id(),
&InstancePutStateBody { state: requested },
)
.await
.map(|res| res.into_inner().updated_runtime);

self.handle_instance_put_result(db_instance, instance_put_result)
.await
.map(|_| ())
}

/// Registers the instance with the sled it is slated to run on, creating a
/// record for it in the sled agent's instance manager. This does not start
/// the instance's Propolis or request any change to its state.
pub(crate) async fn instance_ensure_registered(
&self,
opctx: &OpContext,
authz_instance: &authz::Instance,
db_instance: &db::model::Instance,
) -> Result<(), Error> {
opctx.authorize(authz::Action::Modify, authz_instance).await?;

// Gather disk information and turn that into DiskRequests
let disks = self
.db_datastore
@@ -594,26 +628,67 @@ impl super::Nexus {
&db_instance.id(),
&sled_agent_client::types::InstanceEnsureBody {
initial: instance_hardware,
target: requested,
migrate: None,
},
)
.await;
.await
.map(|res| Some(res.into_inner()));

self.handle_instance_put_result(db_instance, instance_put_result)
.await
.map(|_| ())
}

/// Updates an instance's CRDB record based on the result of a call to sled
/// agent that tried to update the instance's state.
///
/// # Parameters
///
/// - `db_instance`: The CRDB instance record observed by the caller before
/// it attempted to update the instance's state.
/// - `result`: The result of the relevant sled agent operation. If this is
/// `Ok`, the payload is the updated instance runtime state returned from
/// sled agent, if there was one.
///
/// # Return value
///
/// - `Ok(true)` if the caller supplied an updated instance record and this
/// routine successfully wrote it to CRDB.
/// - `Ok(false)` if the sled agent call succeeded, but this routine did not
/// update CRDB. This can happen either because sled agent didn't return
/// an updated record or because the updated record was superseded by
/// a state update with a more advanced generation number.
/// - `Err` if the sled agent operation failed or this routine received an
/// error while trying to update CRDB.
async fn handle_instance_put_result(
&self,
db_instance: &db::model::Instance,
result: Result<
Option<sled_agent_client::types::InstanceRuntimeState>,
sled_agent_client::Error<sled_agent_client::types::Error>,
>,
) -> Result<bool, Error> {
slog::debug!(&self.log, "Handling sled agent instance PUT result";
"result" => ?result);

match instance_put_result {
Ok(new_runtime) => {
match result {
Ok(Some(new_runtime)) => {
let new_runtime: nexus::InstanceRuntimeState =
new_runtime.into_inner().into();
new_runtime.into();

self.db_datastore
let update_result = self
.db_datastore
.instance_update_runtime(
&db_instance.id(),
&new_runtime.into(),
)
.await
.map(|_| ())
}
.await;

slog::debug!(&self.log,
"Attempted DB update after instance PUT";
"result" => ?update_result);
update_result
}
Ok(None) => Ok(false),
Err(e) => {
// The sled-agent has told us that it can't do what we
// requested, but does that mean a failure? One example would be
87 changes: 77 additions & 10 deletions nexus/src/app/sagas/instance_create.rs
@@ -114,8 +114,12 @@ declare_saga_actions! {
+ sic_add_network_config
- sic_remove_network_config
}
INSTANCE_ENSURE -> "instance_ensure" {
+ sic_instance_ensure
INSTANCE_ENSURE_REGISTERED -> "instance_ensure_registered" {
+ sic_instance_ensure_registered
- sic_instance_ensure_registered_undo
}
INSTANCE_ENSURE_RUNNING -> "instance_ensure_running" {
+ sic_instance_ensure_running
}
}

@@ -334,7 +338,11 @@ impl NexusSaga for SagaInstanceCreate {
i,
)?;
}
builder.append(instance_ensure_action());

builder.append(instance_ensure_registered_action());
if params.create_params.start {
builder.append(instance_ensure_running_action());
}
Ok(builder.build()?)
}
}
@@ -1233,7 +1241,7 @@ async fn sic_delete_instance_record(
Ok(())
}

async fn sic_instance_ensure(
async fn sic_instance_ensure_registered(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
// TODO-correctness is this idempotent?
@@ -1288,19 +1296,78 @@ async fn sic_instance_ensure(
} else {
osagactx
.nexus()
.instance_set_runtime(
&opctx,
&authz_instance,
&db_instance,
InstanceStateRequested::Running,
)
.instance_ensure_registered(&opctx, &authz_instance, &db_instance)
.await
.map_err(ActionError::action_failed)?;
}

Ok(())
}

async fn sic_instance_ensure_registered_undo(
sagactx: NexusActionContext,
) -> Result<(), anyhow::Error> {
let osagactx = sagactx.user_data();
let params = sagactx.saga_params::<Params>()?;
let datastore = osagactx.datastore();
let instance_id = sagactx.lookup::<Uuid>("instance_id")?;
let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);

let (.., authz_instance, db_instance) = LookupPath::new(&opctx, &datastore)
.instance_id(instance_id)
.fetch()
.await
.map_err(ActionError::action_failed)?;

osagactx
.nexus()
.instance_request_state(
&opctx,
&authz_instance,
&db_instance,
InstanceStateRequested::Destroyed,
)
.await
.map_err(ActionError::action_failed)?;

Ok(())
}

async fn sic_instance_ensure_running(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
let osagactx = sagactx.user_data();
let params = sagactx.saga_params::<Params>()?;
let datastore = osagactx.datastore();
let instance_id = sagactx.lookup::<Uuid>("instance_id")?;
let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);

let (.., authz_instance, db_instance) = LookupPath::new(&opctx, &datastore)
.instance_id(instance_id)
.fetch()
.await
.map_err(ActionError::action_failed)?;

osagactx
.nexus()
.instance_request_state(
&opctx,
&authz_instance,
&db_instance,
InstanceStateRequested::Running,
)
.await
.map_err(ActionError::action_failed)?;

Ok(())
}

#[cfg(test)]
pub mod test {
use crate::{
