mesos-reviews mailing list archives

From GitBox <...@apache.org>
Subject [GitHub] [mesos] asekretenko commented on a change in pull request #380: Fixed a bug preventing agent recovery when executor GC is interrupted.
Date Tue, 02 Mar 2021 14:40:10 GMT

asekretenko commented on a change in pull request #380:
URL: https://github.com/apache/mesos/pull/380#discussion_r585615415



##########
File path: src/tests/slave_recovery_tests.cpp
##########
@@ -3301,6 +3301,139 @@ TYPED_TEST(SlaveRecoveryTest, GCExecutor)
 }
 
 
+// While the slave is down, we remove the latest run directory but
+// not the "latest" symlink, to simulate a slave that died in the
+// middle of gc'ing the run meta directory.
+TYPED_TEST(SlaveRecoveryTest, ExecutorDanglingLatestSymlink)
+{
+  Try<Owned<cluster::Master>> master = this->StartMaster();
+  ASSERT_SOME(master);
+
+  slave::Flags flags = this->CreateSlaveFlags();
+  flags.strict = true;
+
+  Fetcher fetcher(flags);
+
+  Try<TypeParam*> _containerizer = TypeParam::create(flags, true, &fetcher);
+  ASSERT_SOME(_containerizer);
+  Owned<slave::Containerizer> containerizer(_containerizer.get());
+
+  Owned<MasterDetector> detector = master.get()->createDetector();
+
+  Try<Owned<cluster::Slave>> slave =
+    this->StartSlave(detector.get(), containerizer.get(), flags);
+  ASSERT_SOME(slave);
+
+  // Enable checkpointing for the framework.
+  FrameworkInfo frameworkInfo = DEFAULT_FRAMEWORK_INFO;
+  frameworkInfo.set_checkpoint(true);
+
+  MockScheduler sched;
+  MesosSchedulerDriver driver(
+      &sched, frameworkInfo, master.get()->pid, DEFAULT_CREDENTIAL);
+
+  EXPECT_CALL(sched, registered(_, _, _));
+
+  Future<vector<Offer>> offers1;
+  EXPECT_CALL(sched, resourceOffers(_, _))
+    .WillOnce(FutureArg<1>(&offers1));
+
+  driver.start();
+
+  AWAIT_READY(offers1);
+  ASSERT_FALSE(offers1->empty());
+
+  TaskInfo task = createTask(offers1.get()[0], SLEEP_COMMAND(1000));
+
+  // Capture the slave and framework ids.
+  SlaveID slaveId = offers1.get()[0].slave_id();
+  FrameworkID frameworkId = offers1.get()[0].framework_id();
+
+  Future<RegisterExecutorMessage> registerExecutor =
+    FUTURE_PROTOBUF(RegisterExecutorMessage(), _, _);
+
+  Future<Nothing> status;
+  EXPECT_CALL(sched, statusUpdate(_, _))
+    .WillOnce(FutureSatisfy(&status))
+    .WillRepeatedly(Return()); // Ignore subsequent updates.
+
+  driver.launchTasks(offers1.get()[0].id(), {task});
+
+  // Capture the executor id.
+  AWAIT_READY(registerExecutor);
+  ExecutorID executorId = registerExecutor->executor_id();
+
+  // Wait for TASK_RUNNING update.
+  AWAIT_READY(status);
+
+  // Terminate the slave.
+  slave.get()->terminate();
+
+  // The "latest" symlink should exist.
+  const string latestPath = paths::getExecutorLatestRunPath(
+      paths::getMetaRootDir(flags.work_dir),
+      slaveId,
+      frameworkId,
+      executorId);
+  ASSERT_TRUE(os::exists(latestPath));
+  // And should point to the latest run.
+  const Result<string> path = os::realpath(latestPath);
+  ASSERT_SOME(path);
+  // Delete the run directory it points to - "latest" will now dangle.
+  ASSERT_SOME(os::rmdir(path.get(), true));
+
+  // Recover the state.
+  Result<slave::state::State> recoverState =
+    slave::state::recover(paths::getMetaRootDir(flags.work_dir), true);
+
+  ASSERT_SOME(recoverState);
+  ASSERT_SOME(recoverState->slave);
+
+  // The executor should be recovered without any run.
+  slave::state::FrameworkState frameworkState =
+    recoverState->slave->frameworks.at(frameworkId);
+  ASSERT_EQ(1u, frameworkState.executors.size());
+  slave::state::ExecutorState& executorState =
+    frameworkState.executors.at(executorId);
+  ASSERT_NONE(executorState.latest);
+  ASSERT_TRUE(executorState.runs.empty());
+
+  Future<ReregisterSlaveMessage> reregisterSlaveMessage =
+    FUTURE_PROTOBUF(ReregisterSlaveMessage(), _, _);
+
+  Future<vector<Offer>> offers2;
+  EXPECT_CALL(sched, resourceOffers(&driver, _))
+    .WillOnce(FutureArg<1>(&offers2))
+    .WillRepeatedly(Return()); // Ignore subsequent offers.
+
+  slave = this->StartSlave(detector.get(), containerizer.get(), flags);
+  ASSERT_SOME(slave);
+
+  AWAIT_READY(reregisterSlaveMessage);
+
+  // Make sure all slave resources are reoffered.
+  AWAIT_READY(offers2);
+  EXPECT_EQ(Resources(offers1.get()[0].resources()),
+            Resources(offers2.get()[0].resources()));

Review comment:
Shouldn't detecting agent re-registration via `ReregisterSlaveMessage` be
sufficient for the purpose of this test? If this check is necessary, a
comment explaining why would be beneficial.
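
A minimal sketch of that alternative, assuming only the fixtures already set
up earlier in this test:

    // Sketch: treat successful re-registration as the recovery signal.
    // If the agent could not recover from the dangling "latest" symlink,
    // it would never get far enough to re-register with the master.
    Future<ReregisterSlaveMessage> reregisterSlaveMessage =
      FUTURE_PROTOBUF(ReregisterSlaveMessage(), _, _);

    slave = this->StartSlave(detector.get(), containerizer.get(), flags);
    ASSERT_SOME(slave);

    AWAIT_READY(reregisterSlaveMessage);

The comparison of the re-offered resources against the first offer would then
either be dropped, or kept alongside a comment explaining which additional
failure mode it guards against.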




