Checkpointing work

This commit is contained in:
Ian Roddis
2022-01-12 12:50:46 -04:00
parent 04e95cfcf3
commit 9a5a247f15
21 changed files with 320 additions and 160 deletions

View File

@@ -87,8 +87,7 @@ namespace daggy::executors::task {
// Resolve the remaining futures
std::lock_guard<std::mutex> lock(promiseGuard_);
for (auto &[jobID, job] : runningJobs_) {
job.prom.set_value(
AttemptRecord{.rc = -1, .executorLog = "executor killed"});
job.fut->set(AttemptRecord{.rc = -1, .executorLog = "executor killed"});
}
runningJobs_.clear();
}
@@ -153,8 +152,9 @@ namespace daggy::executors::task {
return newValues;
}
std::future<AttemptRecord> SlurmTaskExecutor::execute(
DAGRunID runID, const std::string &taskName, const Task &task)
TaskFuture SlurmTaskExecutor::execute(DAGRunID runID,
const std::string &taskName,
const Task &task)
{
std::stringstream executorLog;
@@ -247,12 +247,12 @@ namespace daggy::executors::task {
slurm_free_submit_response_response_msg(resp_msg);
std::lock_guard<std::mutex> lock(promiseGuard_);
Job newJob{.prom{},
Job newJob{.fut = std::make_shared<Future<AttemptRecord>>(),
.stdoutFile = stdoutFile,
.stderrFile = stderrFile,
.runID = runID,
.taskName = taskName};
auto fut = newJob.prom.get_future();
auto fut = newJob.fut;
runningJobs_.emplace(jobID, std::move(newJob));
return fut;
@@ -348,7 +348,7 @@ namespace daggy::executors::task {
readAndClean(job.stdoutFile, record.outputLog);
readAndClean(job.stderrFile, record.errorLog);
job.prom.set_value(std::move(record));
job.fut->set(std::move(record));
resolvedJobs.insert(jobID);
}