Checkpointing work

This commit is contained in:
Ian Roddis
2022-01-12 12:50:46 -04:00
parent 04e95cfcf3
commit 9a5a247f15
21 changed files with 320 additions and 160 deletions

View File

@@ -47,7 +47,8 @@ namespace daggy {
ssize_t nRunningTasks_;
ssize_t nErroredTasks_;
std::unordered_map<std::string, std::future<AttemptRecord>> runningTasks_;
std::unordered_map<std::string, daggy::executors::task::TaskFuture>
runningTasks_;
std::unordered_map<std::string, size_t> taskAttemptCounts_;
std::mutex runGuard_;

View File

@@ -9,6 +9,8 @@
#include <variant>
#include <vector>
#include "Future.hpp"
namespace daggy {
// Commands and parameters
using ConfigValue = std::variant<std::string, std::vector<std::string>>;
@@ -72,6 +74,7 @@ namespace daggy {
std::string outputLog; // stdout from command
std::string errorLog; // stderr from command
};
} // namespace daggy
BETTER_ENUMS_DECLARE_STD_HASH(daggy::RunState)

View File

@@ -0,0 +1,113 @@
#pragma once
#include <atomic>
#include <chrono>
#include <cstdint>
#include <exception>
#include <iostream>
#include <optional>
#include <stdexcept>
#include <string>
#include <thread>
#include <utility>
namespace daggy {
  /// Resolution state of a Future.
  enum class FutureState : std::uint8_t
  {
    NOT_READY, ///< Neither a value nor an error has been stored yet.
    OK,        ///< A value was stored with set().
    ERROR,     ///< An error was stored with setException().
  };

  /**
   * Minimal polling future holding a value of type T or an error.
   *
   * The payload (value or error message) is written first and the atomic
   * state is updated afterwards, so a reader that observes a non-NOT_READY
   * state sees the completed payload. The class itself does not serialise
   * concurrent producers.
   * NOTE(review): appears to assume a single producer per future -- confirm
   * against callers.
   *
   * @tparam T Type of the stored value; get() returns it by copy.
   */
  template <class T>
  class Future
  {
  public:
    Future()
      : state_{FutureState::NOT_READY}
      , val_(std::nullopt)
    {
    }

    /// @return The current resolution state.
    FutureState state() const
    {
      return state_.load();
    }

    /**
     * Stores the result and marks the future as OK.
     *
     * @param val Result value; moved into the future.
     * @throws std::runtime_error if a value has already been stored.
     */
    void set(T val)
    {
      if (val_) {
        throw std::runtime_error("Future already has a value");
      }
      val_.emplace(std::move(val));
      // Publish the state only after the value is in place.
      state_ = FutureState::OK;
    }

    /// @return true once a value or an error has been stored.
    bool ready() const
    {
      return state_.load() != FutureState::NOT_READY;
    }

    /**
     * Records a failure and marks the future as ERROR.
     *
     * Only the message is retained: copying through a std::exception
     * reference would slice the object and discard what().
     *
     * @param e Exception whose what() text is kept for get().
     */
    void setException(const std::exception &e)
    {
      errMsg_ = e.what();
      state_ = FutureState::ERROR;
    }

    /**
     * Waits (polling every 100 microseconds) until the future is resolved.
     *
     * @return A copy of the stored value.
     * @throws std::runtime_error carrying the recorded message when the
     *         future was resolved with setException().
     */
    T get()
    {
      while (!ready()) {
        std::this_thread::sleep_for(std::chrono::microseconds(100));
      }
      if (state_.load() == FutureState::ERROR) {
        throw std::runtime_error(errMsg_);
      }
      return *val_;
    }

  private:
    std::atomic<FutureState> state_;
    std::optional<T> val_;
    std::string errMsg_; // what() of the exception given to setException()
  };

  /**
   * Specialisation for operations that produce no value: the future only
   * reports completion or failure.
   */
  template <>
  class Future<void>
  {
  public:
    Future()
      : state_{FutureState::NOT_READY}
    {
    }

    /// @return The current resolution state.
    FutureState state() const
    {
      return state_.load();
    }

    /// @return true once set() or setException() has been called.
    bool ready() const
    {
      return state_.load() != FutureState::NOT_READY;
    }

    /// Marks the future as successfully completed.
    void set()
    {
      state_ = FutureState::OK;
    }

    /**
     * Records a failure and marks the future as ERROR.
     *
     * Only the message is retained: copying through a std::exception
     * reference would slice the object and discard what().
     *
     * @param e Exception whose what() text is kept for get().
     */
    void setException(const std::exception &e)
    {
      errMsg_ = e.what();
      state_ = FutureState::ERROR;
    }

    /**
     * Waits (polling every 100 microseconds) until the future is resolved,
     * matching the behaviour of Future<T>::get().
     *
     * @throws std::runtime_error carrying the recorded message when the
     *         future was resolved with setException().
     */
    void get()
    {
      while (!ready()) {
        std::this_thread::sleep_for(std::chrono::microseconds(100));
      }
      if (state_.load() == FutureState::ERROR) {
        throw std::runtime_error(errMsg_);
      }
    }

  private:
    std::atomic<FutureState> state_;
    std::string errMsg_; // what() of the exception given to setException()
  };
} // namespace daggy

View File

@@ -3,17 +3,17 @@
#include <atomic>
#include <condition_variable>
#include <functional>
#include <future>
#include <iostream>
#include <list>
#include <memory>
#include <queue>
#include <thread>
#include <vector>
#include "Future.hpp"
using namespace std::chrono_literals;
namespace daggy {
class ThreadPool
{
public:
@@ -65,7 +65,7 @@ namespace daggy {
for (size_t i = 0; i < nWorkers; ++i)
workers_.emplace_back([&] {
std::packaged_task<void()> task;
std::function<void()> task;
while (true) {
{
std::unique_lock<std::mutex> lock(mtx_);
@@ -88,15 +88,30 @@ namespace daggy {
{
if (drain_)
throw std::runtime_error("Unable to add task to draining pool");
using return_type = std::invoke_result_t<F, Args...>;
std::packaged_task<return_type()> task(
std::bind(std::forward<F>(f), std::forward<Args>(args)...));
auto callable =
std::bind(std::forward<F>(f), std::forward<Args>(args)...);
auto res = std::make_shared<Future<return_type>>();
std::future<return_type> res = task.get_future();
{
std::lock_guard<std::mutex> guard(mtx_);
tasks_.emplace(std::move(task));
tasks_.emplace([res, task = std::move(callable)]() -> void {
try {
if constexpr ((std::is_same<return_type, void>::value)) {
task();
res->set();
}
else {
return_type val = task();
res->set(val);
}
}
catch (std::exception &e) {
res->setException(e);
}
});
}
cv_.notify_one();
return res;
@@ -117,7 +132,7 @@ namespace daggy {
// need to keep track of threads, so we can join them
std::vector<std::thread> workers_;
// the task queue
std::queue<std::packaged_task<void()>> tasks_;
std::queue<std::function<void()>> tasks_;
// synchronization
std::mutex mtx_;

View File

@@ -45,9 +45,8 @@ namespace daggy::executors::task {
const ConfigValues &job, const ConfigValues &expansionValues) override;
// Runs the task
std::future<AttemptRecord> execute(DAGRunID runID,
const std::string &taskName,
const Task &task) override;
TaskFuture execute(DAGRunID runID, const std::string &taskName,
const Task &task) override;
bool stop(DAGRunID runID, const std::string &taskName) override;
@@ -60,7 +59,7 @@ namespace daggy::executors::task {
struct RunningTask
{
std::promise<AttemptRecord> prom;
TaskFuture fut;
DAGRunID runID;
std::string taskName;
std::string runnerURL;

View File

@@ -25,9 +25,8 @@ namespace daggy::executors::task {
const ConfigValues &job, const ConfigValues &expansionValues) override;
// Runs the task
std::future<AttemptRecord> execute(DAGRunID runID,
const std::string &taskName,
const Task &task) override;
TaskFuture execute(DAGRunID runID, const std::string &taskName,
const Task &task) override;
bool stop(DAGRunID runID, const std::string &taskName) override;

View File

@@ -16,12 +16,11 @@ namespace daggy::executors::task {
const ConfigValues &job, const ConfigValues &expansionValues) override;
// Runs the task
std::future<AttemptRecord> execute(DAGRunID runID,
const std::string &taskName,
const Task &task) override;
TaskFuture execute(DAGRunID runID, const std::string &taskName,
const Task &task) override;
bool stop(DAGRunID runID, const std::string &taskName) override;
std::string description() const;
std::string description() const override;
};
} // namespace daggy::executors::task

View File

@@ -19,9 +19,8 @@ namespace daggy::executors::task {
const ConfigValues &job, const ConfigValues &expansionValues) override;
// Runs the task
std::future<AttemptRecord> execute(DAGRunID runID,
const std::string &taskName,
const Task &task) override;
TaskFuture execute(DAGRunID runID, const std::string &taskName,
const Task &task) override;
bool stop(DAGRunID runID, const std::string &taskName) override;
@@ -30,7 +29,7 @@ namespace daggy::executors::task {
private:
struct Job
{
std::promise<AttemptRecord> prom;
TaskFuture fut;
std::string stdoutFile;
std::string stderrFile;
DAGRunID runID;

View File

@@ -13,6 +13,8 @@
*/
namespace daggy::executors::task {
using TaskFuture = std::shared_ptr<Future<AttemptRecord>>;
class TaskExecutor
{
public:
@@ -27,9 +29,8 @@ namespace daggy::executors::task {
const ConfigValues &job, const ConfigValues &expansionValues) = 0;
// Blocking execution of a task
virtual std::future<AttemptRecord> execute(DAGRunID runID,
const std::string &taskName,
const Task &task) = 0;
virtual TaskFuture execute(DAGRunID runID, const std::string &taskName,
const Task &task) = 0;
// Kill a currently executing task. This will resolve the future.
virtual bool stop(DAGRunID runID, const std::string &taskName) = 0;