Refactor RunState, fix a logic error in deciding when to end errored DAG runs, and add convenience functions to retry failed DAGs.

This commit is contained in:
Ian Roddis
2021-08-20 21:44:12 -03:00
parent dc8ea4c369
commit 2083b1c3f1
8 changed files with 124 additions and 42 deletions

4
.gitignore vendored
View File

@@ -1,4 +1,4 @@
build build
.cache .cache
cmake-build-* cmake-build-debug/
.idea .idea

View File

@@ -9,6 +9,8 @@
#include <functional> #include <functional>
#include <optional> #include <optional>
#include "Defines.hpp"
/* /*
The DAG structure in daggy is just to ensure that tasks are run The DAG structure in daggy is just to ensure that tasks are run
in the correct dependent order. in the correct dependent order.
@@ -16,14 +18,8 @@
namespace daggy { namespace daggy {
enum class VertexState : uint32_t {
UNVISITED = 0,
VISITING,
VISITED
};
struct Vertex { struct Vertex {
VertexState state; RunState state;
uint32_t depCount; uint32_t depCount;
std::unordered_set<size_t> children; std::unordered_set<size_t> children;
}; };
@@ -51,10 +47,15 @@ namespace daggy {
bool empty() const; bool empty() const;
// Traversal // Reset the DAG to completely unvisited
void reset(); void reset();
VertexState getVertexState(const size_t id) const; // Reset any vertex with RUNNING state to QUEUED
void resetRunning();
RunState getVertexState(const size_t id) const;
void setVertexState(const size_t id, RunState state);
bool allVisited() const; bool allVisited() const;

View File

@@ -21,4 +21,12 @@ namespace daggy {
using DAGRunID = size_t; using DAGRunID = size_t;
using TaskID = size_t; using TaskID = size_t;
// State of a task vertex or of a whole DAG run.
// QUEUED is zero; every other value occupies its own bit.
enum class RunState : uint32_t {
    QUEUED    = 0x00,  // not started yet
    RUNNING   = 0x01,  // picked up for execution
    RETRY     = 0x02,  // an attempt failed and the task is being tried again
    ERRORED   = 0x04,  // finished without success
    KILLED    = 0x08,  // NOTE(review): presumably stopped externally — confirm against callers
    COMPLETED = 0x10   // finished successfully
};
} }

View File

@@ -18,7 +18,8 @@ namespace daggy {
std::vector<Command> expandCommands(const std::vector<std::string> &command, const ParameterValues &parameters); std::vector<Command> expandCommands(const std::vector<std::string> &command, const ParameterValues &parameters);
DAG buildDAGFromTasks(const std::vector<Task> &tasks); DAG buildDAGFromTasks(const std::vector<Task> &tasks,
const std::vector<loggers::dag_run::TaskUpdateRecord> &updates = {});
// Blocking call // Blocking call
std::vector<AttemptRecord> std::vector<AttemptRecord>

View File

@@ -3,15 +3,6 @@
namespace daggy { namespace daggy {
namespace loggers { namespace loggers {
namespace dag_run { namespace dag_run {
enum class RunState : uint32_t {
QUEUED = 0,
RUNNING = 1,
RETRY = 1 << 1,
ERRORED = 1 << 2,
KILLED = 1 << 3,
COMPLETED = 1 << 4
};
struct TaskUpdateRecord { struct TaskUpdateRecord {
TimePoint time; TimePoint time;
TaskID taskID; TaskID taskID;
@@ -26,11 +17,11 @@ namespace daggy {
// Pretty heavy weight, but // Pretty heavy weight, but
struct DAGRunRecord { struct DAGRunRecord {
std::string name; std::string name;
std::vector <Task> tasks; std::vector<Task> tasks;
std::vector <RunState> runStates; std::vector<RunState> runStates;
std::vector <std::vector<AttemptRecord>> taskAttempts; std::vector<std::vector<AttemptRecord>> taskAttempts;
std::vector <TaskUpdateRecord> taskStateChanges; std::vector<TaskUpdateRecord> taskStateChanges;
std::vector <DAGUpdateRecord> dagStateChanges; std::vector<DAGUpdateRecord> dagStateChanges;
}; };
struct DAGRunSummary { struct DAGRunSummary {
@@ -39,7 +30,7 @@ namespace daggy {
RunState runState; RunState runState;
TimePoint startTime; TimePoint startTime;
TimePoint lastUpdate; TimePoint lastUpdate;
std::unordered_map <RunState, size_t> taskStateCounts; std::unordered_map<RunState, size_t> taskStateCounts;
}; };
} }
} }

View File

@@ -7,7 +7,7 @@ namespace daggy {
bool DAG::empty() const { return vertices_.empty(); } bool DAG::empty() const { return vertices_.empty(); }
size_t DAG::addVertex() { size_t DAG::addVertex() {
vertices_.push_back(Vertex{.state = VertexState::UNVISITED, .depCount = 0}); vertices_.push_back(Vertex{.state = RunState::QUEUED, .depCount = 0});
return vertices_.size() - 1; return vertices_.size() - 1;
} }
@@ -39,7 +39,7 @@ namespace daggy {
void DAG::reset() { void DAG::reset() {
// Reset the state of all vertices // Reset the state of all vertices
for (auto &v : vertices_) { for (auto &v : vertices_) {
v.state = VertexState::UNVISITED; v.state = RunState::QUEUED;
v.depCount = 0; v.depCount = 0;
} }
@@ -51,9 +51,20 @@ namespace daggy {
} }
} }
// Put every vertex that is currently RUNNING back to QUEUED.
// Vertices in any other state are left untouched.
void DAG::resetRunning() {
    for (auto &vertex : vertices_) {
        if (vertex.state == RunState::RUNNING) {
            vertex.state = RunState::QUEUED;
        }
    }
}
// Overwrite the state of the vertex at index `id` with `state`.
// `id` indexes vertices_ directly; the caller must pass a valid vertex index.
void DAG::setVertexState(const size_t id, RunState state) {
    auto &vertex = vertices_[id];
    vertex.state = state;
}
bool DAG::allVisited() const { bool DAG::allVisited() const {
for (const auto &v : vertices_) { for (const auto &v : vertices_) {
if (v.state != VertexState::VISITED) return false; if (v.state != RunState::COMPLETED) return false;
} }
return true; return true;
} }
@@ -62,9 +73,9 @@ namespace daggy {
for (size_t i = 0; i < vertices_.size(); ++i) { for (size_t i = 0; i < vertices_.size(); ++i) {
auto &v = vertices_[i]; auto &v = vertices_[i];
if (v.state != VertexState::UNVISITED) continue; if (v.state != RunState::QUEUED) continue;
if (v.depCount != 0) continue; if (v.depCount != 0) continue;
v.state = VertexState::VISITING; v.state = RunState::RUNNING;
return i; return i;
} }
return {}; return {};
@@ -72,7 +83,7 @@ namespace daggy {
void DAG::completeVisit(const size_t id) { void DAG::completeVisit(const size_t id) {
auto &v = vertices_[id]; auto &v = vertices_[id];
v.state = VertexState::VISITED; v.state = RunState::COMPLETED;
for (auto c : v.children) { for (auto c : v.children) {
--vertices_[c].depCount; --vertices_[c].depCount;
} }

View File

@@ -52,7 +52,8 @@ namespace daggy {
return commands; return commands;
} }
DAG buildDAGFromTasks(const std::vector<Task> &tasks) { DAG buildDAGFromTasks(const std::vector<Task> &tasks,
const std::vector<loggers::dag_run::TaskUpdateRecord> &updates) {
DAG dag; DAG dag;
std::unordered_map<std::string, size_t> taskIDs; std::unordered_map<std::string, size_t> taskIDs;
@@ -68,6 +69,20 @@ namespace daggy {
} }
} }
dag.reset(); dag.reset();
// Replay any updates
for (const auto &update : updates) {
switch (update.newState) {
case RunState::RUNNING:
case RunState::RETRY:
case RunState::ERRORED:
case RunState::KILLED:
dag.setVertexState(update.taskID, RunState::RUNNING);
dag.setVertexState(update.taskID, RunState::COMPLETED);
break;
}
}
return dag; return dag;
} }
@@ -77,13 +92,13 @@ namespace daggy {
executors::task::TaskExecutor &executor, executors::task::TaskExecutor &executor,
loggers::dag_run::DAGRunLogger &logger) { loggers::dag_run::DAGRunLogger &logger) {
std::vector<AttemptRecord> attempts; std::vector<AttemptRecord> attempts;
logger.updateTaskState(runID, task.name, loggers::dag_run::RunState::RUNNING); logger.updateTaskState(runID, task.name, RunState::RUNNING);
while (attempts.size() < task.maxRetries + 1) { while (attempts.size() < task.maxRetries + 1) {
attempts.push_back(executor.runCommand(task.command)); attempts.push_back(executor.runCommand(task.command));
logger.logTaskAttempt(runID, task.name, attempts.back()); logger.logTaskAttempt(runID, task.name, attempts.back());
if (attempts.back().rc == 0) break; if (attempts.back().rc == 0) break;
logger.updateTaskState(runID, task.name, loggers::dag_run::RunState::RETRY); logger.updateTaskState(runID, task.name, RunState::RETRY);
} }
return attempts; return attempts;
} }
@@ -93,7 +108,7 @@ namespace daggy {
executors::task::TaskExecutor &executor, executors::task::TaskExecutor &executor,
loggers::dag_run::DAGRunLogger &logger, loggers::dag_run::DAGRunLogger &logger,
DAG dag) { DAG dag) {
logger.updateDAGRunState(runID, loggers::dag_run::RunState::RUNNING); logger.updateDAGRunState(runID, RunState::RUNNING);
struct TaskState { struct TaskState {
size_t tid; size_t tid;
@@ -103,6 +118,9 @@ namespace daggy {
std::vector<TaskState> taskStates; std::vector<TaskState> taskStates;
// TODO Handle case where everything is wedged due to errors
size_t running = 0;
size_t errored = 0;
while (!dag.allVisited()) { while (!dag.allVisited()) {
// Check for any completed tasks // Check for any completed tasks
for (auto &taskState : taskStates) { for (auto &taskState : taskStates) {
@@ -112,16 +130,18 @@ namespace daggy {
auto attemptRecords = taskState.fut.get(); auto attemptRecords = taskState.fut.get();
const auto &taskName = tasks[taskState.tid].name; const auto &taskName = tasks[taskState.tid].name;
if (attemptRecords.empty()) { if (attemptRecords.empty()) {
logger.updateTaskState(runID, taskName, loggers::dag_run::RunState::ERRORED); logger.updateTaskState(runID, taskName, RunState::ERRORED);
continue; ++errored;
} }
if (attemptRecords.back().rc == 0) { if (attemptRecords.back().rc == 0) {
logger.updateTaskState(runID, taskName, loggers::dag_run::RunState::COMPLETED); logger.updateTaskState(runID, taskName, RunState::COMPLETED);
dag.completeVisit(taskState.tid); dag.completeVisit(taskState.tid);
taskState.complete = true; --running;
} else { } else {
logger.updateTaskState(runID, taskName, loggers::dag_run::RunState::ERRORED); logger.updateTaskState(runID, taskName, RunState::ERRORED);
++errored;
} }
taskState.complete = true;
} }
} }
@@ -139,6 +159,7 @@ namespace daggy {
.complete = false .complete = false
}; };
taskStates.push_back(std::move(tsk)); taskStates.push_back(std::move(tsk));
++running;
auto nextTask = dag.visitNext(); auto nextTask = dag.visitNext();
if (not nextTask.has_value()) break; if (not nextTask.has_value()) break;
@@ -147,6 +168,10 @@ namespace daggy {
if (!tq->empty()) { if (!tq->empty()) {
executor.threadPool.addTasks(tq); executor.threadPool.addTasks(tq);
} }
if (running > 0 and errored == running) {
logger.updateDAGRunState(runID, RunState::ERRORED);
break;
}
std::this_thread::sleep_for(250ms); std::this_thread::sleep_for(250ms);
} }
return dag; return dag;

View File

@@ -88,4 +88,49 @@ TEST_CASE("DAG Runner", "[utilities_dag_runner]") {
REQUIRE(attempts.front().rc == 0); REQUIRE(attempts.front().rc == 0);
} }
} }
// Runs a three-task DAG (A -> C, B -> C) in which B and C touch files under
// /tmp/noexist, a directory that is removed before the first run. After the
// first run stops short, the directory is created and the same DAG is run again.
SECTION("Recovery from Error") {
// Removes the file and directory this section may leave behind.
auto cleanup = []() {
// Cleanup
std::vector<fs::path> paths{"/tmp/rec_error_A", "/tmp/noexist" };
for (const auto & pth : paths) {
if (fs::exists(pth)) fs::remove_all(pth);
}
};
cleanup();
// daggy::loggers::dag_run::OStreamLogger logger(std::cout);
// A writes under /tmp directly; B and C write under the missing /tmp/noexist directory.
std::string goodPrefix = "/tmp/rec_error_";
std::string badPrefix = "/tmp/noexist/rec_error_";
// Task list: A and B both name C as a child.
std::string taskJSON = R"([{"name": "A", "command": ["/usr/bin/touch", ")"
+ goodPrefix + R"(A"], "children": ["C"]}, {"name": "B", "command": ["/usr/bin/touch", ")"
+ badPrefix + R"(B"], "children": ["C"]}, {"name": "C", "command": ["/usr/bin/touch", ")"
+ badPrefix + R"(C"]}])";
auto tasks = daggy::tasksFromJSON(taskJSON);
auto dag = daggy::buildDAGFromTasks(tasks);
auto runID = logger.startDAGRun("test_run", tasks);
// First pass: expected to stop before every vertex is COMPLETED.
auto tryDAG = daggy::runDAG(runID, tasks, ex, logger, dag);
REQUIRE(!tryDAG.allVisited());
// Create the missing dir, then continue to run the DAG
fs::create_directory("/tmp/noexist");
// Move vertices left in RUNNING by the first pass back to QUEUED so they are picked up again.
tryDAG.resetRunning();
// Second pass reuses the same runID, so attempts from both passes land in one record.
auto endDAG = daggy::runDAG(runID, tasks, ex, logger, tryDAG);
REQUIRE(endDAG.allVisited());
// Get the DAG Run Attempts
auto record = logger.getDAGRun(runID);
REQUIRE(record.taskAttempts[0].size() == 1); // A ran fine
REQUIRE(record.taskAttempts[1].size() == 2); // B: one attempt per pass (presumably failed first, succeeded second — assumes no automatic retries; confirm)
REQUIRE(record.taskAttempts[2].size() == 1); // C has a single attempt (presumably from the second pass only, since B had not completed in the first — confirm)
cleanup();
}
} }