Large re-organization to split daggyd away from the core libdaggy.
This paves the way for implementing daggys and other utilities.

Squashed commit of the following:

commit 1f77239ab3c9e44d190eef94531a39501c8c4dfe
Author: Ian Roddis <gitlab@ie2r.com>
Date: Mon Oct 18 16:25:02 2021 -0300

    Adding README, stdout support for daggyd logging

commit c2c237224e84a3be68aaa597ce98af1365e74a13
Author: Ian Roddis <gitlab@ie2r.com>
Date: Mon Oct 18 16:10:29 2021 -0300

    removing old daggyd

commit cfea2baf61ca10c535801c5a391d2d525a1a2d04
Author: Ian Roddis <gitlab@ie2r.com>
Date: Mon Oct 18 16:10:09 2021 -0300

    Moving tests into their sub-project folders

commit e41ca42069bea1db16dd76b6684a3f692fef6b15
Author: Ian Roddis <gitlab@ie2r.com>
Date: Mon Oct 18 15:57:40 2021 -0300

    Splitting out daggyd from libdaggy

commit be97b146c1d2446f5c03cb78707e921f18c60bd8
Author: Ian Roddis <gitlab@ie2r.com>
Date: Mon Oct 18 15:56:55 2021 -0300

    Splitting out daggyd from libdaggy

commit cb61e140e9d6d8832d61fb7037fd4c0ff6edad00
Author: Ian Roddis <gitlab@ie2r.com>
Date: Mon Oct 18 15:49:47 2021 -0300

    moving daggy to libdaggy
This commit is contained in:
213
libdaggy/src/DAGRunner.cpp
Normal file
213
libdaggy/src/DAGRunner.cpp
Normal file
@@ -0,0 +1,213 @@
|
||||
#include <chrono>
#include <daggy/DAGRunner.hpp>
#include <mutex>
#include <stdexcept>
#include <utility>
|
||||
|
||||
namespace daggy {
|
||||
/// Builds a runner for a single DAG run.
///
/// @param runID      Identifier of the run; forwarded to every logger and
///                   executor call made by this runner.
/// @param executor   Task executor used to launch and stop tasks; stored in
///                   executor_.
/// @param logger     Run logger that receives task / run state changes;
///                   stored in logger_.
/// @param dag        The task DAG to execute. Taken by value and moved into
///                   dag_, so the caller decides whether to copy or move.
/// @param taskParams Parameters consulted when expanding generator output;
///                   stored in taskParams_.
///
/// The runner is created with running_ and kill_ both set to true and with
/// both task counters at zero; run() resets kill_ before doing any work.
DAGRunner::DAGRunner(DAGRunID runID, executors::task::TaskExecutor &executor,
                     loggers::dag_run::DAGRunLogger &logger, TaskDAG dag,
                     const TaskParameters &taskParams)
  : runID_(runID)
  , executor_(executor)
  , logger_(logger)
  // `dag` is a by-value sink parameter: move it rather than copying the
  // whole graph a second time.
  , dag_(std::move(dag))
  , taskParams_(taskParams)
  , running_(true)
  , kill_(true)
  , nRunningTasks_(0)
  , nErroredTasks_(0)
{
}
|
||||
|
||||
/// Acquires runGuard_ and releases it again straight away, so destruction
/// waits for whichever thread currently holds the run lock before the
/// members are torn down.
DAGRunner::~DAGRunner()
{
  std::scoped_lock<std::mutex> barrier{runGuard_};
}
|
||||
|
||||
/// Drives the DAG until every vertex has been visited or the run is
/// stopped / errors out.
///
/// Each polling cycle (under runGuard_):
///   1. if a stop with kill was requested, asks the executor to stop all
///      tracked tasks;
///   2. collects finished task futures;
///   3. queues any tasks the DAG reports as ready;
///   4. logs KILLED and leaves the loop when the run was stopped and no
///      non-errored task remains in flight;
///   5. logs ERRORED and leaves the loop when every in-flight task has
///      errored.
/// The lock is released while sleeping 250ms between cycles.
///
/// @return A copy of the DAG in its state at the end of the run.
TaskDAG DAGRunner::run()
{
  kill_ = false;
  running_ = true;
  logger_.updateDAGRunState(runID_, RunState::RUNNING);

  // Reads the DAG's completion flag while holding the run lock.
  const auto dagFinished = [this]() {
    std::lock_guard<std::mutex> lock(runGuard_);
    return dag_.allVisited();
  };

  for (bool finished = dagFinished(); !finished; finished = dagFinished()) {
    {
      std::lock_guard<std::mutex> runLock(runGuard_);

      const bool killRequested = !running_ and kill_;
      if (killRequested) {
        killRunning();
      }

      collectFinished();
      queuePending();

      // Stopped, and nothing but errored tasks is still counted as running.
      if (!running_ and (nRunningTasks_ - nErroredTasks_ <= 0)) {
        logger_.updateDAGRunState(runID_, RunState::KILLED);
        break;
      }

      // Every task still counted as running has errored: no progress left.
      if (nRunningTasks_ > 0 and nErroredTasks_ == nRunningTasks_) {
        logger_.updateDAGRunState(runID_, RunState::ERRORED);
        break;
      }
    }

    // Sleep outside the lock so stop() / other callers can get in.
    std::this_thread::sleep_for(250ms);
  }

  if (dag_.allVisited()) {
    logger_.updateDAGRunState(runID_, RunState::COMPLETED);
  }

  running_ = false;
  return dag_;
}
|
||||
|
||||
void DAGRunner::resetRunning()
|
||||
{
|
||||
if (running_)
|
||||
throw std::runtime_error("Unable to reset while DAG is running.");
|
||||
|
||||
std::lock_guard<std::mutex> lock(runGuard_);
|
||||
nRunningTasks_ = 0;
|
||||
nErroredTasks_ = 0;
|
||||
runningTasks_.clear();
|
||||
taskAttemptCounts_.clear();
|
||||
dag_.resetRunning();
|
||||
}
|
||||
|
||||
void DAGRunner::killRunning()
|
||||
{
|
||||
for (const auto &[taskName, _] : runningTasks_) {
|
||||
executor_.stop(runID_, taskName);
|
||||
}
|
||||
}
|
||||
|
||||
void DAGRunner::queuePending()
|
||||
{
|
||||
if (!running_)
|
||||
return;
|
||||
|
||||
// Check for any completed tasks
|
||||
// Add all remaining tasks in a task queue to avoid dominating the thread
|
||||
// pool
|
||||
auto t = dag_.visitNext();
|
||||
while (t.has_value()) {
|
||||
// Schedule the task to run
|
||||
auto &taskName = t.value().first;
|
||||
auto &task = t.value().second;
|
||||
taskAttemptCounts_[taskName] = 1;
|
||||
|
||||
logger_.updateTaskState(runID_, taskName, RunState::RUNNING);
|
||||
runningTasks_.emplace(taskName,
|
||||
executor_.execute(runID_, taskName, task));
|
||||
++nRunningTasks_;
|
||||
|
||||
auto nextTask = dag_.visitNext();
|
||||
if (not nextTask.has_value())
|
||||
break;
|
||||
t.emplace(nextTask.value());
|
||||
}
|
||||
}
|
||||
|
||||
void DAGRunner::collectFinished()
|
||||
{
|
||||
for (auto &[taskName, fut] : runningTasks_) {
|
||||
if (fut.valid() and fut.wait_for(1ms) == std::future_status::ready) {
|
||||
auto attempt = fut.get();
|
||||
logger_.logTaskAttempt(runID_, taskName, attempt);
|
||||
|
||||
// Not a reference, since adding tasks will invalidate references
|
||||
auto vert = dag_.getVertex(taskName);
|
||||
auto &task = vert.data;
|
||||
if (attempt.rc == 0) {
|
||||
logger_.updateTaskState(runID_, taskName, RunState::COMPLETED);
|
||||
if (task.isGenerator) {
|
||||
// Parse the output and update the DAGs
|
||||
try {
|
||||
auto parsedTasks =
|
||||
tasksFromJSON(attempt.outputLog, taskParams_.jobDefaults);
|
||||
auto newTasks =
|
||||
expandTaskSet(parsedTasks, executor_, taskParams_.variables);
|
||||
updateDAGFromTasks(dag_, newTasks);
|
||||
|
||||
// Add in dependencies from current task to new tasks
|
||||
for (const auto &[ntName, ntTask] : newTasks) {
|
||||
logger_.addTask(runID_, ntName, ntTask);
|
||||
task.children.insert(ntName);
|
||||
}
|
||||
|
||||
// Efficiently add new edges from generator task
|
||||
// to children
|
||||
std::unordered_set<std::string> baseNames;
|
||||
for (const auto &[k, v] : parsedTasks) {
|
||||
baseNames.insert(v.definedName);
|
||||
}
|
||||
dag_.addEdgeIf(taskName, [&](const auto &v) {
|
||||
return baseNames.count(v.data.definedName) > 0;
|
||||
});
|
||||
|
||||
logger_.updateTask(runID_, taskName, task);
|
||||
}
|
||||
catch (std::exception &e) {
|
||||
logger_.logTaskAttempt(
|
||||
runID_, taskName,
|
||||
AttemptRecord{
|
||||
.executorLog =
|
||||
std::string{"Failed to parse JSON output: "} +
|
||||
e.what()});
|
||||
logger_.updateTaskState(runID_, taskName, RunState::ERRORED);
|
||||
++nErroredTasks_;
|
||||
}
|
||||
}
|
||||
dag_.completeVisit(taskName);
|
||||
--nRunningTasks_;
|
||||
}
|
||||
else {
|
||||
// RC isn't 0
|
||||
if (taskAttemptCounts_[taskName] <= task.maxRetries) {
|
||||
logger_.updateTaskState(runID_, taskName, RunState::RETRY);
|
||||
runningTasks_[taskName] = executor_.execute(runID_, taskName, task);
|
||||
++taskAttemptCounts_[taskName];
|
||||
}
|
||||
else {
|
||||
if (logger_.getTaskState(runID_, taskName) == +RunState::RUNNING or
|
||||
logger_.getTaskState(runID_, taskName) == +RunState::RETRY) {
|
||||
logger_.updateTaskState(runID_, taskName, RunState::ERRORED);
|
||||
++nErroredTasks_;
|
||||
}
|
||||
else {
|
||||
// Task was killed
|
||||
--nRunningTasks_;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void DAGRunner::stop(bool kill, bool blocking)
|
||||
{
|
||||
kill_ = kill;
|
||||
running_ = false;
|
||||
|
||||
if (blocking) {
|
||||
while (true) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(runGuard_);
|
||||
if (nRunningTasks_ - nErroredTasks_ == 0)
|
||||
break;
|
||||
}
|
||||
std::this_thread::sleep_for(250ms);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace daggy
|
||||
Reference in New Issue
Block a user