Adding a No-op task executor for testing

Fixing DFS implementation of DAG validation to be much faster
Adding in additional tests to ensure the run order of expanded tasks is preserved
Adding additional compile-time checks, resolving issues that came up as a result
This commit is contained in:
Ian Roddis
2021-09-20 19:05:56 -03:00
parent 2daaa83d82
commit 39d5ae08be
14 changed files with 187 additions and 113 deletions

View File

@@ -3,6 +3,7 @@ project(overall)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)
set(CMAKE_EXPORT_COMPILE_COMMANDS True)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror")
set(THIRD_PARTY_DIR ${CMAKE_BINARY_DIR}/third_party)

View File

@@ -9,6 +9,7 @@
#include <functional>
#include <optional>
#include <sstream>
#include <queue>
#include "Defines.hpp"
@@ -35,7 +36,7 @@ namespace daggy {
// Vertices
void addVertex(K id, V data);
const std::vector<Vertex<K, V>> &getVertices();
std::unordered_set<K> getVertices() const;
// Edges
void addEdge(const K &src, const K &dst);
@@ -75,8 +76,6 @@ namespace daggy {
private:
std::unordered_map<K, Vertex<K, V>> vertices_;
std::optional<K> findCycle_(const K & node, std::unordered_set<K> & seen) const;
};
}

View File

@@ -11,6 +11,15 @@ namespace daggy {
template<typename K, typename V>
Vertex <K, V> &DAG<K, V>::getVertex(const K &id) { return vertices_.at(id); }
template<typename K, typename V>
std::unordered_set<K> DAG<K, V>::getVertices() const {
std::unordered_set<K> keys;
for (const auto it : vertices_) {
keys.insert(it.first);
}
return keys;
}
template<typename K, typename V>
void DAG<K, V>::addVertex(K id, V data) {
if (vertices_.count(id) != 0) {
@@ -31,42 +40,37 @@ namespace daggy {
template<typename K, typename V>
void DAG<K, V>::addEdgeIf(const K &src, std::function<bool(const Vertex <K, V> &v)> predicate) {
for (const auto &[name, vertex]: vertices_) {
auto & parent = vertices_.at(src);
for (auto &[name, vertex]: vertices_) {
if (! predicate(vertex)) continue;
if (name == src) continue;
if (predicate(vertex)) addEdge(src, name);
parent.children.insert(name);
vertex.depCount++;
}
}
template<typename K, typename V>
std::optional<K> DAG<K, V>::findCycle_(const K & node, std::unordered_set<K> & seen) const {
if (seen.count(node) > 0) return node;
seen.insert(node);
std::optional<K> ret;
for (const auto & child : vertices_.at(node).children) {
auto res = findCycle_(child, seen);
if (res.has_value()) {
ret.swap(res);
break;
}
}
seen.extract(node);
return ret;
}
template<typename K, typename V>
bool DAG<K, V>::isValid() const {
std::unordered_set<K> seen;
std::unordered_map<K, size_t> depCounts;
std::queue<K> ready;
size_t processed = 0;
for (const auto & [k, v] : vertices_) {
seen.clear();
if (v.depCount != 0) continue;
auto res = findCycle_(k, seen);
if (res.has_value()) {
std::stringstream ss;
ss << "DAG contains a cycle between " << k << " and " << res.value() << std::endl;
throw std::runtime_error(ss.str());
}
depCounts[k] = v.depCount;
if (v.depCount == 0) ready.push(k);
}
return true;
while (! ready.empty()) {
const auto & k = ready.front();
for (const auto & child : vertices_.at(k).children) {
auto dc = --depCounts[child];
if (dc == 0) ready.push(child);
}
processed++;
ready.pop();
}
return processed == vertices_.size();
}
template<typename K, typename V>

View File

@@ -16,7 +16,7 @@ namespace daggy {
using Command = std::vector<std::string>;
// Time
using Clock = std::chrono::system_clock;
using Clock = std::chrono::high_resolution_clock;
using TimePoint = std::chrono::time_point<Clock>;
// DAG Runs

View File

@@ -29,7 +29,7 @@ namespace daggy {
buildDAGFromTasks(TaskSet &tasks,
const std::vector<loggers::dag_run::TaskUpdateRecord> &updates = {});
void updateDAGFromTasks(TaskDAG &dag, TaskSet &tasks);
void updateDAGFromTasks(TaskDAG &dag, const TaskSet &tasks);
TaskDAG runDAG(DAGRunID runID,
executors::task::TaskExecutor &executor,

View File

@@ -0,0 +1,20 @@
#pragma once
#include "TaskExecutor.hpp"
namespace daggy::executors::task {
class NoopTaskExecutor : public TaskExecutor {
public:
using Command = std::vector<std::string>;
// Validates the job to ensure that all required values are set and are of the right type,
bool validateTaskParameters(const ConfigValues &job) override;
std::vector<ConfigValues>
expandTaskParameters(const ConfigValues &job, const ConfigValues &expansionValues) override;
// Runs the task
std::future<AttemptRecord> execute(const std::string &taskName, const Task &task) override;
};
}

View File

@@ -152,6 +152,15 @@ namespace daggy {
const auto &taskName = it->name.GetString();
tasks.emplace(taskName, taskFromJSON(taskName, it->value, jobDefaults));
}
// Normalize tasks so all the children are populated
for (auto &[k, v] : tasks) {
for (const auto & p : v.parents) {
tasks[p].children.insert(k);
}
v.parents.clear();
}
return tasks;
}

View File

@@ -78,28 +78,19 @@ namespace daggy {
}
void updateDAGFromTasks(TaskDAG &dag, TaskSet &tasks) {
// Add all the vertices
std::unordered_map<std::string, std::unordered_set<std::string>> definedSets;
void updateDAGFromTasks(TaskDAG &dag, const TaskSet &tasks) {
// Add the missing vertices
for (const auto &[name, task]: tasks) {
dag.addVertex(name, task);
definedSets[task.definedName].insert(name);
}
// Add edges
for (const auto &[name, task]: tasks) {
for (const auto &defChild: task.children) {
for (const auto &child: definedSets[defChild]) {
dag.addEdge(name, child);
}
}
dag.addEdgeIf(name, [&](const auto &v) { return task.children.count(v.data.definedName) > 0; });
}
for (const auto &defParent: task.parents) {
for (const auto &parent: definedSets[defParent]) {
dag.addEdge(parent, name);
tasks.at(parent).children.insert(name);
}
}
if (! dag.isValid()) {
throw std::runtime_error("DAG contains a cycle");
}
}
@@ -107,8 +98,6 @@ namespace daggy {
const std::vector<loggers::dag_run::TaskUpdateRecord> &updates) {
TaskDAG dag;
updateDAGFromTasks(dag, tasks);
dag.reset();
dag.isValid();
// Replay any updates
for (const auto &update: updates) {
@@ -120,6 +109,9 @@ namespace daggy {
dag.setVertexState(update.taskName, RunState::RUNNING);
dag.setVertexState(update.taskName, RunState::COMPLETED);
break;
case RunState::COMPLETED:
case RunState::QUEUED:
break;
}
}
@@ -145,10 +137,10 @@ namespace daggy {
if (fut.valid()) {
auto attempt = fut.get();
logger.logTaskAttempt(runID, taskName, attempt);
auto &vert = dag.getVertex(taskName);
auto &task = vert.data;
if (attempt.rc == 0) {
logger.updateTaskState(runID, taskName, RunState::COMPLETED);
auto &vert = dag.getVertex(taskName);
auto &task = vert.data;
if (task.isGenerator) {
// Parse the output and update the DAGs
try {
@@ -177,7 +169,6 @@ namespace daggy {
--running;
} else {
// RC isn't 0
const auto & task = dag.getVertex(taskName).data;
if (taskAttemptCounts[taskName] <= task.maxRetries) {
logger.updateTaskState(runID, taskName, RunState::RETRY);
runningTasks[taskName] = executor.execute(taskName, task);
@@ -198,6 +189,7 @@ namespace daggy {
auto &task = t.value().second;
taskAttemptCounts[taskName] = 1;
logger.updateTaskState(runID, taskName, RunState::RUNNING);
runningTasks.emplace(taskName, executor.execute(taskName, task));
++running;

View File

@@ -1,4 +1,5 @@
target_sources(${PROJECT_NAME} PRIVATE
ForkingTaskExecutor.cpp
SlurmTaskExecutor.cpp
NoopTaskExecutor.cpp
)

View File

@@ -0,0 +1,42 @@
#include <daggy/executors/task/NoopTaskExecutor.hpp>
#include <daggy/Utilities.hpp>
namespace daggy::executors::task {
std::future<daggy::AttemptRecord>
NoopTaskExecutor::execute(const std::string &taskName, const Task &task) {
std::promise<daggy::AttemptRecord> promise;
auto ts = Clock::now();
promise.set_value(AttemptRecord{
.startTime = ts,
.stopTime = ts,
.rc = 0,
.executorLog = taskName,
.outputLog = taskName,
.errorLog = taskName
});
return promise.get_future();
}
bool NoopTaskExecutor::validateTaskParameters(const ConfigValues &job) {
auto it = job.find("command");
if (it == job.end())
throw std::runtime_error(R"(job does not have a "command" argument)");
if (!std::holds_alternative<Command>(it->second))
throw std::runtime_error(R"(taskParameter's "command" must be an array of strings)");
return true;
}
std::vector<daggy::ConfigValues>
NoopTaskExecutor::expandTaskParameters(const ConfigValues &job, const ConfigValues &expansionValues) {
std::vector<ConfigValues> newValues;
const auto command = std::get<Command>(job.at("command"));
for (const auto &expandedCommand: interpolateValues(command, expansionValues)) {
ConfigValues newCommand{job};
newCommand.at("command") = expandedCommand;
newValues.emplace_back(newCommand);
}
return newValues;
}
}

View File

@@ -11,7 +11,7 @@ TEST_CASE("dag_construction", "[dag]") {
REQUIRE(dag.empty());
REQUIRE_NOTHROW(dag.addVertex(0, 0));
for (int i = 1; i < 10; ++i) {
for (size_t i = 1; i < 10; ++i) {
dag.addVertex(i, i);
REQUIRE(dag.hasVertex(i));
REQUIRE(dag.getVertex(i).data == i);
@@ -23,7 +23,7 @@ TEST_CASE("dag_construction", "[dag]") {
// Cannot add an edge that would result in a cycle
dag.addEdge(9, 5);
REQUIRE_THROWS(dag.isValid());
REQUIRE(!dag.isValid());
// Bounds checking
SECTION("addEdge Bounds Checking") {

View File

@@ -142,19 +142,24 @@ TEST_CASE("rest_endpoint", "[server_basic]") {
REQUIRE(doc.IsObject());
REQUIRE(doc.HasMember("taskStates"));
const auto &taskStates = doc["taskStates"].GetArray();
REQUIRE(taskStates.Size() == 3);
const auto &taskStates = doc["taskStates"].GetObject();
size_t nStates = 0;
for (auto it = taskStates.MemberBegin(); it != taskStates.MemberEnd(); ++it) {
nStates++;
}
REQUIRE(nStates == 3);
complete = true;
for (size_t i = 0; i < taskStates.Size(); ++i) {
std::string state = taskStates[i].GetString();
for (auto it = taskStates.MemberBegin(); it != taskStates.MemberEnd(); ++it) {
std::string state = it->value.GetString();
if (state != "COMPLETED") {
complete = false;
break;
}
}
if (complete) break;
std::this_thread::sleep_for(std::chrono::milliseconds(500));
std::this_thread::sleep_for(std::chrono::seconds(1));
}
REQUIRE(complete);

View File

@@ -1,4 +1,5 @@
#include <iostream>
#include <chrono>
#include <filesystem>
#include <fstream>
@@ -12,6 +13,7 @@
#include "daggy/Utilities.hpp"
#include "daggy/Serialization.hpp"
#include "daggy/executors/task/ForkingTaskExecutor.hpp"
#include "daggy/executors/task/NoopTaskExecutor.hpp"
#include "daggy/loggers/dag_run/OStreamLogger.hpp"
namespace fs = std::filesystem;
@@ -56,6 +58,56 @@ TEST_CASE("string_expansion", "[utilities_parameter_expansion]") {
}
}
TEST_CASE("dag_runner_order", "[dagrun_order]") {
daggy::executors::task::NoopTaskExecutor ex;
std::stringstream ss;
daggy::loggers::dag_run::OStreamLogger logger(ss);
daggy::TimePoint startTime = daggy::Clock::now();
std::string testParams{R"({"DATE": ["2021-05-06", "2021-05-07", "2021-05-08", "2021-05-09" ]})"};
auto params = daggy::configFromJSON(testParams);
std::string taskJSON = R"({
"A": {"job": {"command": ["/usr/bin/touch", "{{DATE}}"]}, "children": [ "B","D" ]},
"B": {"job": {"command": ["/usr/bin/touch", "{{DATE}}"]}, "children": [ "C","D","E" ]},
"C": {"job": {"command": ["/usr/bin/touch", "{{DATE}}"]}, "children": [ "D"]},
"D": {"job": {"command": ["/usr/bin/touch", "{{DATE}}"]}, "children": [ "E"]},
"E": {"job": {"command": ["/usr/bin/touch", "{{DATE}}"]}}
})";
auto tasks = expandTaskSet(daggy::tasksFromJSON(taskJSON), ex, params);
REQUIRE(tasks.size() == 20);
auto dag = daggy::buildDAGFromTasks(tasks);
auto runID = logger.startDAGRun("test_run", tasks);
auto endDAG = daggy::runDAG(runID, ex, logger, dag);
REQUIRE(endDAG.allVisited());
// Ensure the run order
auto rec = logger.getDAGRun(runID);
daggy::TimePoint stopTime = daggy::Clock::now();
std::array<daggy::TimePoint, 5> minTimes; minTimes.fill(startTime);
std::array<daggy::TimePoint, 5> maxTimes; maxTimes.fill(stopTime);
for (const auto &[k, v] : rec.taskAttempts) {
size_t idx = k[0] - 65;
auto & startTime = minTimes[idx];
auto & stopTime = maxTimes[idx];
startTime = std::max(startTime, v.front().startTime);
stopTime = std::min(stopTime, v.back().stopTime);
}
for (size_t i = 0; i < 5; ++i) {
for (size_t j = i+1; j < 4; ++j) {
REQUIRE(maxTimes[i] < minTimes[j]);
}
}
}
TEST_CASE("dag_runner", "[utilities_dag_runner]") {
daggy::executors::task::ForkingTaskExecutor ex(10);
std::stringstream ss;
@@ -181,55 +233,3 @@ TEST_CASE("dag_runner", "[utilities_dag_runner]") {
REQUIRE(record.tasks["C_0"].children.empty());
}
}
TEST_CASE("dag_runner_stress", "[utilities_dag_runner_stress]") {
daggy::executors::task::ForkingTaskExecutor ex(10);
std::stringstream ss;
daggy::loggers::dag_run::OStreamLogger logger(ss);
SECTION("Stress-test") {
static std::random_device dev;
static std::mt19937 rng(dev());
std::uniform_int_distribution<size_t> nDepDist(0, 10);
const size_t N_NODES = 100;
daggy::TaskSet tasks;
std::vector<fs::path> fileNames;
std::vector<std::string> taskNames;
for (size_t i = 0; i < N_NODES; ++i) {
std::string taskName = std::to_string(i);
std::uniform_int_distribution<size_t> depDist(i+1, N_NODES-1);
std::unordered_set<std::string> deps;
size_t nChildren = nDepDist(rng);
for (size_t c = 0; c < nChildren; ++c) {
deps.insert(std::to_string(depDist(rng)));
}
tasks.emplace(taskName, daggy::Task{
.definedName = taskName,
.job = { { "command", std::vector<std::string>{"/usr/bin/echo", taskName}}},
.children = deps
});
}
auto dag = daggy::buildDAGFromTasks(tasks);
/**
auto runID = logger.startDAGRun("test_run", tasks);
auto tryDAG = daggy::runDAG(runID, ex, logger, dag);
REQUIRE(tryDAG.allVisited());
// Get the DAG Run Attempts
auto record = logger.getDAGRun(runID);
for (const auto & [k, attempts] : record.taskAttempts) {
REQUIRE(attempts.size() == 1);
}
*/
}
}

View File

@@ -95,7 +95,8 @@ void daemonize() {
/* Change the working directory to the root directory */
/* or another appropriated directory */
chdir("/");
auto rc = chdir("/");
(void)rc;
/* Close all open file descriptors */
for (auto x = sysconf(_SC_OPEN_MAX); x >= 0; x--) { close(x); }