Convert the underlying DAG implementation to use vectors for storage
This reduces the amount of hashing required and speeds up traversals
This commit is contained in:
@@ -20,13 +20,13 @@
|
||||
|
||||
namespace daggy {
|
||||
|
||||
template <typename K, typename V>
|
||||
template <typename T>
|
||||
struct Vertex
|
||||
{
|
||||
RunState state = RunState::QUEUED;
|
||||
uint32_t depCount = 0;
|
||||
V data;
|
||||
std::unordered_set<K> children;
|
||||
T data;
|
||||
std::unordered_set<size_t> children;
|
||||
};
|
||||
|
||||
template <typename K, typename V>
|
||||
@@ -42,7 +42,7 @@ namespace daggy {
|
||||
void addEdge(const K &from, const K &to);
|
||||
|
||||
void addEdgeIf(const K &src,
|
||||
std::function<bool(const Vertex<K, V> &v)> predicate);
|
||||
std::function<bool(const Vertex<V> &v)> predicate);
|
||||
|
||||
[[nodiscard]] bool isValid() const;
|
||||
|
||||
@@ -61,19 +61,21 @@ namespace daggy {
|
||||
|
||||
void setVertexState(const K &id, RunState state);
|
||||
|
||||
void forEach(
|
||||
std::function<void(const std::pair<K, Vertex<K, V>> &)> fun) const;
|
||||
void forEach(std::function<void(const Vertex<V> &)> fun) const;
|
||||
|
||||
[[nodiscard]] bool allVisited() const;
|
||||
|
||||
std::optional<std::pair<K, V>> visitNext();
|
||||
|
||||
Vertex<K, V> &getVertex(const K &id);
|
||||
// WARNING: the returned reference may be invalidated when a vertex is added, because vertices are stored in a std::vector that can reallocate.
|
||||
Vertex<V> &getVertex(const K &id);
|
||||
|
||||
void completeVisit(const K &id);
|
||||
|
||||
private:
|
||||
std::unordered_map<K, Vertex<K, V>> vertices_;
|
||||
std::unordered_map<K, size_t> keyMap_;
|
||||
std::vector<K> vertexName_;
|
||||
std::vector<Vertex<V>> vertices_;
|
||||
};
|
||||
} // namespace daggy
|
||||
|
||||
|
||||
@@ -14,20 +14,20 @@ namespace daggy {
|
||||
template <typename K, typename V>
|
||||
bool DAG<K, V>::hasVertex(const K &id)
|
||||
{
|
||||
return vertices_.count(id) != 0;
|
||||
return keyMap_.count(id) != 0;
|
||||
}
|
||||
|
||||
template <typename K, typename V>
|
||||
Vertex<K, V> &DAG<K, V>::getVertex(const K &id)
|
||||
Vertex<V> &DAG<K, V>::getVertex(const K &id)
|
||||
{
|
||||
return vertices_.at(id);
|
||||
return vertices_[keyMap_.at(id)];
|
||||
}
|
||||
|
||||
template <typename K, typename V>
|
||||
std::unordered_set<K> DAG<K, V>::getVertices() const
|
||||
{
|
||||
std::unordered_set<K> keys;
|
||||
for (const auto it : vertices_) {
|
||||
for (const auto it : keyMap_) {
|
||||
keys.insert(it.first);
|
||||
}
|
||||
return keys;
|
||||
@@ -36,58 +36,59 @@ namespace daggy {
|
||||
template <typename K, typename V>
|
||||
void DAG<K, V>::addVertex(K id, V data)
|
||||
{
|
||||
if (vertices_.count(id) != 0) {
|
||||
if (keyMap_.count(id) != 0) {
|
||||
std::stringstream ss;
|
||||
ss << "A vertex with ID " << id << " already exists in the DAG";
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
vertices_.emplace(
|
||||
id,
|
||||
Vertex<K, V>{.state = RunState::QUEUED, .depCount = 0, .data = data});
|
||||
size_t idx = vertices_.size();
|
||||
vertexName_.emplace_back(id);
|
||||
vertices_.emplace_back(
|
||||
Vertex<V>{.state = RunState::QUEUED, .depCount = 0, .data = data});
|
||||
keyMap_.emplace(id, idx);
|
||||
}
|
||||
|
||||
template <typename K, typename V>
|
||||
void DAG<K, V>::addEdge(const K &from, const K &to)
|
||||
{
|
||||
if (vertices_.find(from) == vertices_.end())
|
||||
throw std::runtime_error("No such vertex");
|
||||
if (vertices_.find(to) == vertices_.end())
|
||||
throw std::runtime_error("No such vertex");
|
||||
vertices_.at(from).children.insert(to);
|
||||
vertices_.at(to).depCount++;
|
||||
size_t src = keyMap_.at(from);
|
||||
size_t dst = keyMap_.at(to);
|
||||
vertices_[src].children.insert(dst);
|
||||
vertices_[dst].depCount++;
|
||||
}
|
||||
|
||||
template <typename K, typename V>
|
||||
void DAG<K, V>::addEdgeIf(
|
||||
const K &src, std::function<bool(const Vertex<K, V> &v)> predicate)
|
||||
void DAG<K, V>::addEdgeIf(const K &src,
|
||||
std::function<bool(const Vertex<V> &v)> predicate)
|
||||
{
|
||||
auto &parent = vertices_.at(src);
|
||||
for (auto &[name, vertex] : vertices_) {
|
||||
if (!predicate(vertex))
|
||||
size_t parentIdx = keyMap_.at(src);
|
||||
auto &parent = vertices_[parentIdx];
|
||||
for (size_t i = 0; i < vertices_.size(); ++i) {
|
||||
if (!predicate(vertices_[i]))
|
||||
continue;
|
||||
if (name == src)
|
||||
if (i == parentIdx)
|
||||
continue;
|
||||
parent.children.insert(name);
|
||||
vertex.depCount++;
|
||||
parent.children.insert(i);
|
||||
vertices_[i].depCount++;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename K, typename V>
|
||||
bool DAG<K, V>::isValid() const
|
||||
{
|
||||
std::unordered_map<K, size_t> depCounts;
|
||||
std::queue<K> ready;
|
||||
std::vector<size_t> depCounts(vertices_.size(), 0);
|
||||
std::queue<size_t> ready;
|
||||
size_t processed = 0;
|
||||
|
||||
for (const auto &[k, v] : vertices_) {
|
||||
depCounts[k] = v.depCount;
|
||||
if (v.depCount == 0)
|
||||
ready.push(k);
|
||||
for (size_t i = 0; i < vertices_.size(); ++i) {
|
||||
depCounts[i] = vertices_[i].depCount;
|
||||
if (depCounts[i] == 0)
|
||||
ready.push(i);
|
||||
}
|
||||
|
||||
while (!ready.empty()) {
|
||||
const auto &k = ready.front();
|
||||
for (const auto &child : vertices_.at(k).children) {
|
||||
for (const auto &child : vertices_[k].children) {
|
||||
auto dc = --depCounts[child];
|
||||
if (dc == 0)
|
||||
ready.push(child);
|
||||
@@ -103,15 +104,15 @@ namespace daggy {
|
||||
void DAG<K, V>::reset()
|
||||
{
|
||||
// Reset the state of all vertices
|
||||
for (auto &[_, v] : vertices_) {
|
||||
for (auto &v : vertices_) {
|
||||
v.state = RunState::QUEUED;
|
||||
v.depCount = 0;
|
||||
}
|
||||
|
||||
// Calculate the upstream count
|
||||
for (auto &[_, v] : vertices_) {
|
||||
for (auto &v : vertices_) {
|
||||
for (auto c : v.children) {
|
||||
vertices_.at(c).depCount++;
|
||||
vertices_[c].depCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -119,7 +120,7 @@ namespace daggy {
|
||||
template <typename K, typename V>
|
||||
void DAG<K, V>::resetRunning()
|
||||
{
|
||||
for (auto &[k, v] : vertices_) {
|
||||
for (auto &v : vertices_) {
|
||||
if (v.state != +RunState::RUNNING)
|
||||
continue;
|
||||
v.state = RunState::QUEUED;
|
||||
@@ -129,29 +130,28 @@ namespace daggy {
|
||||
template <typename K, typename V>
|
||||
void DAG<K, V>::setVertexState(const K &id, RunState state)
|
||||
{
|
||||
vertices_.at(id).state = state;
|
||||
vertices_[keyMap_.at(id)].state = state;
|
||||
}
|
||||
|
||||
template <typename K, typename V>
|
||||
bool DAG<K, V>::allVisited() const
|
||||
{
|
||||
for (const auto &[_, v] : vertices_) {
|
||||
if (v.state != +RunState::COMPLETED)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
return not std::any_of(
|
||||
vertices_.begin(), vertices_.end(),
|
||||
[](const auto &v) { return v.state != +RunState::COMPLETED; });
|
||||
}
|
||||
|
||||
template <typename K, typename V>
|
||||
std::optional<std::pair<K, V>> DAG<K, V>::visitNext()
|
||||
{
|
||||
for (auto &[k, v] : vertices_) {
|
||||
for (size_t i = 0; i < vertices_.size(); ++i) {
|
||||
auto &v = vertices_[i];
|
||||
if (v.state != +RunState::QUEUED)
|
||||
continue;
|
||||
if (v.depCount != 0)
|
||||
continue;
|
||||
v.state = RunState::RUNNING;
|
||||
return std::make_pair(k, v.data);
|
||||
return std::make_pair(vertexName_[i], v.data);
|
||||
}
|
||||
return {};
|
||||
}
|
||||
@@ -159,16 +159,15 @@ namespace daggy {
|
||||
template <typename K, typename V>
|
||||
void DAG<K, V>::completeVisit(const K &id)
|
||||
{
|
||||
auto &v = vertices_.at(id);
|
||||
auto &v = vertices_[keyMap_.at(id)];
|
||||
v.state = RunState::COMPLETED;
|
||||
for (auto c : v.children) {
|
||||
--vertices_.at(c).depCount;
|
||||
--vertices_[c].depCount;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename K, typename V>
|
||||
void DAG<K, V>::forEach(
|
||||
std::function<void(const std::pair<K, Vertex<K, V>> &)> fun) const
|
||||
void DAG<K, V>::forEach(std::function<void(const Vertex<V> &)> fun) const
|
||||
{
|
||||
for (auto it = vertices_.begin(); it != vertices_.end(); ++it) {
|
||||
fun(*it);
|
||||
|
||||
@@ -146,22 +146,36 @@ namespace daggy {
|
||||
if (fut.valid()) {
|
||||
auto attempt = fut.get();
|
||||
logger.logTaskAttempt(runID, taskName, attempt);
|
||||
auto &vert = dag.getVertex(taskName);
|
||||
|
||||
// Not a reference, since adding tasks will invalidate references
|
||||
auto vert = dag.getVertex(taskName);
|
||||
auto &task = vert.data;
|
||||
if (attempt.rc == 0) {
|
||||
logger.updateTaskState(runID, taskName, RunState::COMPLETED);
|
||||
if (task.isGenerator) {
|
||||
// Parse the output and update the DAGs
|
||||
try {
|
||||
auto newTasks = expandTaskSet(tasksFromJSON(attempt.outputLog),
|
||||
executor, parameters);
|
||||
auto parsedTasks = tasksFromJSON(attempt.outputLog);
|
||||
auto newTasks =
|
||||
expandTaskSet(parsedTasks, executor, parameters);
|
||||
updateDAGFromTasks(dag, newTasks);
|
||||
|
||||
// Add in dependencies from current task to new tasks
|
||||
for (const auto &[ntName, ntTask] : newTasks) {
|
||||
logger.addTask(runID, ntName, ntTask);
|
||||
dag.addEdge(taskName, ntName);
|
||||
task.children.insert(ntName);
|
||||
}
|
||||
|
||||
// Efficiently add new edges from generator task
|
||||
// to children
|
||||
std::unordered_set<std::string> baseNames;
|
||||
for (const auto &[k, v] : parsedTasks) {
|
||||
baseNames.insert(v.definedName);
|
||||
}
|
||||
dag.addEdgeIf(taskName, [&](const auto &v) {
|
||||
return baseNames.count(v.data.definedName) > 0;
|
||||
});
|
||||
|
||||
logger.updateTask(runID, taskName, task);
|
||||
}
|
||||
catch (std::exception &e) {
|
||||
|
||||
@@ -63,7 +63,9 @@ TEST_CASE("dag_traversal", "[dag]")
|
||||
std::vector<size_t> visitOrder(N_VERTICES);
|
||||
size_t i = 0;
|
||||
while (!dag.allVisited()) {
|
||||
const auto v = dag.visitNext().value();
|
||||
auto o = dag.visitNext();
|
||||
REQUIRE(o.has_value());
|
||||
const auto v = o.value();
|
||||
dag.completeVisit(v.first);
|
||||
visitOrder[v.first] = i;
|
||||
++i;
|
||||
|
||||
@@ -150,6 +150,13 @@ TEST_CASE("dag_runner", "[utilities_dag_runner]")
|
||||
REQUIRE(attempts.front().rc == 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("runDAG_recovery", "[runDAG]")
|
||||
{
|
||||
daggy::executors::task::ForkingTaskExecutor ex(10);
|
||||
std::stringstream ss;
|
||||
daggy::loggers::dag_run::OStreamLogger logger(ss);
|
||||
|
||||
SECTION("Recovery from Error")
|
||||
{
|
||||
@@ -198,6 +205,13 @@ TEST_CASE("dag_runner", "[utilities_dag_runner]")
|
||||
|
||||
cleanup();
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("runDAG_generator", "[runDAG_generator]")
|
||||
{
|
||||
daggy::executors::task::ForkingTaskExecutor ex(10);
|
||||
std::stringstream ss;
|
||||
daggy::loggers::dag_run::OStreamLogger logger(ss);
|
||||
|
||||
SECTION("Generator tasks")
|
||||
{
|
||||
@@ -211,8 +225,8 @@ TEST_CASE("dag_runner", "[utilities_dag_runner]")
|
||||
ofh << generatorOutput << std::endl;
|
||||
ofh.close();
|
||||
|
||||
daggy::TimePoint globalStartTime = daggy::Clock::now();
|
||||
std::stringstream jsonTasks;
|
||||
|
||||
jsonTasks
|
||||
<< R"({ "A": { "job": {"command": [ "/usr/bin/cat", )"
|
||||
<< std::quoted(ofn.string())
|
||||
@@ -221,8 +235,10 @@ TEST_CASE("dag_runner", "[utilities_dag_runner]")
|
||||
|
||||
auto baseTasks = daggy::tasksFromJSON(jsonTasks.str());
|
||||
REQUIRE(baseTasks.size() == 2);
|
||||
REQUIRE(baseTasks["A"].children == std::unordered_set<std::string>{"C"});
|
||||
auto tasks = daggy::expandTaskSet(baseTasks, ex, params);
|
||||
REQUIRE(tasks.size() == 2);
|
||||
REQUIRE(tasks["A_0"].children == std::unordered_set<std::string>{"C"});
|
||||
auto dag = daggy::buildDAGFromTasks(tasks);
|
||||
REQUIRE(dag.size() == 2);
|
||||
|
||||
@@ -250,5 +266,27 @@ TEST_CASE("dag_runner", "[utilities_dag_runner]")
|
||||
REQUIRE(record.tasks["B_1"].children ==
|
||||
std::unordered_set<std::string>{"C"});
|
||||
REQUIRE(record.tasks["C_0"].children.empty());
|
||||
|
||||
// Ensure they were run in the right order
|
||||
// All A's get run before B's, which run before C's
|
||||
daggy::TimePoint globalStopTime = daggy::Clock::now();
|
||||
std::array<daggy::TimePoint, 3> minTimes;
|
||||
minTimes.fill(globalStartTime);
|
||||
std::array<daggy::TimePoint, 3> maxTimes;
|
||||
maxTimes.fill(globalStopTime);
|
||||
|
||||
for (const auto &[k, v] : record.taskAttempts) {
|
||||
size_t idx = k[0] - 65;
|
||||
auto &startTime = minTimes[idx];
|
||||
auto &stopTime = maxTimes[idx];
|
||||
startTime = std::max(startTime, v.front().startTime);
|
||||
stopTime = std::min(stopTime, v.back().stopTime);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
for (size_t j = i + 1; j < 2; ++j) {
|
||||
REQUIRE(maxTimes[i] < minTimes[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user