Files
daggy/libdaggy/src/executors/task/DaggyRunnerTaskExecutor.cpp
Ian Roddis 8d00621908 Adding support for remote execution daemons.
Squashed commit of the following:

commit 69d5ef7a256b86a86d46e5ae374c00fded1497ea
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 12:15:55 2021 -0400

    Updating readme

commit 94a9f676d0f9cc0b55cdc18c4927eaea40d82c77
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 12:05:36 2021 -0400

    Fixing serialization of attempt records when querying entire dag

commit 945e5f90b24abf07c9af1bc4c6bbcb33e93b8069
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 11:37:59 2021 -0400

    Compiles cleanly...

commit 8b23e46081d47fb80dc1a2d998fc6dc4bbf301a8
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 10:43:03 2021 -0400

    Adding in missing source file to cmake build list

commit 6d10d9791206e2bc15788beadeea580b8e43a853
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 10:41:43 2021 -0400

    Adding new executors

commit 42a2c67f4d6ae99df95d917c8621d78cd99837a1
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 10:27:14 2021 -0400

    Fixing missing curl cmake dependency

commit 394bc4c5d51ecee7bf14712f719c8bf7e97fb0fa
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 10:21:58 2021 -0400

    Fixing missing curl cmake dependency

commit dd9efc8e7e7770ea1bcbccb70a1af9cfcff0414c
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 17:15:38 2021 -0400

    Checkpointing progress

commit 3b3b55d6037bb96e46de6763f486f4ecb92fe6a0
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 14:21:18 2021 -0400

    updating readme

commit 303027c11452941b2a0c0d1b04ac5942e79efd74
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 14:17:16 2021 -0400

    Namespacing daggyd
    Adding more error checking around deserialization of parameters
    Adding tests for runner agent

commit c592eaeba12e2a449bae401e8c1d9ed236416d52
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 11:20:21 2021 -0400

    Checkpointing work

commit fb1862d1cefe2b53a98659cce3c8c73d88bf5d84
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 09:52:29 2021 -0400

    Copying daggyd for daggyr template, adding in basic routes
2021-12-16 12:16:12 -04:00

228 lines
6.4 KiB
C++

#include <daggy/Serialization.hpp>
#include <daggy/Utilities.hpp>
#include <daggy/executors/task/DaggyRunnerTaskExecutor.hpp>
#include <daggy/executors/task/ForkingTaskExecutor.hpp>
#include <iomanip>
using namespace daggy::executors::task;
using namespace daggy::executors::task::daggy_runner;
using namespace daggy;
namespace daggy::executors::task::daggy_runner {
/// Serialises a Capacity as a JSON object string of the form
/// `{ "cores": <cores>, "memoryMB": <memoryMB>}`.
///
/// @param cap Capacity whose `cores` and `memoryMB` fields are written.
/// @return The JSON text; both values are rendered with std::to_string.
std::string capacityToJSON(const Capacity &cap)
{
  std::string json{R"({ "cores": )"};
  json += std::to_string(cap.cores);
  json += R"(, "memoryMB": )";
  json += std::to_string(cap.memoryMB);
  json += "}";
  return json;
}
/// Builds a Capacity from a JSON object.
///
/// Both `cores` and `memoryMB` are optional; a missing member leaves the
/// corresponding field at 0.
///
/// @param spec JSON value expected to be an object.
/// @return The parsed capacity.
/// @throws std::runtime_error if `spec` is not an object, or if a present
///         member is not an integer representable as a 64-bit signed value.
Capacity capacityFromJSON(const rj::Value &spec)
{
  if (!spec.IsObject()) {
    throw std::runtime_error("Capacity is not an object");
  }

  Capacity cap{.cores = 0, .memoryMB = 0};

  // Reads an optional integer member into `target`.
  // IsInt64() (rather than IsNumber()) is required before GetInt64():
  // a floating point or out-of-range number would otherwise hit RapidJSON's
  // internal assertion instead of producing the error below.
  const auto readField = [&spec](const char *name, auto &target) {
    const auto member = spec.FindMember(name);
    if (member == spec.MemberEnd()) {
      return;
    }
    if (!member->value.IsInt64()) {
      throw std::runtime_error(std::string(name) +
                               " member of Capacity is not an integer");
    }
    target = member->value.GetInt64();
  };

  readField("cores", cap.cores);
  readField("memoryMB", cap.memoryMB);
  return cap;
}
/// Extracts the capacity a task requests from its job parameters.
///
/// Reads the `cores` and `memoryMB` entries of `task.job`, each of which must
/// hold a std::string, and converts them with std::stoll.
///
/// @param task Task whose job parameters are read.
/// @return Capacity populated from the two job entries.
/// @note Exceptions from the map lookup, std::get and std::stoll propagate
///       unchanged to the caller.
Capacity capacityFromTask(const Task &task)
{
  const auto readField = [&task](const char *key) {
    const auto &text = std::get<std::string>(task.job.at(key));
    return std::stoll(text);
  };

  Capacity requested{.cores = 0, .memoryMB = 0};
  requested.cores = readField("cores");
  requested.memoryMB = readField("memoryMB");
  return requested;
}
/// Validates the job parameters required by the daggy runner executor.
///
/// First applies forking_executor::validateTaskParameters, then requires that
/// `cores` and `memoryMB` are both present, hold a std::string, and that the
/// string is an integer in its entirety.
///
/// @param job Job parameters to validate.
/// @throws std::runtime_error if a field is missing, is not a string, or does
///         not parse completely as an integer (e.g. "4abc" is rejected).
void validateTaskParameters(const daggy::ConfigValues &job)
{
  forking_executor::validateTaskParameters(job);

  const std::array<std::string, 2> fields{"cores", "memoryMB"};
  for (const auto &field : fields) {
    const auto entry = job.find(field);
    if (entry == job.end())
      throw std::runtime_error("Missing required job parameter " + field);

    const auto *text = std::get_if<std::string>(&entry->second);
    if (text == nullptr)
      throw std::runtime_error(field + " in capacity is not a string");

    // std::stoll stops at the first non-numeric character, so a successful
    // call alone would accept values with trailing garbage; also require
    // that every character was consumed.
    std::size_t consumed = 0;
    bool isInteger = false;
    try {
      static_cast<void>(std::stoll(*text, &consumed));
      isInteger = (consumed == text->size());
    }
    catch (const std::exception &) {
      isInteger = false;
    }
    if (!isInteger)
      throw std::runtime_error(field + " in capacity is not an integer");
  }
}
} // namespace daggy::executors::task::daggy_runner
DaggyRunnerTaskExecutor::DaggyRunnerTaskExecutor()
: running_(true)
, monitorWorker_(&DaggyRunnerTaskExecutor::monitor, this)
{
}
// Signals the monitor loop to stop and waits for the worker to finish.
// Because monitor() re-checks running_ only once per polling cycle, this may
// block until the current cycle (including its sleep) completes.
DaggyRunnerTaskExecutor::~DaggyRunnerTaskExecutor()
{
  running_ = false;
  // join() on a non-joinable thread throws, which would terminate the
  // process when raised from a destructor; guard against that.
  if (monitorWorker_.joinable()) {
    monitorWorker_.join();
  }
}
// Validates the job to ensure that all required values are set and are of
// the right type.
//
// Delegates to daggy_runner::validateTaskParameters, which reports problems
// by throwing; this method therefore never returns false -- it either throws
// or returns true.
bool DaggyRunnerTaskExecutor::validateTaskParameters(const ConfigValues &job)
{
daggy_runner::validateTaskParameters(job);
return true;
}
std::vector<ConfigValues> DaggyRunnerTaskExecutor::expandTaskParameters(
const ConfigValues &job, const ConfigValues &expansionValues)
{
std::vector<ConfigValues> newValues;
auto command =
(job.count("command") == 0 ? Command{}
: std::get<Command>(job.at("command")));
auto environment = (job.count("environment") == 0
? Command{}
: std::get<Command>(job.at("environment")));
Command both(command);
std::copy(environment.begin(), environment.end(), std::back_inserter(both));
for (const auto &parts : interpolateValues(both, expansionValues)) {
ConfigValues newCommand{job};
newCommand["command"] =
Command(parts.begin(), parts.begin() + command.size());
newCommand["environment"] =
Command(parts.begin() + command.size(), parts.end());
newValues.emplace_back(newCommand);
}
return newValues;
}
// Runs the task.
//
// Queries every registered runner's /v1/capacity endpoint, estimates the
// relative impact the task would have on each (the larger of the cores and
// memory fractions it would consume), picks the runner with the smallest
// impact and registers the task in runningTasks_ so that monitor() can
// resolve the returned future.
//
// Runners that fail to respond, respond with a non-Ok code, return a
// malformed document, or report no usable capacity are skipped.
//
// Throws std::runtime_error if no runner yields a usable capacity; exceptions
// from capacityFromTask propagate to the caller.
//
// NOTE(review): nothing in this function sends the task to the selected
// runner -- confirm that submission happens elsewhere or is still to be
// implemented.
std::future<AttemptRecord> DaggyRunnerTaskExecutor::execute(
    DAGRunID runID, const std::string &taskName, const Task &task)
{
  const auto taskUsed = capacityFromTask(task);

  // Get the capacities for all the runners.
  // Capacities for a runner can be negative, meaning that they're currently
  // oversubscribed.
  std::vector<std::pair<std::string, double>> impacts;
  for (const auto &runner : runners_) {
    try {
      const auto &[code, doc] = JSON_HTTP_REQUEST(runner + "/v1/capacity");
      if (code != HTTPCode::Ok) {
        continue;
      }
      // Guard the member lookups: indexing a missing member is not a
      // recoverable error in RapidJSON.
      if (!doc.IsObject() || !doc.HasMember("current") ||
          !doc.HasMember("total")) {
        continue;
      }

      const auto curCap = capacityFromJSON(doc["current"]);
      const auto totCap = capacityFromJSON(doc["total"]);

      // An exhausted (zero) resource is treated like an oversubscribed
      // (negative) one: fall back to the runner's total capacity.
      const ssize_t cores = curCap.cores <= 0 ? totCap.cores : curCap.cores;
      const ssize_t memoryMB =
          curCap.memoryMB <= 0 ? totCap.memoryMB : curCap.memoryMB;

      // A runner without positive capacity cannot be ranked and would
      // otherwise cause a division by zero.
      if (cores <= 0 || memoryMB <= 0) {
        continue;
      }

      // Divide in floating point; integer division would truncate every
      // fraction below 1 to 0 and make all runners look equivalent.
      const double coreImpact =
          static_cast<double>(taskUsed.cores) / static_cast<double>(cores);
      const double memoryImpact = static_cast<double>(taskUsed.memoryMB) /
                                  static_cast<double>(memoryMB);
      impacts.emplace_back(runner, std::max(coreImpact, memoryImpact));
    }
    catch (const std::exception &) {
      // Best effort: an unreachable or misbehaving runner is simply not a
      // candidate.
      continue;
    }
  }

  if (impacts.empty())
    throw std::runtime_error("No runners available for execution");

  // Pick the first runner with the lowest impact.
  auto cit = impacts.begin();
  for (auto it = impacts.begin(); it != impacts.end(); ++it) {
    if (it->second < cit->second)
      cit = it;
  }

  RunningTask rt{
      .prom{}, .runID = runID, .taskName = taskName, .runnerURL = cit->first};
  auto fut = rt.prom.get_future();

  std::lock_guard<std::mutex> lock(rtGuard_);
  runningTasks_.emplace(std::make_pair(runID, taskName), std::move(rt));
  return fut;
}
// Requests that a running task be stopped.
//
// Currently a stub: both arguments are ignored, no runner is contacted and
// runningTasks_ is left untouched; the call always reports success.
bool DaggyRunnerTaskExecutor::stop(DAGRunID runID, const std::string &taskName)
{
return true;
}
// Registers a runner by its base URL so that execute() will consider it when
// placing tasks (execute() appends "/v1/capacity" to this URL).
// NOTE(review): runners_ is modified here without taking a lock, while
// execute() iterates it -- confirm callers do not add runners concurrently
// with task execution.
void DaggyRunnerTaskExecutor::addRunner(const std::string &url)
{
runners_.insert(url);
}
// Background loop that polls runners for the state of every registered task.
//
// Once per cycle, while running_ is true, each entry of runningTasks_ is
// queried at <runnerURL>/v1/task/<runID>/<taskName>:
//   * a non-Ok response resolves the task's promise with a failure record
//     (rc = -1);
//   * a "COMPLETED" state resolves it with the attempt record from the
//     response;
//   * anything else, including an exception while querying or parsing,
//     leaves the task registered so it is retried on the next cycle.
// Resolved tasks are removed after the scan, and the lock is released before
// sleeping so execute() can register new tasks between cycles.
void DaggyRunnerTaskExecutor::monitor()
{
  while (running_) {
    {
      std::vector<std::pair<DAGRunID, std::string>> resolvedJobs;

      // NOTE(review): the HTTP requests below run while rtGuard_ is held, so
      // execute() blocks for the duration of a polling pass.
      std::lock_guard<std::mutex> lock(rtGuard_);
      for (auto &[taskID, task] : runningTasks_) {
        try {
          const auto &[code, json] = JSON_HTTP_REQUEST(
              task.runnerURL + "/v1/task/" + std::to_string(taskID.first) +
              "/" + taskID.second);
          if (code != HTTPCode::Ok) {
            AttemptRecord record{
                .rc = -1, .executorLog = "Unable to query runner for progress"};
            task.prom.set_value(std::move(record));
            resolvedJobs.emplace_back(taskID);
            continue;
          }

          // Check for the member before indexing; a response without a
          // "state" is treated as "not finished yet".
          if (json.IsObject() && json.HasMember("state") &&
              json["state"] == "COMPLETED") {
            task.prom.set_value(attemptRecordFromJSON(json["attempt"]));
            resolvedJobs.emplace_back(taskID);
          }
        }
        catch (const std::exception &) {
          // Nothing may escape the thread entry point; retry next cycle.
          continue;
        }
      }

      // Remove resolved tasks only after the scan: erasing inside the loop
      // above would invalidate the iterator driving it.
      for (const auto &tid : resolvedJobs) {
        runningTasks_.erase(tid);
      }
    }

    // Sleep with the lock released so other threads can make progress.
    std::this_thread::sleep_for(std::chrono::seconds(1));
  }
}