Adding support for remote execution daemons.
Squashed commit of the following: commit 69d5ef7a256b86a86d46e5ae374c00fded1497ea Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 12:15:55 2021 -0400 Updating readme commit 94a9f676d0f9cc0b55cdc18c4927eaea40d82c77 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 12:05:36 2021 -0400 Fixing serialization of attempt records when querying entire dag commit 945e5f90b24abf07c9af1bc4c6bbcb33e93b8069 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 11:37:59 2021 -0400 Compiles cleanly... commit 8b23e46081d47fb80dc1a2d998fc6dc4bbf301a8 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 10:43:03 2021 -0400 Adding in missing source file to cmake build list commit 6d10d9791206e2bc15788beadeea580b8e43a853 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 10:41:43 2021 -0400 Adding new executors commit 42a2c67f4d6ae99df95d917c8621d78cd99837a1 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 10:27:14 2021 -0400 Fixing missing curl cmake dependency commit 394bc4c5d51ecee7bf14712f719c8bf7e97fb0fa Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 10:21:58 2021 -0400 Fixing missing curl cmake dependency commit dd9efc8e7e7770ea1bcbccb70a1af9cfcff0414c Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 17:15:38 2021 -0400 Checkpointing progress commit 3b3b55d6037bb96e46de6763f486f4ecb92fe6a0 Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 14:21:18 2021 -0400 updating readme commit 303027c11452941b2a0c0d1b04ac5942e79efd74 Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 14:17:16 2021 -0400 Namespacing daggyd Adding more error checking around deserialization of parameters Adding tests for runner agent commit c592eaeba12e2a449bae401e8c1d9ed236416d52 Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 11:20:21 2021 -0400 Checkpointing work commit fb1862d1cefe2b53a98659cce3c8c73d88bf5d84 Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 09:52:29 2021 -0400 Copying daggyd for daggyr template, adding in basic routes
This commit is contained in:
227
libdaggy/src/executors/task/DaggyRunnerTaskExecutor.cpp
Normal file
227
libdaggy/src/executors/task/DaggyRunnerTaskExecutor.cpp
Normal file
@@ -0,0 +1,227 @@
|
||||
#include <daggy/Serialization.hpp>
|
||||
#include <daggy/Utilities.hpp>
|
||||
#include <daggy/executors/task/DaggyRunnerTaskExecutor.hpp>
|
||||
#include <daggy/executors/task/ForkingTaskExecutor.hpp>
|
||||
#include <iomanip>
|
||||
|
||||
using namespace daggy::executors::task;
|
||||
using namespace daggy::executors::task::daggy_runner;
|
||||
using namespace daggy;
|
||||
|
||||
namespace daggy::executors::task::daggy_runner {
|
||||
std::string capacityToJSON(const Capacity &cap)
|
||||
{
|
||||
return R"({ "cores": )" + std::to_string(cap.cores) + R"(, "memoryMB": )" +
|
||||
std::to_string(cap.memoryMB) + "}";
|
||||
}
|
||||
|
||||
// Parses a Capacity out of a JSON object.
//
// Both "cores" and "memoryMB" are optional: a member that is absent leaves the
// corresponding field at 0.
//
// Throws std::runtime_error when `spec` is not an object, or when a member
// that is present does not hold a value representable as a 64-bit signed
// integer.
Capacity capacityFromJSON(const rj::Value &spec)
{
  Capacity cap{.cores = 0, .memoryMB = 0};

  if (!spec.IsObject()) {
    throw std::runtime_error("Capacity is not an object");
  }

  // IsInt64() rather than IsNumber(): GetInt64() is only valid for values that
  // fit an int64, so a double such as 1.5 must be rejected here instead of
  // tripping RapidJSON's internal assertion.
  if (const auto cores = spec.FindMember("cores"); cores != spec.MemberEnd()) {
    if (!cores->value.IsInt64()) {
      throw std::runtime_error("cores member of Capacity is not an integer");
    }
    cap.cores = cores->value.GetInt64();
  }

  if (const auto memory = spec.FindMember("memoryMB");
      memory != spec.MemberEnd()) {
    if (!memory->value.IsInt64()) {
      throw std::runtime_error(
          "memoryMB member of Capacity is not an integer");
    }
    cap.memoryMB = memory->value.GetInt64();
  }

  return cap;
}
|
||||
|
||||
// Builds the Capacity a task asks for from the "cores" and "memoryMB" entries
// of its job parameters. Each entry is read as a string and converted with
// std::stoll; a missing entry, a non-string entry or an unparsable string
// surfaces as the exception thrown by at(), std::get or std::stoll.
Capacity capacityFromTask(const Task &task)
{
  const auto readField = [&task](const char *key) {
    return std::stoll(std::get<std::string>(task.job.at(key)));
  };

  Capacity requested{.cores = 0, .memoryMB = 0};
  requested.cores = readField("cores");
  requested.memoryMB = readField("memoryMB");
  return requested;
}
|
||||
|
||||
// Validates the job parameters needed to run a task on a remote runner.
//
// The forking executor's validation is applied first; after that the job must
// also carry "cores" and "memoryMB", each stored as a string holding an
// integer.
//
// Throws std::runtime_error naming the offending field when one is missing,
// is not a string, or is not entirely an integer.
void validateTaskParameters(const daggy::ConfigValues &job)
{
  forking_executor::validateTaskParameters(job);

  const std::array<std::string, 2> fields{"cores", "memoryMB"};

  for (const auto &field : fields) {
    const auto entry = job.find(field);
    if (entry == job.end())
      throw std::runtime_error("Missing required job parameter " + field);

    const auto &val = entry->second;

    if (!std::holds_alternative<std::string>(val))
      throw std::runtime_error(field + " in capacity is not a string");

    const auto &text = std::get<std::string>(val);

    // std::stoll stops at the first character it cannot use, so "12abc" would
    // parse as 12. Track how much was consumed and insist on the whole string.
    std::size_t consumed = 0;
    try {
      std::stoll(text, &consumed);
    }
    catch (const std::exception &) {
      throw std::runtime_error(field + " in capacity is not an integer");
    }

    if (consumed != text.size())
      throw std::runtime_error(field + " in capacity is not an integer");
  }
}
|
||||
} // namespace daggy::executors::task::daggy_runner
|
||||
|
||||
DaggyRunnerTaskExecutor::DaggyRunnerTaskExecutor()
|
||||
: running_(true)
|
||||
, monitorWorker_(&DaggyRunnerTaskExecutor::monitor, this)
|
||||
{
|
||||
}
|
||||
|
||||
// Signals the monitor loop to finish and waits for the worker thread to exit.
DaggyRunnerTaskExecutor::~DaggyRunnerTaskExecutor()
{
  running_ = false;

  // join() throws if the thread is not joinable; an exception escaping a
  // destructor would terminate the process, so check first.
  if (monitorWorker_.joinable()) {
    monitorWorker_.join();
  }
}
|
||||
|
||||
// Validates the job to ensure that all required values are set and are of
|
||||
// the right type,
|
||||
bool DaggyRunnerTaskExecutor::validateTaskParameters(const ConfigValues &job)
|
||||
{
|
||||
daggy_runner::validateTaskParameters(job);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<ConfigValues> DaggyRunnerTaskExecutor::expandTaskParameters(
|
||||
const ConfigValues &job, const ConfigValues &expansionValues)
|
||||
{
|
||||
std::vector<ConfigValues> newValues;
|
||||
|
||||
auto command =
|
||||
(job.count("command") == 0 ? Command{}
|
||||
: std::get<Command>(job.at("command")));
|
||||
|
||||
auto environment = (job.count("environment") == 0
|
||||
? Command{}
|
||||
: std::get<Command>(job.at("environment")));
|
||||
|
||||
Command both(command);
|
||||
std::copy(environment.begin(), environment.end(), std::back_inserter(both));
|
||||
|
||||
for (const auto &parts : interpolateValues(both, expansionValues)) {
|
||||
ConfigValues newCommand{job};
|
||||
newCommand["command"] =
|
||||
Command(parts.begin(), parts.begin() + command.size());
|
||||
newCommand["environment"] =
|
||||
Command(parts.begin() + command.size(), parts.end());
|
||||
newValues.emplace_back(newCommand);
|
||||
}
|
||||
|
||||
return newValues;
|
||||
}
|
||||
|
||||
// Runs the task
|
||||
std::future<AttemptRecord> DaggyRunnerTaskExecutor::execute(
|
||||
DAGRunID runID, const std::string &taskName, const Task &task)
|
||||
{
|
||||
auto taskUsed = capacityFromTask(task);
|
||||
|
||||
// Get the capacities for all the runners
|
||||
// Capacities for a runner can be negative, meaning that they're currently
|
||||
// oversubscribed.
|
||||
std::vector<std::pair<std::string, double>> impacts;
|
||||
for (const auto &runner : runners_) {
|
||||
try {
|
||||
const auto &[code, doc] = JSON_HTTP_REQUEST(runner + "/v1/capacity");
|
||||
if (code != HTTPCode::Ok) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto curCap = capacityFromJSON(doc["current"]);
|
||||
auto totCap = capacityFromJSON(doc["total"]);
|
||||
|
||||
ssize_t cores = curCap.cores < 0 ? totCap.cores : curCap.cores;
|
||||
ssize_t memoryMB =
|
||||
curCap.memoryMB < 0 ? totCap.memoryMB : curCap.memoryMB;
|
||||
|
||||
double impact =
|
||||
std::max(taskUsed.cores / cores, taskUsed.memoryMB / memoryMB);
|
||||
impacts.emplace_back(runner, impact);
|
||||
}
|
||||
catch (const std::exception &_) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (impacts.empty())
|
||||
throw std::runtime_error("No runners available for execution");
|
||||
|
||||
auto cit = impacts.begin();
|
||||
for (auto it = impacts.begin(); it != impacts.end(); ++it) {
|
||||
if (it->second < cit->second)
|
||||
cit = it;
|
||||
}
|
||||
|
||||
RunningTask rt{
|
||||
.prom{}, .runID = runID, .taskName = taskName, .runnerURL = cit->first};
|
||||
|
||||
auto fut = rt.prom.get_future();
|
||||
|
||||
std::lock_guard<std::mutex> lock(rtGuard_);
|
||||
runningTasks_.emplace(std::make_pair(runID, taskName), std::move(rt));
|
||||
|
||||
return fut;
|
||||
}
|
||||
|
||||
bool DaggyRunnerTaskExecutor::stop(DAGRunID runID, const std::string &taskName)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
void DaggyRunnerTaskExecutor::addRunner(const std::string &url)
|
||||
{
|
||||
runners_.insert(url);
|
||||
}
|
||||
|
||||
void DaggyRunnerTaskExecutor::monitor()
|
||||
{
|
||||
while (running_) {
|
||||
{
|
||||
std::vector<std::pair<DAGRunID, std::string>> resolvedJobs;
|
||||
|
||||
std::lock_guard<std::mutex> lock(rtGuard_);
|
||||
for (auto &[taskID, task] : runningTasks_) {
|
||||
try {
|
||||
const auto &[code, json] = JSON_HTTP_REQUEST(
|
||||
task.runnerURL + "/v1/task/" + std::to_string(taskID.first) +
|
||||
"/" + taskID.second);
|
||||
if (code != HTTPCode::Ok) {
|
||||
AttemptRecord record{
|
||||
.rc = -1, .executorLog = "Unable to query runner for progress"};
|
||||
task.prom.set_value(std::move(record));
|
||||
resolvedJobs.emplace_back(taskID);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (json["state"] == "COMPLETED") {
|
||||
task.prom.set_value(attemptRecordFromJSON(json["attempt"]));
|
||||
resolvedJobs.emplace_back(taskID);
|
||||
}
|
||||
}
|
||||
catch (std::runtime_error &e) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (const auto &tid : resolvedJobs) {
|
||||
runningTasks_.extract(tid);
|
||||
}
|
||||
}
|
||||
std::this_thread::sleep_for(std::chrono::seconds(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user