Squashed commit of the following: commit 69d5ef7a256b86a86d46e5ae374c00fded1497ea Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 12:15:55 2021 -0400 Updating readme commit 94a9f676d0f9cc0b55cdc18c4927eaea40d82c77 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 12:05:36 2021 -0400 Fixing serialization of attempt records when querying entire dag commit 945e5f90b24abf07c9af1bc4c6bbcb33e93b8069 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 11:37:59 2021 -0400 Compiles cleanly... commit 8b23e46081d47fb80dc1a2d998fc6dc4bbf301a8 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 10:43:03 2021 -0400 Adding in missing source file to cmake build list commit 6d10d9791206e2bc15788beadeea580b8e43a853 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 10:41:43 2021 -0400 Adding new executors commit 42a2c67f4d6ae99df95d917c8621d78cd99837a1 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 10:27:14 2021 -0400 Fixing missing curl cmake dependency commit 394bc4c5d51ecee7bf14712f719c8bf7e97fb0fa Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 10:21:58 2021 -0400 Fixing missing curl cmake dependency commit dd9efc8e7e7770ea1bcbccb70a1af9cfcff0414c Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 17:15:38 2021 -0400 Checkpointing progress commit 3b3b55d6037bb96e46de6763f486f4ecb92fe6a0 Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 14:21:18 2021 -0400 updating readme commit 303027c11452941b2a0c0d1b04ac5942e79efd74 Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 14:17:16 2021 -0400 Namespacing daggyd Adding more error checking around deserialization of parameters Adding tests for runner agent commit c592eaeba12e2a449bae401e8c1d9ed236416d52 Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 11:20:21 2021 -0400 Checkpointing work commit fb1862d1cefe2b53a98659cce3c8c73d88bf5d84 Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 09:52:29 2021 -0400 Copying daggyd for daggyr template, adding in basic routes
228 lines
6.4 KiB
C++
228 lines
6.4 KiB
C++
#include <daggy/Serialization.hpp>
|
|
#include <daggy/Utilities.hpp>
|
|
#include <daggy/executors/task/DaggyRunnerTaskExecutor.hpp>
|
|
#include <daggy/executors/task/ForkingTaskExecutor.hpp>
|
|
#include <iomanip>
|
|
|
|
using namespace daggy::executors::task;
|
|
using namespace daggy::executors::task::daggy_runner;
|
|
using namespace daggy;
|
|
|
|
namespace daggy::executors::task::daggy_runner {
|
|
std::string capacityToJSON(const Capacity &cap)
|
|
{
|
|
return R"({ "cores": )" + std::to_string(cap.cores) + R"(, "memoryMB": )" +
|
|
std::to_string(cap.memoryMB) + "}";
|
|
}
|
|
|
|
// Builds a Capacity from a JSON object.
//
// Both "cores" and "memoryMB" are optional; a member that is absent leaves
// the corresponding field at 0.
//
// Throws std::runtime_error when `spec` is not an object, or when a present
// member is not an integer representable as a signed 64-bit value.
Capacity capacityFromJSON(const rj::Value &spec)
{
  if (!spec.IsObject()) {
    throw std::runtime_error("Capacity is not an object");
  }

  Capacity cap{.cores = 0, .memoryMB = 0};

  // Reads one optional integer member into `out`.
  // IsInt64() is checked (rather than IsNumber()) because GetInt64() is only
  // valid for values that carry the int64 flag; a fractional number would
  // pass IsNumber() and then trip rapidjson's internal assertion.
  const auto readInteger = [&spec](const char *name, auto &out) {
    const auto member = spec.FindMember(name);
    if (member == spec.MemberEnd()) {
      return;
    }
    if (!member->value.IsInt64()) {
      throw std::runtime_error(std::string{name} +
                               " member of Capacity is not an integer");
    }
    out = member->value.GetInt64();
  };

  readInteger("cores", cap.cores);
  readInteger("memoryMB", cap.memoryMB);

  return cap;
}
|
|
|
|
// Extracts the capacity a task asks for from its job parameters.
//
// Reads the "cores" and "memoryMB" entries of task.job, which must both be
// present and hold strings parseable by std::stoll; otherwise the exception
// raised by at(), std::get or std::stoll propagates to the caller.
Capacity capacityFromTask(const Task &task)
{
  const long long requestedCores =
      std::stoll(std::get<std::string>(task.job.at("cores")));
  const long long requestedMemoryMB =
      std::stoll(std::get<std::string>(task.job.at("memoryMB")));

  Capacity requested{.cores = 0, .memoryMB = 0};
  requested.cores = requestedCores;
  requested.memoryMB = requestedMemoryMB;
  return requested;
}
|
|
|
|
// Validates the job parameters needed by the runner executor.
//
// First applies forking_executor::validateTaskParameters, then requires that
// "cores" and "memoryMB" are both present, hold strings, and that each string
// is accepted by std::stoll.
//
// Throws std::runtime_error describing the first problem found.
void validateTaskParameters(const daggy::ConfigValues &job)
{
  forking_executor::validateTaskParameters(job);

  static constexpr const char *kRequiredFields[] = {"cores", "memoryMB"};

  for (const char *name : kRequiredFields) {
    const std::string field{name};

    const auto entry = job.find(field);
    if (entry == job.end()) {
      throw std::runtime_error("Missing required job parameter " + field);
    }

    const auto *text = std::get_if<std::string>(&entry->second);
    if (text == nullptr) {
      throw std::runtime_error(field + " in capacity is not a string");
    }

    // Only the parse outcome matters here; the value itself is discarded.
    try {
      std::stoll(*text);
    }
    catch (const std::exception &) {
      throw std::runtime_error(field + " in capacity is not an integer");
    }
  }
}
|
|
} // namespace daggy::executors::task::daggy_runner
|
|
|
|
DaggyRunnerTaskExecutor::DaggyRunnerTaskExecutor()
|
|
: running_(true)
|
|
, monitorWorker_(&DaggyRunnerTaskExecutor::monitor, this)
|
|
{
|
|
}
|
|
|
|
// Signals the monitor loop to stop and waits for the worker thread to exit.
DaggyRunnerTaskExecutor::~DaggyRunnerTaskExecutor()
{
  running_ = false;

  // join() on a non-joinable thread throws std::system_error, which would
  // terminate the process when it escapes a destructor; guard against it.
  if (monitorWorker_.joinable()) {
    monitorWorker_.join();
  }
}
|
|
|
|
// Validates the job to ensure that all required values are set and are of
|
|
// the right type,
|
|
bool DaggyRunnerTaskExecutor::validateTaskParameters(const ConfigValues &job)
|
|
{
|
|
daggy_runner::validateTaskParameters(job);
|
|
|
|
return true;
|
|
}
|
|
|
|
std::vector<ConfigValues> DaggyRunnerTaskExecutor::expandTaskParameters(
|
|
const ConfigValues &job, const ConfigValues &expansionValues)
|
|
{
|
|
std::vector<ConfigValues> newValues;
|
|
|
|
auto command =
|
|
(job.count("command") == 0 ? Command{}
|
|
: std::get<Command>(job.at("command")));
|
|
|
|
auto environment = (job.count("environment") == 0
|
|
? Command{}
|
|
: std::get<Command>(job.at("environment")));
|
|
|
|
Command both(command);
|
|
std::copy(environment.begin(), environment.end(), std::back_inserter(both));
|
|
|
|
for (const auto &parts : interpolateValues(both, expansionValues)) {
|
|
ConfigValues newCommand{job};
|
|
newCommand["command"] =
|
|
Command(parts.begin(), parts.begin() + command.size());
|
|
newCommand["environment"] =
|
|
Command(parts.begin() + command.size(), parts.end());
|
|
newValues.emplace_back(newCommand);
|
|
}
|
|
|
|
return newValues;
|
|
}
|
|
|
|
// Runs the task
|
|
std::future<AttemptRecord> DaggyRunnerTaskExecutor::execute(
|
|
DAGRunID runID, const std::string &taskName, const Task &task)
|
|
{
|
|
auto taskUsed = capacityFromTask(task);
|
|
|
|
// Get the capacities for all the runners
|
|
// Capacities for a runner can be negative, meaning that they're currently
|
|
// oversubscribed.
|
|
std::vector<std::pair<std::string, double>> impacts;
|
|
for (const auto &runner : runners_) {
|
|
try {
|
|
const auto &[code, doc] = JSON_HTTP_REQUEST(runner + "/v1/capacity");
|
|
if (code != HTTPCode::Ok) {
|
|
continue;
|
|
}
|
|
|
|
auto curCap = capacityFromJSON(doc["current"]);
|
|
auto totCap = capacityFromJSON(doc["total"]);
|
|
|
|
ssize_t cores = curCap.cores < 0 ? totCap.cores : curCap.cores;
|
|
ssize_t memoryMB =
|
|
curCap.memoryMB < 0 ? totCap.memoryMB : curCap.memoryMB;
|
|
|
|
double impact =
|
|
std::max(taskUsed.cores / cores, taskUsed.memoryMB / memoryMB);
|
|
impacts.emplace_back(runner, impact);
|
|
}
|
|
catch (const std::exception &_) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if (impacts.empty())
|
|
throw std::runtime_error("No runners available for execution");
|
|
|
|
auto cit = impacts.begin();
|
|
for (auto it = impacts.begin(); it != impacts.end(); ++it) {
|
|
if (it->second < cit->second)
|
|
cit = it;
|
|
}
|
|
|
|
RunningTask rt{
|
|
.prom{}, .runID = runID, .taskName = taskName, .runnerURL = cit->first};
|
|
|
|
auto fut = rt.prom.get_future();
|
|
|
|
std::lock_guard<std::mutex> lock(rtGuard_);
|
|
runningTasks_.emplace(std::make_pair(runID, taskName), std::move(rt));
|
|
|
|
return fut;
|
|
}
|
|
|
|
bool DaggyRunnerTaskExecutor::stop(DAGRunID runID, const std::string &taskName)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
void DaggyRunnerTaskExecutor::addRunner(const std::string &url)
|
|
{
|
|
runners_.insert(url);
|
|
}
|
|
|
|
void DaggyRunnerTaskExecutor::monitor()
|
|
{
|
|
while (running_) {
|
|
{
|
|
std::vector<std::pair<DAGRunID, std::string>> resolvedJobs;
|
|
|
|
std::lock_guard<std::mutex> lock(rtGuard_);
|
|
for (auto &[taskID, task] : runningTasks_) {
|
|
try {
|
|
const auto &[code, json] = JSON_HTTP_REQUEST(
|
|
task.runnerURL + "/v1/task/" + std::to_string(taskID.first) +
|
|
"/" + taskID.second);
|
|
if (code != HTTPCode::Ok) {
|
|
AttemptRecord record{
|
|
.rc = -1, .executorLog = "Unable to query runner for progress"};
|
|
task.prom.set_value(std::move(record));
|
|
resolvedJobs.emplace_back(taskID);
|
|
continue;
|
|
}
|
|
|
|
if (json["state"] == "COMPLETED") {
|
|
task.prom.set_value(attemptRecordFromJSON(json["attempt"]));
|
|
resolvedJobs.emplace_back(taskID);
|
|
}
|
|
}
|
|
catch (std::runtime_error &e) {
|
|
continue;
|
|
}
|
|
|
|
for (const auto &tid : resolvedJobs) {
|
|
runningTasks_.extract(tid);
|
|
}
|
|
}
|
|
std::this_thread::sleep_for(std::chrono::seconds(1));
|
|
}
|
|
}
|
|
}
|