Simplifying the daggyr server and returning to a task submit / task poll model.
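
For context, the submit / poll flow this returns to looks roughly like the
sketch below. This is only an illustration, not daggyr code: the endpoint
shape (POST, then GET, on /v1/task/<runID>/<taskName>) and the 200 / 250ms /
2s values are taken from the diff further down, while the runner URL, the
example payload, and the request() helper are invented for the sketch.

#include <curl/curl.h>

#include <chrono>
#include <iostream>
#include <string>
#include <thread>

// Collect the response body into a std::string.
static size_t collect(char *data, size_t size, size_t nmemb, void *out)
{
  static_cast<std::string *>(out)->append(data, size * nmemb);
  return size * nmemb;
}

// Tiny illustrative request helper: returns the HTTP status code and appends
// the response body to `body`.
static long request(const std::string &url, const std::string &method,
                    const std::string &payload, std::string &body)
{
  CURL *curl = curl_easy_init();
  if (!curl)
    return -1;
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, method.c_str());
  if (!payload.empty())
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, payload.c_str());
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, collect);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &body);
  curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L);
  long code = -1;
  if (curl_easy_perform(curl) == CURLE_OK)
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);
  curl_easy_cleanup(curl);
  return code;
}

int main()
{
  const std::string runner = "http://localhost:8080";   // assumed runner URL
  const std::string task = runner + "/v1/task/1/build"; // runID 1, task "build"

  // Submit: POST the task definition, retrying until the runner accepts it.
  std::string body;
  while (request(task, "POST", R"({"command": ["true"]})", body) != 200) {
    body.clear();
    std::this_thread::sleep_for(std::chrono::milliseconds(250));
  }

  // Poll: GET the same endpoint until the attempt record comes back
  // (a non-200 here just means "not finished yet").
  for (;;) {
    body.clear();
    if (request(task, "GET", "", body) == 200)
      break;
    std::this_thread::sleep_for(std::chrono::seconds(2));
  }
  std::cout << "attempt record: " << body << "\n";
  return 0;
}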

Squashed commit of the following:

commit 0ef57f095d15f0402915de54f83c1671120bd228
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Feb 2 08:18:03 2022 -0400

    Simplifying task polling and reducing lock scopes

commit d77ef02021cc728849c7d1fb0185dd1a861b4a3d
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Feb 2 08:02:47 2022 -0400

    Simplifying check

commit c1acf34440162abb890a959f3685c2d184242ed5
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Feb 2 08:01:13 2022 -0400

    Removing capacity tracking from runner, since it is maintained in daggyd

commit 9401246f92113ab140143c1895978b9de8bd9972
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Feb 2 07:47:28 2022 -0400

    Adding retry for submission

commit 398aa04a320347bb35f23f3f101d91ab4df25652
Author: Ian Roddis <tech@kinesin.ca>
Date:   Tue Feb 1 14:54:20 2022 -0400

    Adding an execution note, and requeuing the result if the peer disconnects

commit 637b14af6d5b53f25b9c38d4c8a7ed8532af5599
Author: Ian Roddis <tech@kinesin.ca>
Date:   Tue Feb 1 14:13:59 2022 -0400

    Fixing locking issues

commit 4d6716dfda8aa7f51e0abbdab833aff618915ba0
Author: Ian Roddis <tech@kinesin.ca>
Date:   Tue Feb 1 13:33:14 2022 -0400

    Single task daggyr working

commit bd48a5452a92817faf25ee44a6115aaa2f6c30d1
Author: Ian Roddis <tech@kinesin.ca>
Date:   Tue Feb 1 12:22:04 2022 -0400

    Checkpointing work
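
The "reducing lock scopes" commit above boils down to the pattern sketched
here. This is an illustration only, not daggyr code, and every name in it is
invented: snapshot the shared map under a short lock, do the slow polling
work unlocked, then relock just long enough to drop the finished entries.

#include <mutex>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct Task {
  std::string runner;
};

std::mutex guard;                       // protects `running`
std::unordered_map<int, Task> running;  // shared with the submit path

// Stand-in for the per-task HTTP poll; returns true when the task finished.
static bool pollOnce(const Task &) { return true; }

void monitorPass()
{
  // 1. Copy what is running now; the lock is held only for the copy.
  std::vector<std::pair<int, Task>> snapshot;
  {
    std::lock_guard<std::mutex> lock(guard);
    snapshot.assign(running.begin(), running.end());
  }

  // 2. Slow network calls happen without holding the lock.
  std::vector<int> finished;
  for (const auto &[id, task] : snapshot)
    if (pollOnce(task))
      finished.push_back(id);

  // 3. Reacquire the lock briefly, only to erase completed tasks.
  if (!finished.empty()) {
    std::lock_guard<std::mutex> lock(guard);
    for (int id : finished)
      running.erase(id);
  }
}

int main()
{
  running.emplace(1, Task{"http://localhost:8080"});
  monitorPass();
  return 0;
}
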
Author: Ian Roddis
Date:   2022-02-02 21:12:05 -04:00
parent c9bfce31e6
commit 57e93b5045
6 changed files with 116 additions and 192 deletions


@@ -57,6 +57,7 @@ namespace daggy::executors::task {
  private:
  void monitor();
+ using TaskID = std::pair<DAGRunID, std::string>;
  struct RunningTask
  {
@@ -69,7 +70,6 @@ namespace daggy::executors::task {
  // Resolves jobs through polling
  std::atomic<bool> running_;
- bool promptTask_;
  std::thread monitorWorker_;
  daggy_runner::Capacity getRunnerCapacity(const std::string &runnerURL);
@@ -79,7 +79,6 @@ namespace daggy::executors::task {
  std::unordered_map<std::string, daggy_runner::Capacity> runners_;
  std::mutex rtGuard_;
- std::unordered_map<std::pair<DAGRunID, std::string>, RunningTask>
- runningTasks_;
+ std::unordered_map<TaskID, RunningTask> runningTasks_;
  };
  } // namespace daggy::executors::task


@@ -95,7 +95,7 @@ namespace daggy {
  if (!running_)
  return;
- const size_t MAX_SUBMITS = 25;
+ const size_t MAX_SUBMITS = 100;
  size_t n_submitted = 0;
  /*


@@ -234,7 +234,7 @@ namespace daggy {
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curlWriter);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);
- curl_easy_setopt(curl, CURLOPT_TIMEOUT, 20);
+ curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3);
  if (trace) {
  curl_easy_setopt(curl, CURLOPT_DEBUGFUNCTION, http_trace);
@@ -276,16 +276,20 @@ namespace daggy {
  auto response = HTTP_REQUEST(url, payload, method);
  rj::Document doc;
- try {
- checkRJParse(doc.Parse(response.body.c_str()));
- }
- catch (std::exception &e) {
- doc.SetObject();
- auto &alloc = doc.GetAllocator();
- std::string message = (response.body.empty() ? e.what() : response.body);
- doc.AddMember(
- "error",
- rj::Value().SetString(message.c_str(), message.size(), alloc), alloc);
+ if (!response.body.empty()) {
+ try {
+ checkRJParse(doc.Parse(response.body.c_str()));
+ }
+ catch (std::exception &e) {
+ doc.SetObject();
+ auto &alloc = doc.GetAllocator();
+ std::string message =
+ (response.body.empty() ? e.what() : response.body);
+ doc.AddMember(
+ "error",
+ rj::Value().SetString(message.c_str(), message.size(), alloc),
+ alloc);
+ }
  }
  return std::make_pair(response.code, std::move(doc));


@@ -78,7 +78,6 @@ namespace daggy::executors::task::daggy_runner {
  DaggyRunnerTaskExecutor::DaggyRunnerTaskExecutor()
  : running_(true)
- , promptTask_(false)
  , monitorWorker_(&DaggyRunnerTaskExecutor::monitor, this)
  {
  }
@@ -176,11 +175,21 @@ TaskFuture DaggyRunnerTaskExecutor::execute(DAGRunID runID,
  ss << exe_runner << "/v1/task/" << runID << "/" << taskName;
  auto url = ss.str();
- const auto response = HTTP_REQUEST(url, taskToJSON(task), "POST");
+ // TODO catching this failure state doesn't allow for runners
+ // dying.
+ while (true) {
+ auto response = HTTP_REQUEST(url, taskToJSON(task), "POST");
+ if (response.code == 200)
+ break;
+ std::cout << "Submitting " << taskName << " expected code 200, got "
+ << response.code << '[' << response.body << "]\n";
+ std::this_thread::sleep_for(250ms);
+ }
  RunningTask rt{.fut = std::make_shared<Future<AttemptRecord>>(),
  .runID = runID,
  .taskName = taskName,
  .runnerURL = exe_runner,
  .resources = taskUsed};
  auto fut = rt.fut;
@@ -202,10 +211,10 @@ daggy_runner::Capacity DaggyRunnerTaskExecutor::getRunnerCapacity(
  // Try and get the capacity
  const auto &[code, doc] = JSON_HTTP_REQUEST(runnerURL + "/v1/capacity");
  if (code != HTTPCode::Ok) {
- return Capacity{};
+ throw std::runtime_error("Unable to get capacity from runner " + runnerURL);
  }
- return capacityFromJSON(doc["total"]);
+ return capacityFromJSON(doc);
  }
  void DaggyRunnerTaskExecutor::addRunner(const std::string &url)
@@ -216,83 +225,67 @@ void DaggyRunnerTaskExecutor::addRunner(const std::string &url)
  void DaggyRunnerTaskExecutor::monitor()
  {
- std::unordered_map<std::string, Capacity> runners;
+ std::vector<TaskID> resolvedTasks;
+ std::vector<std::tuple<TaskID, std::string, TaskFuture, Capacity>>
+ runningTasks;
+ std::unordered_map<std::string, Capacity> returnedResources;
  while (running_) {
- std::this_thread::sleep_for(std::chrono::milliseconds(250));
- std::unordered_map<std::pair<DAGRunID, std::string>,
- std::optional<AttemptRecord>>
- resolvedJobs;
+ std::this_thread::sleep_for(2s);
+ resolvedTasks.clear();
+ runningTasks.clear();
+ returnedResources.clear();
- std::unordered_map<std::pair<DAGRunID, std::string>, Capacity>
- taskResources;
- // Cache what's running now
+ // Copy the running tasks to prevent holding the lock too long
  {
  std::lock_guard<std::mutex> lock(rtGuard_);
  for (const auto &[tid, info] : runningTasks_) {
- taskResources.emplace(tid, info.resources);
+ runningTasks.emplace_back(
+ std::make_tuple(tid, info.runnerURL, info.fut, info.resources));
  }
  }
- {
- std::lock_guard<std::mutex> lock(runnersGuard_);
- for (auto &[runnerURL, caps] : runners_) {
- rj::Document doc;
- try {
- auto [code, json] = JSON_HTTP_REQUEST(runnerURL + "/v1/poll");
- if (code != HTTPCode::Ok) {
- std::cout << "Unable to poll: " << code << ": " << dumpJSON(json)
- << std::endl;
- continue;
- }
- doc.Swap(json);
- }
- catch (std::exception &e) {
- std::cout << "Unable to poll: " << e.what() << std::endl;
+ for (const auto &[tid, runner, fut, resources] : runningTasks) {
+ rj::Document doc;
+ try {
+ std::string url =
+ runner + "/v1/task/" + std::to_string(tid.first) + "/" + tid.second;
+ auto [code, json] = JSON_HTTP_REQUEST(url);
+ if (code != HTTPCode::Ok) {
+ continue;
+ }
- if (!doc.IsArray()) {
- std::cout << "Got nonsense from poll: " << dumpJSON(doc) << std::endl;
- continue;
- }
+ doc.Swap(json);
+ }
+ catch (std::exception &e) {
+ continue;
+ }
- const auto tasks = doc.GetArray();
- for (size_t idx = 0; idx < tasks.Size(); ++idx) {
- const auto &task = tasks[idx];
- auto tid = std::make_pair(task["runID"].GetInt64(),
- task["taskName"].GetString());
- auto it = taskResources.find(tid);
- if (it != taskResources.end()) {
- caps.cores += it->second.cores;
- caps.memoryMB += it->second.memoryMB;
- }
+ auto &cap = returnedResources[runner];
+ cap.cores += resources.cores;
+ cap.memoryMB += resources.memoryMB;
- auto attempt = attemptRecordFromJSON(task["attempt"]);
- resolvedJobs.emplace(tid, attemptRecordFromJSON(task["attempt"]));
- promptTask_ = true;
- runnersCV_.notify_one();
+ auto attempt = attemptRecordFromJSON(doc);
+ attempt.executorLog += "\nExecuted on " + runner;
+ fut->set(attempt);
+ resolvedTasks.push_back(tid);
  }
+ if (!returnedResources.empty()) {
+ {
+ std::lock_guard<std::mutex> rLock(runnersGuard_);
+ for (const auto &[runner, res] : returnedResources) {
+ auto &caps = runners_[runner];
+ caps.cores += res.cores;
+ caps.memoryMB += res.memoryMB;
+ }
+ }
+ }
- std::vector<std::pair<DAGRunID, std::string>> completedTasks;
- {
+ if (!resolvedTasks.empty()) {
  std::lock_guard<std::mutex> lock(rtGuard_);
- for (auto &[taskID, task] : runningTasks_) {
- auto it = resolvedJobs.find(taskID);
- if (it == resolvedJobs.end())
- continue;
- if (it->second.has_value()) {
- // Task has completed
- task.fut->set(std::move(it->second.value()));
- completedTasks.emplace_back(taskID);
- }
- }
- for (const auto &tid : completedTasks) {
+ for (const auto &tid : resolvedTasks) {
  runningTasks_.extract(tid);
  runnersCV_.notify_one();
  }
  }
  }