Simplifying daggyr server, and returning to a task submit / task poll model.

Squashed commit of the following:

commit 0ef57f095d15f0402915de54f83c1671120bd228
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Feb 2 08:18:03 2022 -0400

    Simplifying task polling and reducing lock scopes

commit d77ef02021cc728849c7d1fb0185dd1a861b4a3d
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Feb 2 08:02:47 2022 -0400

    Simplifying check

commit c1acf34440162abb890a959f3685c2d184242ed5
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Feb 2 08:01:13 2022 -0400

    Removing capacity tracking from runner, since it is maintained in daggyd

commit 9401246f92113ab140143c1895978b9de8bd9972
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Feb 2 07:47:28 2022 -0400

    Adding retry for submission

commit 398aa04a320347bb35f23f3f101d91ab4df25652
Author: Ian Roddis <tech@kinesin.ca>
Date:   Tue Feb 1 14:54:20 2022 -0400

    Adding in execution note, as well as requeuing the result if the peer disconnects

commit 637b14af6d5b53f25b9c38d4c8a7ed8532af5599
Author: Ian Roddis <tech@kinesin.ca>
Date:   Tue Feb 1 14:13:59 2022 -0400

    Fixing locking issues

commit 4d6716dfda8aa7f51e0abbdab833aff618915ba0
Author: Ian Roddis <tech@kinesin.ca>
Date:   Tue Feb 1 13:33:14 2022 -0400

    Single task daggyr working

commit bd48a5452a92817faf25ee44a6115aaa2f6c30d1
Author: Ian Roddis <tech@kinesin.ca>
Date:   Tue Feb 1 12:22:04 2022 -0400

    Checkpointing work
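The new handlePollTask in the diff below extracts the finished entry from runningTasks_ as a map node, sends it, and reinserts the node if the peer disconnected before the response went out. A minimal standalone sketch of that extract/send/reinsert pattern follows; it is not the server's code. std::map stands in for the server's unordered_map (it avoids defining a pair hash), and Result and trySend() are simplified stand-ins for the AttemptRecord JSON and Pistache's response promise:

    #include <cstddef>
    #include <map>
    #include <mutex>
    #include <string>
    #include <utility>

    using TaskID = std::pair<std::size_t, std::string>; // (runID, taskName)
    struct Result { std::string json; };                 // stand-in payload

    std::mutex rtGuard;
    std::map<TaskID, Result> runningTasks;

    // Stand-in for response.send(); returns false if the peer disconnected.
    bool trySend(const Result &) { return true; }

    // Extract the finished entry under the lock, send outside the lock, and
    // reinsert the node if delivery failed so a later poll can retry.
    bool pollTask(const TaskID &tid) {
        std::map<TaskID, Result>::node_type node;
        {
            std::lock_guard<std::mutex> lock(rtGuard);
            auto it = runningTasks.find(tid);
            if (it == runningTasks.end())
                return false; // nothing ready: the real handler answers 404
            node = runningTasks.extract(it);
        }
        if (!trySend(node.mapped())) {
            std::lock_guard<std::mutex> lock(rtGuard);
            runningTasks.insert(std::move(node)); // requeue for the next poll
        }
        return true;
    }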
@@ -48,7 +48,7 @@ namespace daggy::daggyr {
         DAGGY_REST_HANDLER(handleReady);
         DAGGY_REST_HANDLER(handleGetCapacity);
         DAGGY_REST_HANDLER(handleRunTask);
-        DAGGY_REST_HANDLER(handlePollTasks);
+        DAGGY_REST_HANDLER(handlePollTask);
         DAGGY_REST_HANDLER(handleStopTask);
         DAGGY_REST_HANDLER(handleValidateTask);
 
@@ -59,33 +59,9 @@ namespace daggy::daggyr {
         executors::task::ForkingTaskExecutor executor_;
 
         using TaskID = std::pair<DAGRunID, std::string>;
-
-        struct TaskRecord
-        {
-            RunState state;
-            AttemptRecord attempt;
-        };
-
-        std::mutex capacityGuard_;
         Capacity maxCapacity_;
-        Capacity curCapacity_;
-
-        struct PendingJob
-        {
-            daggy::executors::task::TaskFuture fut;
-            Capacity resourcesUsed;
-            bool resolved;
-        };
-
-        std::mutex resolvedGuard_;
-        std::string resolved_;
-        size_t nResolved_;
-
-        void monitor();
-        std::atomic<bool> running_;
-        std::thread monitorWorker_;
-
-        std::mutex pendingGuard_;
-        std::unordered_map<TaskID, PendingJob> pending_;
+        std::mutex rtGuard_;
+        std::unordered_map<TaskID, daggy::executors::task::TaskFuture>
+            runningTasks_;
     };
 } // namespace daggy::daggyr
@@ -37,11 +37,6 @@ namespace daggy::daggyr {
         , desc_("Daggy Runner API", "0.1")
         , executor_(maxCores)
         , maxCapacity_{maxCores, maxMemoryMB}
-        , curCapacity_{maxCores, maxMemoryMB}
-        , resolved_("[")
-        , nResolved_(0)
-        , running_(true)
-        , monitorWorker_(&Server::monitor, this)
     {
     }
 
@@ -67,8 +62,6 @@ namespace daggy::daggyr {
     void Server::shutdown()
     {
         endpoint_.shutdown();
-        running_ = false;
-        monitorWorker_.join();
     }
 
     uint16_t Server::getPort() const
@@ -102,8 +95,8 @@ namespace daggy::daggyr {
             .produces(MIME(Application, Json))
             .response(Http::Code::Ok, "Run a task");
 
-        versionPath.route(desc_.get("/poll"))
-            .bind(&Server::handlePollTasks, this)
+        versionPath.route(desc_.get("/task/:runID/:taskName"))
+            .bind(&Server::handlePollTask, this)
             .produces(MIME(Application, Json))
             .response(
                 Http::Code::Ok,
@@ -152,87 +145,53 @@ namespace daggy::daggyr {
             REQ_RESPONSE(Not_Acceptable, e.what());
         }
 
-        {
-            std::lock_guard<std::mutex> lock(capacityGuard_);
-            curCapacity_.cores -= resourcesUsed.cores;
-            curCapacity_.memoryMB -= resourcesUsed.memoryMB;
-        }
+        auto tid = std::make_pair(runID, taskName);
+        auto fut = executor_.execute(runID, taskName, task);
 
         {
-            std::lock_guard<std::mutex> lock(pendingGuard_);
-            pending_.emplace(std::make_pair(runID, taskName),
-                             PendingJob{
-                                 .fut = executor_.execute(runID, taskName, task),
-                                 .resourcesUsed = resourcesUsed,
-                             });
+            std::lock_guard<std::mutex> lock(rtGuard_);
+            runningTasks_.emplace(std::move(tid), std::move(fut));
         }
 
         response.send(Pistache::Http::Code::Ok, "");
     }
 
-    void Server::monitor()
-    {
-        std::unordered_map<TaskID, AttemptRecord> resolved;
-        while (running_) {
-            resolved.clear();
-            std::vector<TaskID> resolvedIDs;
-            {
-                std::lock_guard<std::mutex> lock(pendingGuard_);
-                for (const auto &[tid, job] : pending_) {
-                    if (job.fut->ready()) {
-                        resolved.emplace(tid, job.fut->get());
-                        resolvedIDs.push_back(tid);
-                    }
-                }
-
-                for (const auto &tid : resolvedIDs) {
-                    pending_.extract(tid);
-                }
-            }
-
-            std::unordered_map<TaskID, std::string> payloads;
-            for (const auto &[tid, attempt] : resolved) {
-                std::stringstream ss;
-                ss << R"({ "runID": )" << tid.first << R"(, "taskName": )"
-                   << std::quoted(tid.second) << ", "
-                   << R"("state": "COMPLETED", "attempt":)"
-                   << attemptRecordToJSON(attempt) << "}";
-                payloads.emplace(tid, ss.str());
-            }
-
-            {
-                std::lock_guard<std::mutex> lock(resolvedGuard_);
-                for (const auto &[_, item] : payloads) {
-                    if (resolved_.empty()) {
-                        resolved_ = "[";
-                    }
-
-                    if (nResolved_ > 0)
-                        resolved_ += ',';
-                    resolved_ += item;
-                    ++nResolved_;
-                }
-            }
-
-            std::this_thread::sleep_for(std::chrono::seconds(1));
-        }
-    }
 
-    void Server::handlePollTasks(const Pistache::Rest::Request &request,
-                                 Pistache::Http::ResponseWriter response)
+    void Server::handlePollTask(const Pistache::Rest::Request &request,
+                                Pistache::Http::ResponseWriter response)
     {
         if (!handleAuth(request))
             return;
-        std::string payload = "[";
-        payload.reserve(65536);
-        {
-            std::lock_guard<std::mutex> lock(resolvedGuard_);
-            payload.swap(resolved_);
-            nResolved_ = 0;
-        }
-        payload += "]";
-
-        response.send(Pistache::Http::Code::Ok, payload);
+        auto runID = request.param(":runID").as<DAGRunID>();
+        auto taskName = request.param(":taskName").as<std::string>();
+
+        auto taskID = std::make_pair(runID, taskName);
+        std::unordered_map<TaskID, daggy::executors::task::TaskFuture>::node_type
+            node;
+        bool notFound = false;
+        {
+            std::lock_guard<std::mutex> lock(rtGuard_);
+            auto it = runningTasks_.find(taskID);
+            if (it == runningTasks_.end() || !it->second->ready()) {
+                notFound = true;
+            }
+            else {
+                node = runningTasks_.extract(taskID);
+            }
+        }
+
+        if (notFound) {
+            response.send(Pistache::Http::Code::Not_Found, "");
+            return;
+        }
+
+        auto prom = response.send(Pistache::Http::Code::Ok,
+                                  attemptRecordToJSON(node.mapped()->get()));
+        // If the promise fails, then reinsert the result for later polling
+        if (prom.isRejected()) {
+            std::lock_guard<std::mutex> lock(rtGuard_);
+            runningTasks_.insert(std::move(node));
+        }
     }
 
     void Server::handleStopTask(const Pistache::Rest::Request &request,
@@ -252,14 +211,7 @@ namespace daggy::daggyr {
     void Server::handleGetCapacity(const Pistache::Rest::Request &request,
                                    Pistache::Http::ResponseWriter response)
     {
-        std::string payload;
-        {
-            std::lock_guard<std::mutex> lock(capacityGuard_);
-            payload = R"({ "current": )" + capacityToJSON(curCapacity_) +
-                      R"(, "total": )" + capacityToJSON(maxCapacity_) + "}";
-        }
-
-        response.send(Pistache::Http::Code::Ok, payload);
+        response.send(Pistache::Http::Code::Ok, capacityToJSON(maxCapacity_));
     }
 
     void Server::handleReady(const Pistache::Rest::Request &request,
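A note on the "reducing lock scopes" commit: throughout the new code each std::lock_guard is confined to its own block, so the mutex is released before any slow work (executing a task, serializing JSON, sending a response) runs. A minimal sketch of that idiom, with guard, shared, and slowWork() as illustrative names rather than anything from the diff:

    #include <mutex>
    #include <vector>

    std::mutex guard;
    std::vector<int> shared;

    void slowWork(int) { /* e.g. executing a task or sending a response */ }

    void drainOne() {
        int value = 0;
        bool have = false;
        {   // the guard is held only long enough to copy the item out
            std::lock_guard<std::mutex> lock(guard);
            if (!shared.empty()) {
                value = shared.back();
                shared.pop_back();
                have = true;
            }
        }
        if (have)
            slowWork(value); // runs outside the critical section
    }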