Checkpointing work

This commit is contained in:
Ian Roddis
2022-01-12 12:50:46 -04:00
parent 04e95cfcf3
commit 9a5a247f15
21 changed files with 320 additions and 160 deletions

View File

@@ -9,8 +9,8 @@
#include <daggy/executors/task/DaggyRunnerTaskExecutor.hpp>
#include <daggy/executors/task/ForkingTaskExecutor.hpp>
#include <daggy/loggers/dag_run/DAGRunLogger.hpp>
#include <deque>
#include <filesystem>
#include <list>
#define DAGGY_REST_HANDLER(func) \
void func(const Pistache::Rest::Request &request, \
@@ -58,6 +58,8 @@ namespace daggy::daggyr {
executors::task::ForkingTaskExecutor executor_;
using TaskID = std::pair<DAGRunID, std::string>;
struct TaskRecord
{
RunState state;
@@ -68,17 +70,21 @@ namespace daggy::daggyr {
Capacity maxCapacity_;
Capacity curCapacity_;
std::mutex pendingGuard_;
struct PendingJob
{
DAGRunID runID;
std::string taskName;
std::future<AttemptRecord> fut;
daggy::executors::task::TaskFuture fut;
Capacity resourcesUsed;
bool resolved;
};
std::list<PendingJob> pending_;
void monitor();
std::atomic<bool> running_;
std::thread monitorWorker_;
std::mutex pendingGuard_;
std::unordered_map<TaskID, PendingJob> pending_;
std::mutex resolvedGuard_;
std::deque<std::string> resolved_;
};
} // namespace daggy::daggyr

View File

@@ -38,6 +38,8 @@ namespace daggy::daggyr {
, executor_(maxCores)
, maxCapacity_{maxCores, maxMemoryMB}
, curCapacity_{maxCores, maxMemoryMB}
, running_(true)
, monitorWorker_(&Server::monitor, this)
{
}
@@ -63,6 +65,8 @@ namespace daggy::daggyr {
// Stops the HTTP endpoint, then signals the background monitor thread to exit
// and waits for it. monitor() re-checks running_ only once per second (it
// sleeps between scans), so join() may block for up to ~1s.
void Server::shutdown()
{
    endpoint_.shutdown();
    running_ = false;
    // Guard the join: calling join() on a thread that is not joinable (e.g.
    // shutdown() invoked twice) is undefined behavior / throws
    // std::system_error. joinable() makes shutdown idempotent.
    if (monitorWorker_.joinable())
        monitorWorker_.join();
}
uint16_t Server::getPort() const
@@ -154,63 +158,92 @@ namespace daggy::daggyr {
{
std::lock_guard<std::mutex> lock(pendingGuard_);
pending_.push_back(
PendingJob{.runID = runID,
.taskName = taskName,
.fut = executor_.execute(runID, taskName, task),
.resourcesUsed = resourcesUsed});
pending_.emplace(std::make_pair(runID, taskName),
PendingJob{
.fut = executor_.execute(runID, taskName, task),
.resourcesUsed = resourcesUsed,
});
}
response.send(Pistache::Http::Code::Ok, "");
}
// Background worker loop: once per second, scans pending_ for task futures
// that have completed, removes them from pending_, renders each completed
// attempt as a JSON payload, and appends the payloads to resolved_ for
// handlePollTasks to drain.
//
// NOTE(review): each PendingJob carries resourcesUsed, but nothing here adds
// it back to curCapacity_ when the job completes (the previous inline polling
// path did, under capacityGuard_) — confirm capacity is released elsewhere or
// this permanently leaks scheduling capacity.
// NOTE(review): pending_ is keyed by TaskID = std::pair<DAGRunID, std::string>;
// std::unordered_map requires a std::hash specialization for that key type —
// confirm one is defined elsewhere in the project.
// NOTE(review): job.fut->get() runs while pendingGuard_ is held, so a slow
// get() blocks handleExecuteTask's enqueue path for its duration.
void Server::monitor()
{
    // Reused across iterations; cleared each pass instead of reconstructed.
    std::unordered_map<TaskID, AttemptRecord> resolved;
    while (running_) {
        resolved.clear();
        std::vector<TaskID> resolvedIDs;
        {
            // Collect finished futures and remember their IDs so the map
            // entries can be removed after iteration (no erase-while-ranging).
            std::lock_guard<std::mutex> lock(pendingGuard_);
            for (const auto &[tid, job] : pending_) {
                if (job.fut->ready()) {
                    resolved.emplace(tid, job.fut->get());
                    resolvedIDs.push_back(tid);
                }
            }
            for (const auto &tid : resolvedIDs) {
                pending_.extract(tid);
            }
        }
        // Build the JSON payloads outside any lock.
        std::unordered_map<TaskID, std::string> payloads;
        for (const auto &[tid, attempt] : resolved) {
            std::stringstream ss;
            ss << R"({ "runID": )" << tid.first << R"(, "taskName": )"
               << std::quoted(tid.second) << ", "
               << R"("state": "COMPLETED", "attempt":)"
               << attemptRecordToJSON(attempt) << "}";
            payloads.emplace(tid, ss.str());
        }
        {
            // Publish to the queue drained by handlePollTasks.
            std::lock_guard<std::mutex> lock(resolvedGuard_);
            for (const auto &[_, item] : payloads)
                resolved_.push_back(item);
        }
        // Poll cadence; also bounds how quickly shutdown()'s running_ = false
        // is observed.
        std::this_thread::sleep_for(std::chrono::seconds(1));
    }
}
// NOTE(review): this function body appears to be a rendered diff with the
// +/- markers stripped — lines from the old implementation (iterator walk
// over pending_, per-task fut.wait_for(1ms) polling, capacity restoration
// under capacityGuard_) are interleaved with the new implementation (drain
// the pre-rendered resolved_ queue under resolvedGuard_, timing log via
// Clock::now()). As written the braces/else clauses do not pair and the
// span is not syntactically valid C++. Do not edit in place — recover the
// intended post-change version from version control before further review.
void Server::handlePollTasks(const Pistache::Rest::Request &request,
Pistache::Http::ResponseWriter response)
{
if (!handleAuth(request))
return;
auto ss = Clock::now();
std::stringstream payload;
payload << "[";
bool first = true;
// Check to see if it's pending
std::lock_guard<std::mutex> lock(pendingGuard_);
auto it = pending_.begin();
while (it != pending_.end()) {
if (first) {
first = false;
}
else {
payload << ", ";
}
payload << R"({ "runID": )" << it->runID << R"(, "taskName": )"
<< std::quoted(it->taskName) << ", ";
// poll it
if (it->fut.valid() and
it->fut.wait_for(1ms) == std::future_status::ready) {
auto attempt = it->fut.get();
payload << R"("state": "COMPLETED", "attempt":)"
<< attemptRecordToJSON(attempt);
{
std::lock_guard<std::mutex> rlock(capacityGuard_);
curCapacity_.cores += it->resourcesUsed.cores;
curCapacity_.memoryMB += it->resourcesUsed.memoryMB;
size_t cnt = 0;
{
std::lock_guard<std::mutex> lock(resolvedGuard_);
cnt = resolved_.size();
for (const auto &item : resolved_) {
if (first) {
first = false;
}
it = pending_.erase(it);
else {
payload << ", ";
}
payload << item;
}
else {
payload << R"("state": "PENDING")";
++it;
}
payload << "}";
resolved_.clear();
}
payload << "]";
response.send(Pistache::Http::Code::Ok, payload.str());
auto payloadStr = payload.str();
response.send(Pistache::Http::Code::Ok, payloadStr);
auto ee = Clock::now();
std::cout
<< "Completed request: with " << cnt << " updates in"
<< " total ("
<< std::chrono::duration_cast<std::chrono::nanoseconds>(ee - ss).count()
<< " ns)\n";
}
void Server::handleStopTask(const Pistache::Rest::Request &request,