Checkpointing work

This commit is contained in:
Ian Roddis
2022-01-12 12:50:46 -04:00
parent 04e95cfcf3
commit 9a5a247f15
21 changed files with 320 additions and 160 deletions

View File

@@ -38,6 +38,8 @@ namespace daggy::daggyr {
, executor_(maxCores)
, maxCapacity_{maxCores, maxMemoryMB}
, curCapacity_{maxCores, maxMemoryMB}
, running_(true)
, monitorWorker_(&Server::monitor, this)
{
}
@@ -63,6 +65,8 @@ namespace daggy::daggyr {
// Gracefully stop the server: close the HTTP endpoint, signal the
// background monitor loop to exit, then wait for the monitor thread.
//
// Order matters: running_ must be cleared before join(), otherwise the
// monitor loop (which tests running_ once per second) never terminates
// and join() blocks forever.
//
// NOTE(review): running_ is written here and read by monitorWorker_'s
// thread — presumably it is declared std::atomic<bool>; if it is a plain
// bool this is a data race. Verify the member declaration.
void Server::shutdown()
{
// Stop accepting new HTTP requests first so no handler can enqueue work
// after the monitor thread is gone.
endpoint_.shutdown();
running_ = false;
// Worst case this waits ~1s (the monitor's sleep interval).
monitorWorker_.join();
}
uint16_t Server::getPort() const
@@ -154,63 +158,92 @@ namespace daggy::daggyr {
{
std::lock_guard<std::mutex> lock(pendingGuard_);
pending_.push_back(
PendingJob{.runID = runID,
.taskName = taskName,
.fut = executor_.execute(runID, taskName, task),
.resourcesUsed = resourcesUsed});
pending_.emplace(std::make_pair(runID, taskName),
PendingJob{
.fut = executor_.execute(runID, taskName, task),
.resourcesUsed = resourcesUsed,
});
}
response.send(Pistache::Http::Code::Ok, "");
}
void Server::monitor()
{
std::unordered_map<TaskID, AttemptRecord> resolved;
while (running_) {
resolved.clear();
std::vector<TaskID> resolvedIDs;
{
std::lock_guard<std::mutex> lock(pendingGuard_);
for (const auto &[tid, job] : pending_) {
if (job.fut->ready()) {
resolved.emplace(tid, job.fut->get());
resolvedIDs.push_back(tid);
}
}
for (const auto &tid : resolvedIDs) {
pending_.extract(tid);
}
}
std::unordered_map<TaskID, std::string> payloads;
for (const auto &[tid, attempt] : resolved) {
std::stringstream ss;
ss << R"({ "runID": )" << tid.first << R"(, "taskName": )"
<< std::quoted(tid.second) << ", "
<< R"("state": "COMPLETED", "attempt":)"
<< attemptRecordToJSON(attempt) << "}";
payloads.emplace(tid, ss.str());
}
{
std::lock_guard<std::mutex> lock(resolvedGuard_);
for (const auto &[_, item] : payloads)
resolved_.push_back(item);
}
std::this_thread::sleep_for(std::chrono::seconds(1));
}
}
// GET handler: drain the completion payloads queued by monitor() and send
// them back as a JSON array. Completion detection now lives entirely in
// monitor(); this handler no longer walks pending_ or touches futures.
//
// The source span for this function was a corrupted interleaving of the
// old (poll-pending_) and new (drain-resolved_) implementations; this is
// the coherent new implementation reconstructed from the added lines.
void Server::handlePollTasks(const Pistache::Rest::Request &request,
                             Pistache::Http::ResponseWriter response)
{
    if (!handleAuth(request))
        return;

    auto ss = Clock::now();

    // Take the whole batch under the lock, then format outside it so the
    // critical section stays tiny (monitor() contends on resolvedGuard_).
    decltype(resolved_) drained;
    {
        std::lock_guard<std::mutex> lock(resolvedGuard_);
        drained = std::move(resolved_);
        resolved_.clear(); // moved-from: guarantee an empty queue for the next batch
    }
    const size_t cnt = drained.size();

    // Each queued item is already a rendered JSON object; just join them.
    std::stringstream payload;
    payload << "[";
    bool first = true;
    for (const auto &item : drained) {
        if (first) {
            first = false;
        }
        else {
            payload << ", ";
        }
        payload << item;
    }
    payload << "]";

    auto payloadStr = payload.str();
    response.send(Pistache::Http::Code::Ok, payloadStr);

    auto ee = Clock::now();
    std::cout
        << "Completed request: with " << cnt << " updates in"
        << " total ("
        << std::chrono::duration_cast<std::chrono::nanoseconds>(ee - ss).count()
        << " ns)\n";
}
void Server::handleStopTask(const Pistache::Rest::Request &request,