Checkpointing work
This commit is contained in:
@@ -9,8 +9,8 @@
|
||||
#include <daggy/executors/task/DaggyRunnerTaskExecutor.hpp>
|
||||
#include <daggy/executors/task/ForkingTaskExecutor.hpp>
|
||||
#include <daggy/loggers/dag_run/DAGRunLogger.hpp>
|
||||
#include <deque>
|
||||
#include <filesystem>
|
||||
#include <list>
|
||||
|
||||
#define DAGGY_REST_HANDLER(func) \
|
||||
void func(const Pistache::Rest::Request &request, \
|
||||
@@ -58,6 +58,8 @@ namespace daggy::daggyr {
|
||||
|
||||
executors::task::ForkingTaskExecutor executor_;
|
||||
|
||||
using TaskID = std::pair<DAGRunID, std::string>;
|
||||
|
||||
struct TaskRecord
|
||||
{
|
||||
RunState state;
|
||||
@@ -68,17 +70,21 @@ namespace daggy::daggyr {
|
||||
Capacity maxCapacity_;
|
||||
Capacity curCapacity_;
|
||||
|
||||
std::mutex pendingGuard_;
|
||||
|
||||
struct PendingJob
|
||||
{
|
||||
DAGRunID runID;
|
||||
std::string taskName;
|
||||
std::future<AttemptRecord> fut;
|
||||
daggy::executors::task::TaskFuture fut;
|
||||
Capacity resourcesUsed;
|
||||
bool resolved;
|
||||
};
|
||||
|
||||
std::list<PendingJob> pending_;
|
||||
void monitor();
|
||||
std::atomic<bool> running_;
|
||||
std::thread monitorWorker_;
|
||||
|
||||
std::mutex pendingGuard_;
|
||||
std::unordered_map<TaskID, PendingJob> pending_;
|
||||
|
||||
std::mutex resolvedGuard_;
|
||||
std::deque<std::string> resolved_;
|
||||
};
|
||||
} // namespace daggy::daggyr
|
||||
|
||||
@@ -38,6 +38,8 @@ namespace daggy::daggyr {
|
||||
, executor_(maxCores)
|
||||
, maxCapacity_{maxCores, maxMemoryMB}
|
||||
, curCapacity_{maxCores, maxMemoryMB}
|
||||
, running_(true)
|
||||
, monitorWorker_(&Server::monitor, this)
|
||||
{
|
||||
}
|
||||
|
||||
@@ -63,6 +65,8 @@ namespace daggy::daggyr {
|
||||
void Server::shutdown()
|
||||
{
|
||||
endpoint_.shutdown();
|
||||
running_ = false;
|
||||
monitorWorker_.join();
|
||||
}
|
||||
|
||||
uint16_t Server::getPort() const
|
||||
@@ -154,63 +158,92 @@ namespace daggy::daggyr {
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(pendingGuard_);
|
||||
pending_.push_back(
|
||||
PendingJob{.runID = runID,
|
||||
.taskName = taskName,
|
||||
.fut = executor_.execute(runID, taskName, task),
|
||||
.resourcesUsed = resourcesUsed});
|
||||
pending_.emplace(std::make_pair(runID, taskName),
|
||||
PendingJob{
|
||||
.fut = executor_.execute(runID, taskName, task),
|
||||
.resourcesUsed = resourcesUsed,
|
||||
});
|
||||
}
|
||||
|
||||
response.send(Pistache::Http::Code::Ok, "");
|
||||
}
|
||||
|
||||
void Server::monitor()
|
||||
{
|
||||
std::unordered_map<TaskID, AttemptRecord> resolved;
|
||||
while (running_) {
|
||||
resolved.clear();
|
||||
std::vector<TaskID> resolvedIDs;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(pendingGuard_);
|
||||
for (const auto &[tid, job] : pending_) {
|
||||
if (job.fut->ready()) {
|
||||
resolved.emplace(tid, job.fut->get());
|
||||
resolvedIDs.push_back(tid);
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto &tid : resolvedIDs) {
|
||||
pending_.extract(tid);
|
||||
}
|
||||
}
|
||||
|
||||
std::unordered_map<TaskID, std::string> payloads;
|
||||
for (const auto &[tid, attempt] : resolved) {
|
||||
std::stringstream ss;
|
||||
ss << R"({ "runID": )" << tid.first << R"(, "taskName": )"
|
||||
<< std::quoted(tid.second) << ", "
|
||||
<< R"("state": "COMPLETED", "attempt":)"
|
||||
<< attemptRecordToJSON(attempt) << "}";
|
||||
payloads.emplace(tid, ss.str());
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(resolvedGuard_);
|
||||
for (const auto &[_, item] : payloads)
|
||||
resolved_.push_back(item);
|
||||
}
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::seconds(1));
|
||||
}
|
||||
}
|
||||
|
||||
// REST handler: replies with a JSON array holding every completion record
// queued in resolved_ since the previous poll, and empties that queue.
//
// The array elements are the pre-serialised strings stored in resolved_;
// the reply is "[]" when nothing has been queued. A timing line for the
// request is written to std::cout.
//
// request  - incoming request; passed to handleAuth(), and the handler
//            returns without building a reply when that check fails.
// response - writer used to send the 200 OK reply.
void Server::handlePollTasks(const Pistache::Rest::Request &request,
                             Pistache::Http::ResponseWriter response)
{
  if (!handleAuth(request))
    return;
  auto ss = Clock::now();

  // Take the whole queue in one swap so resolvedGuard_ is held only
  // briefly; the formatting below then works on a private copy.
  std::deque<std::string> updates;
  {
    std::lock_guard<std::mutex> lock(resolvedGuard_);
    updates.swap(resolved_);
  }
  const size_t cnt = updates.size();

  std::stringstream payload;
  payload << "[";
  bool first = true;
  for (const auto &item : updates) {
    if (first) {
      first = false;
    }
    else {
      payload << ", ";
    }
    payload << item;
  }
  payload << "]";

  auto payloadStr = payload.str();
  response.send(Pistache::Http::Code::Ok, payloadStr);
  auto ee = Clock::now();

  std::cout
      << "Completed request: with " << cnt << " updates in"
      << " total ("
      << std::chrono::duration_cast<std::chrono::nanoseconds>(ee - ss).count()
      << " ns)\n";
}
|
||||
|
||||
void Server::handleStopTask(const Pistache::Rest::Request &request,
|
||||
|
||||
Reference in New Issue
Block a user