Adding support for remote execution daemons.

Squashed commit of the following:

commit 69d5ef7a256b86a86d46e5ae374c00fded1497ea
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 12:15:55 2021 -0400

    Updating readme

commit 94a9f676d0f9cc0b55cdc18c4927eaea40d82c77
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 12:05:36 2021 -0400

    Fixing serialization of attempt records when querying entire dag

commit 945e5f90b24abf07c9af1bc4c6bbcb33e93b8069
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 11:37:59 2021 -0400

    Compiles cleanly...

commit 8b23e46081d47fb80dc1a2d998fc6dc4bbf301a8
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 10:43:03 2021 -0400

    Adding in missing source file to cmake build list

commit 6d10d9791206e2bc15788beadeea580b8e43a853
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 10:41:43 2021 -0400

    Adding new executors

commit 42a2c67f4d6ae99df95d917c8621d78cd99837a1
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 10:27:14 2021 -0400

    Fixing missing curl cmake dependency

commit 394bc4c5d51ecee7bf14712f719c8bf7e97fb0fa
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 10:21:58 2021 -0400

    Fixing missing curl cmake dependency

commit dd9efc8e7e7770ea1bcbccb70a1af9cfcff0414c
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 17:15:38 2021 -0400

    Checkpointing progress

commit 3b3b55d6037bb96e46de6763f486f4ecb92fe6a0
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 14:21:18 2021 -0400

    updating readme

commit 303027c11452941b2a0c0d1b04ac5942e79efd74
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 14:17:16 2021 -0400

    Namespacing daggyd
    Adding more error checking around deserialization of parameters
    Adding tests for runner agent

commit c592eaeba12e2a449bae401e8c1d9ed236416d52
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 11:20:21 2021 -0400

    Checkpointing work

commit fb1862d1cefe2b53a98659cce3c8c73d88bf5d84
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 09:52:29 2021 -0400

    Copying daggyd for daggyr template, adding in basic routes
This commit is contained in:
Ian Roddis
2021-12-16 12:16:12 -04:00
parent 14d0ef4a3f
commit 8d00621908
26 changed files with 1373 additions and 160 deletions

View File

@@ -0,0 +1,8 @@
# Static library target for the daggyr server; the target name is taken from
# the project name.
project(libdaggyr)
add_library(${PROJECT_NAME} STATIC)
# PUBLIC: the include/ directory is also exported to targets linking this one.
target_include_directories(${PROJECT_NAME} PUBLIC include)
# Links against libdaggy and stdc++fs.
target_link_libraries(${PROJECT_NAME} libdaggy stdc++fs)
# The src subdirectory is expected to attach the sources to this target.
add_subdirectory(src)

View File

@@ -0,0 +1,84 @@
#pragma once
#include <pistache/description.h>
#include <pistache/endpoint.h>
#include <pistache/http.h>
#include <daggy/DAGRunner.hpp>
#include <daggy/ThreadPool.hpp>
#include <daggy/executors/task/DaggyRunnerTaskExecutor.hpp>
#include <daggy/executors/task/ForkingTaskExecutor.hpp>
#include <daggy/loggers/dag_run/DAGRunLogger.hpp>
#include <filesystem>
// Declares a REST handler function named `func` that takes the Pistache
// request by const reference and the response writer by value.
// NOTE: the expansion already ends with ';', so writing
// `DAGGY_REST_HANDLER(x);` leaves an additional empty declaration behind.
#define DAGGY_REST_HANDLER(func) \
void func(const Pistache::Rest::Request &request, \
Pistache::Http::ResponseWriter response);
namespace fs = std::filesystem;
using namespace daggy::executors::task::daggy_runner;
namespace daggy::daggyr {
// REST server that runs tasks on the local machine through a
// ForkingTaskExecutor and tracks the cores / memory currently claimed by the
// tasks it has started.
class Server
{
public:
// listenSpec:  address the HTTP endpoint is constructed with.
// maxCores:    passed to the executor and used as the core capacity.
// maxMemoryMB: memory capacity, in MB, reported and accounted against.
Server(const Pistache::Address &listenSpec, ssize_t maxCores,
ssize_t maxMemoryMB);
// Calls shutdown().
~Server();
// Enables SSL on the endpoint with the given certificate and key; returns
// *this so calls can be chained.
Server &setSSLCertificates(const fs::path &cert, const fs::path &key);
// Configures the endpoint (thread count, socket flags, message size limits)
// and builds the route description.
void init(size_t threads = 1);
// Builds the router from the description and starts serving via
// Endpoint::serveThreaded().
void start();
// Port reported by the endpoint.
uint16_t getPort() const;
// Shuts the endpoint down.
void shutdown();
// NOTE(review): no definition of validateTask is visible alongside this
// declaration -- confirm it is defined or still needed.
static void validateTask(const Task &task);
private:
// Registers all REST routes on desc_.
void createDescription();
// Returns true when the request may proceed; handlers return immediately
// when it yields false.
bool handleAuth(const Pistache::Rest::Request &request);
// Route handlers; see createDescription() for the paths they are bound to.
DAGGY_REST_HANDLER(handleReady);
DAGGY_REST_HANDLER(handleGetCapacity);
DAGGY_REST_HANDLER(handleRunTask);
DAGGY_REST_HANDLER(handleGetTask);
DAGGY_REST_HANDLER(handleStopTask);
DAGGY_REST_HANDLER(handleValidateTask);
// Declaration order below is also the member initialization order.
Pistache::Http::Endpoint endpoint_;
Pistache::Rest::Description desc_;
Pistache::Rest::Router router_;
executors::task::ForkingTaskExecutor executor_;
// NOTE(review): TaskRecord is not referenced by the visible implementation.
struct TaskRecord
{
RunState state;
AttemptRecord attempt;
};
// capacityGuard_ is held around reads and writes of curCapacity_.
std::mutex capacityGuard_;
Capacity maxCapacity_;
Capacity curCapacity_;
// pendingGuard_ is held around accesses to pending_.
std::mutex pendingGuard_;
// A task handed to the executor: the future for its attempt record and the
// resources subtracted from curCapacity_ when it was started.
struct PendingJob
{
std::future<AttemptRecord> fut;
Capacity resourcesUsed;
};
// Keyed by (run ID, task name).
// NOTE(review): std::pair keys need a std::hash specialization; presumably
// one of the daggy headers provides it -- confirm.
std::unordered_map<std::pair<DAGRunID, std::string>, PendingJob> pending_;
// resultsGuard_ is held around accesses to results_.
std::mutex resultsGuard_;
// Attempt records of finished tasks, keyed by (run ID, task name).
std::unordered_map<std::pair<DAGRunID, std::string>, AttemptRecord>
results_;
};
} // namespace daggy::daggyr

View File

@@ -0,0 +1,3 @@
# Adds the server implementation to the target named after the current
# project.
target_sources(${PROJECT_NAME} PRIVATE
Server.cpp
)

View File

@@ -0,0 +1,259 @@
#include <enum.h>
#include <daggy/Serialization.hpp>
#include <daggy/executors/task/DaggyRunnerTaskExecutor.hpp>
#include <daggyr/Server.hpp>
#include <iomanip>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <thread>
#include <utility>
// Sends `{"message": <quoted msg>}` with status Pistache::Http::Code::<code>
// on the in-scope `response` writer, then returns from the enclosing (void)
// handler.
//
// The body is wrapped in do { ... } while (false) so the macro behaves as a
// single statement: the temporary stream lives in its own scope (the macro
// can be used more than once per scope) and it is safe as the body of an
// unbraced if/else. Callers supply the trailing semicolon.
//
// NOTE: std::quoted only escapes the quote and backslash characters; control
// characters in `msg` (e.g. newlines) are emitted verbatim.
#define REQ_RESPONSE(code, msg)                                          \
  do {                                                                   \
    std::stringstream reqResponseBody;                                   \
    reqResponseBody << R"({"message": )" << std::quoted(msg) << "}";     \
    response.send(Pistache::Http::Code::code, reqResponseBody.str());    \
    return;                                                              \
  } while (false)
using namespace Pistache;
namespace daggy::daggyr {
// Prepare the HTTP endpoint for serving and register the REST routes.
//
// threads: number of worker threads handed to the endpoint options.
void Server::init(size_t threads)
{
  // Upper bound, in bytes, applied to both requests and responses.
  constexpr size_t kMaxMessageBytes = 102400;

  const auto socketFlags =
      Pistache::Tcp::Options::ReuseAddr | Pistache::Tcp::Options::ReusePort;

  auto endpointOptions = Http::Endpoint::options();
  endpointOptions.threads(threads);
  endpointOptions.flags(socketFlags);
  endpointOptions.maxRequestSize(kMaxMessageBytes);
  endpointOptions.maxResponseSize(kMaxMessageBytes);

  endpoint_.init(endpointOptions);
  createDescription();
}
Server::Server(const Pistache::Address &listenSpec, ssize_t maxCores,
ssize_t maxMemoryMB)
: endpoint_(listenSpec)
, desc_("Daggy Runner API", "0.1")
, executor_(maxCores)
, maxCapacity_{maxCores, maxMemoryMB}
, curCapacity_{maxCores, maxMemoryMB}
{
}
// Shuts the endpoint down when the server is destroyed.
Server::~Server()
{
shutdown();
}
// Builds the router from the route description, installs it as the endpoint
// handler and starts serving through Endpoint::serveThreaded().
void Server::start()
{
router_.initFromDescription(desc_);
endpoint_.setHandler(router_.handler());
endpoint_.serveThreaded();
}
// Forwards the certificate and key paths to Endpoint::useSSL() and returns
// *this so the call can be chained.
Server &Server::setSSLCertificates(const fs::path &cert, const fs::path &key)
{
endpoint_.useSSL(cert, key);
return *this;
}
// Shuts down the HTTP endpoint. Also invoked from the destructor.
void Server::shutdown()
{
endpoint_.shutdown();
}
// Returns the port reported by the endpoint.
uint16_t Server::getPort() const
{
return endpoint_.getPort();
}
// Registers the API metadata and every REST route on desc_. start() later
// builds the router from this description.
void Server::createDescription()
{
desc_.info().license("MIT", "https://opensource.org/licenses/MIT");
desc_.schemes(Rest::Scheme::Http)
.basePath("/v1")
.produces(MIME(Application, Json))
.consumes(MIME(Application, Json));
// Readiness probe; registered directly on the description (not under
// versionPath) and hidden from the generated description.
// NOTE(review): confirm whether basePath("/v1") also prefixes this route.
desc_.route(desc_.get("/ready"))
.bind(&Server::handleReady, this)
.response(Http::Code::Ok, "Response to the /ready call")
.hide();
// All remaining routes are registered under the "/v1" path.
auto versionPath = desc_.path("/v1");
versionPath.route(desc_.post("/validate"))
.bind(&Server::handleValidateTask, this)
.produces(MIME(Application, Json))
.response(Http::Code::Ok, "Validate a task");
// POST / GET / DELETE on the same path start, poll and stop a task.
versionPath.route(desc_.post("/task/:runID/:taskName"))
.bind(&Server::handleRunTask, this)
.produces(MIME(Application, Json))
.response(Http::Code::Ok, "Run a task");
versionPath.route(desc_.get("/task/:runID/:taskName"))
.bind(&Server::handleGetTask, this)
.produces(MIME(Application, Json))
.response(Http::Code::Ok,
"Get the state and potentially the AttemptRecord of a task");
versionPath.route(desc_.del("/task/:runID/:taskName"))
.bind(&Server::handleStopTask, this)
.produces(MIME(Application, Json))
.response(Http::Code::Ok, "Stop a task");
versionPath.route(desc_.get("/capacity"))
.bind(&Server::handleGetCapacity, this)
.produces(MIME(Application, Json))
.response(Http::Code::Ok, "Get capacities of worker");
}
// POST /v1/validate -- checks that the request body describes a task this
// runner accepts.
//
// The body is parsed with taskFromJSON (under the placeholder name
// "sample_task") and its job is passed to validateTaskParameters. Any
// std::exception raised by either step is reported as 406 Not Acceptable
// with the exception text; otherwise 200 with "Task is valid".
void Server::handleValidateTask(const Pistache::Rest::Request &request,
                                Pistache::Http::ResponseWriter response)
{
  try {
    auto task = taskFromJSON("sample_task", request.body());
    daggy::executors::task::daggy_runner::validateTaskParameters(task.job);
  }
  // Caught by const reference: the handler only reads the message.
  catch (const std::exception &e) {
    REQ_RESPONSE(Not_Acceptable, e.what());
  }
  REQ_RESPONSE(Ok, "Task is valid");
}
// POST /v1/task/:runID/:taskName -- starts executing a task on this runner.
//
// The request body is parsed as a task definition. On success the task is
// handed to the executor, its future is tracked in pending_, the resources it
// claims are subtracted from the current capacity and an empty 200 response
// is sent.
//
// 406 Not Acceptable (JSON {"message": ...}) is returned when
//  - a path parameter or the body raises a std::exception while being
//    parsed, or
//  - a task with the same (runID, taskName) is still tracked with a live
//    future. Previously such a request started the task anyway, dropped its
//    future (the map insert was a no-op) and never gave the capacity back.
void Server::handleRunTask(const Pistache::Rest::Request &request,
                           Pistache::Http::ResponseWriter response)
{
  if (!handleAuth(request))
    return;

  DAGRunID runID{};
  std::string taskName;
  Capacity resourcesUsed;
  Task task;
  try {
    // Parameter conversion lives inside the try block so that a conversion
    // failure, if it throws, is reported like any other malformed request.
    runID = request.param(":runID").as<DAGRunID>();
    taskName = request.param(":taskName").as<std::string>();
    task = taskFromJSON(taskName, request.body());
    resourcesUsed = capacityFromTask(task);
  }
  catch (const std::exception &e) {
    REQ_RESPONSE(Not_Acceptable, e.what());
  }

  const auto taskID = std::make_pair(runID, taskName);
  {
    std::lock_guard<std::mutex> lock(pendingGuard_);

    // Only an entry whose future is still valid represents a running task;
    // an entry with a consumed future is stale and may be replaced.
    const auto existing = pending_.find(taskID);
    if (existing != pending_.end() && existing->second.fut.valid()) {
      REQ_RESPONSE(Not_Acceptable, "Task is already running");
    }

    pending_.insert_or_assign(
        taskID, PendingJob{.fut = executor_.execute(runID, taskName, task),
                           .resourcesUsed = resourcesUsed});

    // Claim the resources while pendingGuard_ is still held, so a concurrent
    // poll cannot hand them back before they were taken. Lock order
    // (pending -> capacity) matches handleGetTask.
    {
      std::lock_guard<std::mutex> capLock(capacityGuard_);
      curCapacity_.cores -= resourcesUsed.cores;
      curCapacity_.memoryMB -= resourcesUsed.memoryMB;
    }
  }

  response.send(Pistache::Http::Code::Ok, "");
}
// GET /v1/task/:runID/:taskName -- reports the state of a task.
//
// Responses:
//  - 200 { "state": "RUNNING" } while the task's future is not ready
//  - 200 { "state": "COMPLETED", "attempt": ... } once an attempt record is
//    available
//  - 404 {"message": "No such task"} when the task is unknown
//
// When a pending task is found to be finished it is removed from pending_,
// its resources are returned to the current capacity and its attempt record
// is stored in results_. Removing the entry matters: a consumed future is no
// longer valid(), and leaving it in pending_ made every later poll report
// RUNNING. The stored result replaces any older record for the same key so a
// re-run of the same task does not return a stale attempt.
//
// NOTE: results_ is not pruned anywhere in this function.
void Server::handleGetTask(const Pistache::Rest::Request &request,
                           Pistache::Http::ResponseWriter response)
{
  if (!handleAuth(request))
    return;

  auto runID = request.param(":runID").as<DAGRunID>();
  auto taskName = request.param(":taskName").as<std::string>();
  const auto taskID = std::make_pair(runID, taskName);

  bool running = false;
  {
    std::lock_guard<std::mutex> lock(pendingGuard_);
    auto it = pending_.find(taskID);
    if (it != pending_.end()) {
      if (it->second.fut.valid() &&
          it->second.fut.wait_for(1ms) != std::future_status::ready) {
        running = true;
      }
      else {
        // Take the job out of the map before consuming the future, so the
        // bookkeeping is already consistent even if get() rethrows an
        // exception stored by the task.
        PendingJob job = std::move(it->second);
        pending_.erase(it);
        {
          std::lock_guard<std::mutex> capLock(capacityGuard_);
          curCapacity_.cores += job.resourcesUsed.cores;
          curCapacity_.memoryMB += job.resourcesUsed.memoryMB;
        }
        if (job.fut.valid()) {
          auto attempt = job.fut.get();
          std::lock_guard<std::mutex> resLock(resultsGuard_);
          results_.insert_or_assign(taskID, std::move(attempt));
        }
      }
    }
  }

  if (running) {
    response.send(Pistache::Http::Code::Ok, R"({ "state": "RUNNING" })");
    return;
  }

  std::string payload;
  {
    std::lock_guard<std::mutex> lock(resultsGuard_);
    auto it = results_.find(taskID);
    if (it == results_.end()) {
      REQ_RESPONSE(Not_Found, "No such task");
    }
    payload = R"({ "state": "COMPLETED", "attempt": )" +
              attemptRecordToJSON(it->second) + "}";
  }
  response.send(Pistache::Http::Code::Ok, payload);
}
// DELETE /v1/task/:runID/:taskName -- asks the executor to stop the task
// identified by the path parameters and replies 200 with an empty message.
void Server::handleStopTask(const Pistache::Rest::Request &request,
                            Pistache::Http::ResponseWriter response)
{
  if (handleAuth(request)) {
    auto dagRun = request.param(":runID").as<DAGRunID>();
    auto name = request.param(":taskName").as<std::string>();

    executor_.stop(dagRun, name);
    REQ_RESPONSE(Ok, "");
  }
}
// GET /v1/capacity -- replies with a JSON object holding the current and the
// total capacity of this runner.
void Server::handleGetCapacity(const Pistache::Rest::Request &request,
                               Pistache::Http::ResponseWriter response)
{
  std::string body = R"({ "current": )";
  {
    // Both values are serialized under the capacity lock.
    std::lock_guard<std::mutex> capLock(capacityGuard_);
    body += capacityToJSON(curCapacity_);
    body += R"(, "total": )";
    body += capacityToJSON(maxCapacity_);
  }
  body += "}";

  response.send(Pistache::Http::Code::Ok, body);
}
// GET /ready -- always replies 200 with a fixed JSON message; the request is
// not inspected.
void Server::handleReady(const Pistache::Rest::Request &request,
Pistache::Http::ResponseWriter response)
{
response.send(Pistache::Http::Code::Ok, R"({ "msg": "Ready for tasks!"})");
}
/*
 * handleAuth is the hook for checking any auth methods on a request. If it
 * returns false, callers should stop handling the request.
 *
 * Currently a stub: no checks are performed and it always returns true.
 *
 * NOTE(review): the function is not given the ResponseWriter, so as written
 * it cannot send a failure response itself -- confirm the intended design
 * before implementing real checks.
 */
bool Server::handleAuth(const Pistache::Rest::Request &request)
{
return true;
}
} // namespace daggy::daggyr