Adding support for remote execution daemons.

Squashed commit of the following:

commit 69d5ef7a256b86a86d46e5ae374c00fded1497ea
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 12:15:55 2021 -0400

    Updating readme

commit 94a9f676d0f9cc0b55cdc18c4927eaea40d82c77
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 12:05:36 2021 -0400

    Fixing serialization of attempt records when querying entire dag

commit 945e5f90b24abf07c9af1bc4c6bbcb33e93b8069
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 11:37:59 2021 -0400

    Compiles cleanly...

commit 8b23e46081d47fb80dc1a2d998fc6dc4bbf301a8
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 10:43:03 2021 -0400

    Adding in missing source file to cmake build list

commit 6d10d9791206e2bc15788beadeea580b8e43a853
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 10:41:43 2021 -0400

    Adding new executors

commit 42a2c67f4d6ae99df95d917c8621d78cd99837a1
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 10:27:14 2021 -0400

    Fixing missing curl cmake dependency

commit 394bc4c5d51ecee7bf14712f719c8bf7e97fb0fa
Author: Ian Roddis <tech@kinesin.ca>
Date:   Thu Dec 16 10:21:58 2021 -0400

    Fixing missing curl cmake dependency

commit dd9efc8e7e7770ea1bcbccb70a1af9cfcff0414c
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 17:15:38 2021 -0400

    Checkpointing progress

commit 3b3b55d6037bb96e46de6763f486f4ecb92fe6a0
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 14:21:18 2021 -0400

    updating readme

commit 303027c11452941b2a0c0d1b04ac5942e79efd74
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 14:17:16 2021 -0400

    Namespacing daggyd
    Adding more error checking around deserialization of parameters
    Adding tests for runner agent

commit c592eaeba12e2a449bae401e8c1d9ed236416d52
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 11:20:21 2021 -0400

    Checkpointing work

commit fb1862d1cefe2b53a98659cce3c8c73d88bf5d84
Author: Ian Roddis <tech@kinesin.ca>
Date:   Wed Dec 15 09:52:29 2021 -0400

    Copying daggyd for daggyr template, adding in basic routes
This commit is contained in:
Ian Roddis
2021-12-16 12:16:12 -04:00
parent 14d0ef4a3f
commit 8d00621908
26 changed files with 1373 additions and 160 deletions

View File

@@ -10,6 +10,7 @@
#include <iostream>
// Add executors here
#include <daggy/executors/task/DaggyRunnerTaskExecutor.hpp>
#include <daggy/executors/task/ForkingTaskExecutor.hpp>
#include <daggy/executors/task/SlurmTaskExecutor.hpp>
@@ -177,6 +178,27 @@ std::unique_ptr<de::TaskExecutor> executorFactory(const rj::Value &config)
else if (name == "SlurmTaskExecutor") {
return std::make_unique<de::SlurmTaskExecutor>();
}
else if (name == "DaggyRunnerTaskExecutor") {
  // Builds a DaggyRunnerTaskExecutor from the "runners" entry of the executor
  // config, which must be a JSON array of runner URL strings.
  // Throws std::runtime_error when "runners" is missing, is not an array, or
  // contains a non-string element.
  if (!execConfig.HasMember("runners"))
    throw std::runtime_error(
        "DaggyRunnerExecutor config needs at least one remote runner");
  auto exe = std::make_unique<de::DaggyRunnerTaskExecutor>();
  const auto &runners = execConfig["runners"];
  // Validation only: the registration loop and the return must stay OUTSIDE
  // this check. Previously they were nested inside its braces, after the
  // throw, so they were unreachable and a valid config fell off the end of
  // the function without returning an executor.
  if (!runners.IsArray())
    throw std::runtime_error(
        "DaggyRunnerExecutor runners must be an array of urls");
  for (size_t i = 0; i < runners.Size(); ++i) {
    if (!runners[i].IsString())
      throw std::runtime_error(
          "DaggyRunnerExecutor runners must be an array of urls");
    exe->addRunner(runners[i].GetString());
  }
  return exe;
}
else
throw std::runtime_error("Unknown executor type: " + name);
}
@@ -246,7 +268,7 @@ int main(int argc, char **argv)
Pistache::Address listenSpec(listenIP, listenPort);
daggy::Server server(listenSpec, *logger, *executor, dagThreads);
daggy::daggyd::Server server(listenSpec, *logger, *executor, dagThreads);
server.init(webThreads);
server.start();

View File

@@ -16,7 +16,7 @@
namespace fs = std::filesystem;
namespace daggy {
namespace daggy::daggyd {
class Server
{
public:
@@ -64,4 +64,4 @@ namespace daggy {
std::mutex runnerGuard_;
std::unordered_map<DAGRunID, std::shared_ptr<DAGRunner>> runners_;
};
} // namespace daggy
} // namespace daggy::daggyd

View File

@@ -18,7 +18,7 @@
using namespace Pistache;
namespace daggy {
namespace daggy::daggyd {
void Server::init(size_t threads)
{
auto opts = Http::Endpoint::options()
@@ -305,14 +305,7 @@ namespace daggy {
else {
ss << ',';
}
ss << '{' << R"("startTime":)"
<< std::quoted(timePointToString(attempt.startTime)) << ','
<< R"("stopTime":)"
<< std::quoted(timePointToString(attempt.stopTime)) << ','
<< R"("rc":)" << attempt.rc << ',' << R"("outputLog":)"
<< std::quoted(attempt.outputLog) << ',' << R"("errorLog":)"
<< std::quoted(attempt.errorLog) << ',' << R"("executorLog":)"
<< std::quoted(attempt.executorLog) << '}';
ss << attemptRecordToJSON(attempt);
}
ss << ']';
}
@@ -511,4 +504,4 @@ namespace daggy {
{
return true;
}
} // namespace daggy
} // namespace daggy::daggyd

View File

@@ -17,118 +17,6 @@ namespace rj = rapidjson;
using namespace daggy;
#ifdef DEBUG_HTTP
// libcurl debug callback (installed via CURLOPT_DEBUGFUNCTION) that dumps the
// traffic of a transfer to stderr.
//
//   handle, userp  unused
//   type           which kind of debug record `data` holds
//   data, size     the record payload; libcurl does NOT NUL-terminate it, so
//                  it is always written with an explicit length
//
// Always returns 0.
static int my_trace(CURL *handle, curl_infotype type, char *data, size_t size,
                    void *userp)
{
  const char *text = nullptr;
  (void)handle; /* prevent compiler warning */
  (void)userp;
  switch (type) {
    case CURLINFO_TEXT:
      // Length-bounded write: `data` is not a C string, so "%s" would read
      // past the end of the buffer.
      fputs("== Info: ", stderr);
      fwrite(data, 1, size, stderr);
      return 0;
    default: /* in case a new one is introduced to shock us */
      return 0;
    case CURLINFO_HEADER_OUT:
      text = "=> Send header";
      break;
    case CURLINFO_DATA_OUT:
      text = "=> Send data";
      break;
    case CURLINFO_SSL_DATA_OUT:
      text = "=> Send SSL data";
      break;
    case CURLINFO_HEADER_IN:
      text = "<= Recv header";
      break;
    case CURLINFO_DATA_IN:
      text = "<= Recv data";
      break;
    case CURLINFO_SSL_DATA_IN:
      text = "<= Recv SSL data";
      break;
  }
  std::cerr << "\n================== " << text
            << " ==================" << std::endl;
  // Same reason as above: emit exactly `size` bytes instead of streaming a
  // char* that has no terminator.
  std::cerr.write(data, static_cast<std::streamsize>(size));
  std::cerr << std::endl;
  return 0;
}
#endif
// HTTP status codes the tests compare against. The underlying type is long so
// that any status value reported as a long can be stored via static_cast.
enum HTTPCode : long
{
  Ok = 200,
  Not_Found = 404
};
// Result of a single HTTP request: the status code and the raw response body.
struct HTTPResponse
{
  // Value-initialised to 0 ("no status received") so a default-constructed
  // response never carries an indeterminate code.
  HTTPCode code{};
  std::string body;
};
// libcurl write callback (CURLOPT_WRITEFUNCTION): appends the received chunk
// to the stringstream supplied through CURLOPT_WRITEDATA.
//
//   in           start of the chunk (not NUL-terminated)
//   size, nmemb  element size and element count; the chunk is size * nmemb
//                bytes long
//   out          destination stream
//
// Returns the number of bytes consumed; libcurl treats any other value than
// size * nmemb as a write error. The size arguments and the return value are
// size_t to match the callback prototype libcurl documents (the former `uint`
// was narrower than size_t on LP64 targets and is not a standard type).
size_t curlWriter(char *in, size_t size, size_t nmemb, std::stringstream *out)
{
  const size_t total = size * nmemb;
  out->write(in, static_cast<std::streamsize>(total));
  return total;
}
// Performs a blocking HTTP request with libcurl and returns the status code
// and response body.
//
//   url      target URL
//   payload  request body; when non-empty it is sent with a
//            "Content-Type: Application/Json" header
//   method   HTTP verb, passed through CURLOPT_CUSTOMREQUEST (default "GET")
//
// Throws std::runtime_error("CURL Failed: ...") when the transfer fails.
// If no easy handle can be created, a response with code 0 and an empty body
// is returned.
HTTPResponse REQUEST(const std::string &url, const std::string &payload = "",
                     const std::string &method = "GET")
{
  // Value-initialise so `code` is never indeterminate on the no-handle path.
  HTTPResponse response{};

  curl_global_init(CURL_GLOBAL_ALL);

  CURL *curl = curl_easy_init();
  if (!curl) {
    curl_global_cleanup();
    return response;
  }

  std::stringstream buffer;
  struct curl_slist *headers = nullptr;

#ifdef DEBUG_HTTP
  curl_easy_setopt(curl, CURLOPT_DEBUGFUNCTION, my_trace);
  curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
#endif

  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curlWriter);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);

  if (!payload.empty()) {
    // CURLOPT_POSTFIELDSIZE is a `long` option; curl_easy_setopt is variadic,
    // so the argument must be exactly a long.
    curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE,
                     static_cast<long>(payload.size()));
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, payload.c_str());
    headers = curl_slist_append(headers, "Content-Type: Application/Json");
  }
  curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, method.c_str());
  // An empty "Expect:" header suppresses libcurl's "Expect: 100-continue".
  headers = curl_slist_append(headers, "Expect:");
  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);

  const CURLcode res = curl_easy_perform(curl);

  // The status must be read while the handle is still alive, and into a real
  // long, which is what CURLINFO_RESPONSE_CODE writes.
  long status = 0;
  if (res == CURLE_OK) {
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &status);
  }

  // Release every libcurl resource on both the success and the failure path.
  curl_slist_free_all(headers);
  curl_easy_cleanup(curl);
  curl_global_cleanup();

  if (res != CURLE_OK) {
    throw std::runtime_error(std::string{"CURL Failed: "} +
                             curl_easy_strerror(res));
  }

  response.code = static_cast<HTTPCode>(status);
  response.body = buffer.str();
  return response;
}
TEST_CASE("rest_endpoint", "[server_basic]")
{
std::stringstream ss;
@@ -138,7 +26,7 @@ TEST_CASE("rest_endpoint", "[server_basic]")
const size_t nDAGRunners = 10, nWebThreads = 10;
daggy::Server server(listenSpec, logger, executor, nDAGRunners);
daggy::daggyd::Server server(listenSpec, logger, executor, nDAGRunners);
server.init(nWebThreads);
server.start();
@@ -147,13 +35,13 @@ TEST_CASE("rest_endpoint", "[server_basic]")
SECTION("Ready Endpoint")
{
auto response = REQUEST(baseURL + "/ready");
auto response = HTTP_REQUEST(baseURL + "/ready");
REQUIRE(response.code == HTTPCode::Ok);
}
SECTION("Querying a non-existent dagrunid should fail ")
{
auto response = REQUEST(baseURL + "/v1/dagrun/100");
auto response = HTTP_REQUEST(baseURL + "/v1/dagrun/100");
REQUIRE(response.code != HTTPCode::Ok);
}
@@ -175,7 +63,7 @@ TEST_CASE("rest_endpoint", "[server_basic]")
// Submit, and get the runID
daggy::DAGRunID runID = 0;
{
auto response = REQUEST(baseURL + "/v1/dagrun/", dagRun, "POST");
auto response = HTTP_REQUEST(baseURL + "/v1/dagrun/", dagRun, "POST");
REQUIRE(response.code == HTTPCode::Ok);
rj::Document doc;
@@ -188,7 +76,7 @@ TEST_CASE("rest_endpoint", "[server_basic]")
// Ensure our runID shows up in the list of running DAGs
{
auto response = REQUEST(baseURL + "/v1/dagruns?all=1");
auto response = HTTP_REQUEST(baseURL + "/v1/dagruns?all=1");
REQUIRE(response.code == HTTPCode::Ok);
rj::Document doc;
@@ -217,8 +105,8 @@ TEST_CASE("rest_endpoint", "[server_basic]")
// Ensure we can get one of our tasks
{
auto response = REQUEST(baseURL + "/v1/dagrun/" + std::to_string(runID) +
"/task/cat_0");
auto response = HTTP_REQUEST(baseURL + "/v1/dagrun/" +
std::to_string(runID) + "/task/cat_0");
REQUIRE(response.code == HTTPCode::Ok);
rj::Document doc;
@@ -233,7 +121,8 @@ TEST_CASE("rest_endpoint", "[server_basic]")
// Wait until our DAG is complete
bool complete = true;
for (auto i = 0; i < 10; ++i) {
auto response = REQUEST(baseURL + "/v1/dagrun/" + std::to_string(runID));
auto response =
HTTP_REQUEST(baseURL + "/v1/dagrun/" + std::to_string(runID));
REQUIRE(response.code == HTTPCode::Ok);
rj::Document doc;
daggy::checkRJParse(doc.Parse(response.body.c_str()));
@@ -281,7 +170,7 @@ TEST_CASE("Server cancels and resumes execution", "[server_resume]")
const size_t nDAGRunners = 10, nWebThreads = 10;
daggy::Server server(listenSpec, logger, executor, nDAGRunners);
daggy::daggyd::Server server(listenSpec, logger, executor, nDAGRunners);
server.init(nWebThreads);
server.start();
@@ -304,7 +193,7 @@ TEST_CASE("Server cancels and resumes execution", "[server_resume]")
// Submit, and get the runID
daggy::DAGRunID runID;
{
auto response = REQUEST(baseURL + "/v1/dagrun/", dagRunJSON, "POST");
auto response = HTTP_REQUEST(baseURL + "/v1/dagrun/", dagRunJSON, "POST");
REQUIRE(response.code == HTTPCode::Ok);
rj::Document doc;
@@ -319,7 +208,7 @@ TEST_CASE("Server cancels and resumes execution", "[server_resume]")
// Stop the current run
{
auto response = REQUEST(
auto response = HTTP_REQUEST(
baseURL + "/v1/dagrun/" + std::to_string(runID) + "/state/KILLED", "",
"PATCH");
REQUIRE(response.code == HTTPCode::Ok);
@@ -342,7 +231,7 @@ TEST_CASE("Server cancels and resumes execution", "[server_resume]")
{
auto url = baseURL + "/v1/dagrun/" + std::to_string(runID) +
"/task/sleep_B_0/state/QUEUED";
auto response = REQUEST(url, "", "PATCH");
auto response = HTTP_REQUEST(url, "", "PATCH");
REQUIRE(response.code == HTTPCode::Ok);
REQUIRE(logger.getTaskState(runID, "sleep_B_0") ==
+daggy::RunState::QUEUED);
@@ -355,7 +244,7 @@ TEST_CASE("Server cancels and resumes execution", "[server_resume]")
lstat("resume_touch_A", &s);
auto preMTime = s.st_mtim.tv_sec;
auto response = REQUEST(
auto response = HTTP_REQUEST(
baseURL + "/v1/dagrun/" + std::to_string(runID) + "/state/QUEUED", "",
"PATCH");