Adding support for remote execution daemons.
Squashed commit of the following (all commits by Ian Roddis <tech@kinesin.ca>):

- 69d5ef7a256b86a86d46e5ae374c00fded1497ea (Thu Dec 16 12:15:55 2021 -0400): Updating readme
- 94a9f676d0f9cc0b55cdc18c4927eaea40d82c77 (Thu Dec 16 12:05:36 2021 -0400): Fixing serialization of attempt records when querying entire dag
- 945e5f90b24abf07c9af1bc4c6bbcb33e93b8069 (Thu Dec 16 11:37:59 2021 -0400): Compiles cleanly...
- 8b23e46081d47fb80dc1a2d998fc6dc4bbf301a8 (Thu Dec 16 10:43:03 2021 -0400): Adding in missing source file to cmake build list
- 6d10d9791206e2bc15788beadeea580b8e43a853 (Thu Dec 16 10:41:43 2021 -0400): Adding new executors
- 42a2c67f4d6ae99df95d917c8621d78cd99837a1 (Thu Dec 16 10:27:14 2021 -0400): Fixing missing curl cmake dependency
- 394bc4c5d51ecee7bf14712f719c8bf7e97fb0fa (Thu Dec 16 10:21:58 2021 -0400): Fixing missing curl cmake dependency
- dd9efc8e7e7770ea1bcbccb70a1af9cfcff0414c (Wed Dec 15 17:15:38 2021 -0400): Checkpointing progress
- 3b3b55d6037bb96e46de6763f486f4ecb92fe6a0 (Wed Dec 15 14:21:18 2021 -0400): updating readme
- 303027c11452941b2a0c0d1b04ac5942e79efd74 (Wed Dec 15 14:17:16 2021 -0400): Namespacing daggyd; adding more error checking around deserialization of parameters; adding tests for runner agent
- c592eaeba12e2a449bae401e8c1d9ed236416d52 (Wed Dec 15 11:20:21 2021 -0400): Checkpointing work
- fb1862d1cefe2b53a98659cce3c8c73d88bf5d84 (Wed Dec 15 09:52:29 2021 -0400): Copying daggyd for daggyr template, adding in basic routes
@@ -11,8 +11,8 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS True)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror")
 
 if(CMAKE_BUILD_TYPE MATCHES "Debug")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread -fno-omit-frame-pointer")
-  set(TSAN_OPTIONS "suppressions=${CMAKE_CURRENT_DIR}/tests/tsan.supp")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
+  # set(TSAN_OPTIONS "suppressions=${CMAKE_CURRENT_DIR}/tests/tsan.supp")
 endif()
 
 set(THIRD_PARTY_DIR ${CMAKE_BINARY_DIR}/third_party)
@@ -48,3 +48,4 @@ enable_testing()
 
 add_subdirectory(libdaggy)
 add_subdirectory(daggyd)
+add_subdirectory(daggyr)
README.md | 22

@@ -28,7 +28,7 @@ graph LR
 
 Individual tasks (vertices) are run via a task executor. Daggy supports multiple executors, from local executor (via
 fork), to distributed work managers like [slurm](https://slurm.schedmd.com/overview.html)
-or [kubernetes](https://kubernetes.io/) (planned).
+or [kubernetes](https://kubernetes.io/) (planned), or daggy's own executor.
 
 State is maintained via state loggers. Currently daggy supports an in-memory state manager (OStreamLogger), and
 [RedisJSON](https://oss.redis.com/redisjson/).
@@ -380,6 +380,26 @@ For this reason, it's important that the `tmpDir` directory **be readable by the
 environment, it should be a shared filesystem. If this isn't the case, the job output will not be captured by daggy,
 although it will still be available wherever it was written by slurm.
 
+DaggyRunnerTaskExecutor
+-----------------------
+
+Daggy Runners (`daggyr` in this project) are daemons that can be run on remote hosts, then allocated work.
+
+Tasks submitted to this type of runner require `cores` and `memoryMB` attributes. Remote runners have a specific
+capacity that is consumed when tasks run on them. Right now those capacities are merely advisory; it's possible
+to oversubscribe a runner, and the constraints are not enforced.
+
+Enforcement via cgroups is planned.
+
+| Field         | Sample                          | Description |
+|---------------|---------------------------------|-------------|
+| command       | `[ "/usr/bin/echo", "param1" ]` | The command to run |
+| commandString | `"/usr/bin/echo param1"`        | The command to run as a string. Quoted args are properly handled. |
+| environment   | `[ "DATE=2021-05-03" ]`         | Environment variables to set for the script |
+| cores         | `"1"`                           | Number of cores required by the task |
+| memoryMB      | `"100"`                         | Amount of memory (RSS) required by the task, in MB |
+
 Loggers
 =======
 
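To make the field table above concrete, here is a minimal, illustrative sketch of a task spec carrying these attributes, plus a client-side sanity check before submission. It uses rapidjson, which the project already depends on; the `looksLikeRunnerTask` helper and the exact spec shape are assumptions for illustration, not daggy's actual API:

```cpp
#include <rapidjson/document.h>

#include <iostream>
#include <string>

// Hypothetical pre-submission check: a DaggyRunner task needs "cores"
// and "memoryMB" alongside its command (see the field table above).
bool looksLikeRunnerTask(const std::string &json)
{
  rapidjson::Document doc;
  doc.Parse(json.c_str());
  if (doc.HasParseError() || !doc.IsObject())
    return false;
  return doc.HasMember("command") && doc.HasMember("cores") &&
         doc.HasMember("memoryMB");
}

int main()
{
  const std::string spec = R"({
    "command": [ "/usr/bin/echo", "param1" ],
    "environment": [ "DATE=2021-05-03" ],
    "cores": "1",
    "memoryMB": "100"
  })";
  std::cout << (looksLikeRunnerTask(spec) ? "ok" : "rejected") << '\n';
}
```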
@@ -10,6 +10,7 @@
 #include <iostream>
 
 // Add executors here
+#include <daggy/executors/task/DaggyRunnerTaskExecutor.hpp>
 #include <daggy/executors/task/ForkingTaskExecutor.hpp>
 #include <daggy/executors/task/SlurmTaskExecutor.hpp>
 
@@ -177,6 +178,27 @@ std::unique_ptr<de::TaskExecutor> executorFactory(const rj::Value &config)
   else if (name == "SlurmTaskExecutor") {
     return std::make_unique<de::SlurmTaskExecutor>();
   }
+  else if (name == "DaggyRunnerTaskExecutor") {
+    if (!execConfig.HasMember("runners"))
+      throw std::runtime_error(
+          "DaggyRunnerExecutor config needs at least one remote runner");
+
+    auto exe = std::make_unique<de::DaggyRunnerTaskExecutor>();
+
+    const auto &runners = execConfig["runners"];
+    if (!runners.IsArray()) {
+      throw std::runtime_error(
+          "DaggyRunnerExecutor runners must be an array of urls");
+    }
+    for (size_t i = 0; i < runners.Size(); ++i) {
+      if (!runners[i].IsString())
+        throw std::runtime_error(
+            "DaggyRunnerExecutor runners must be an array of urls");
+      exe->addRunner(runners[i].GetString());
+    }
+
+    return exe;
+  }
   else
     throw std::runtime_error("Unknown executor type: " + name);
 }
@@ -246,7 +268,7 @@ int main(int argc, char **argv)
 
   Pistache::Address listenSpec(listenIP, listenPort);
 
-  daggy::Server server(listenSpec, *logger, *executor, dagThreads);
+  daggy::daggyd::Server server(listenSpec, *logger, *executor, dagThreads);
   server.init(webThreads);
   server.start();
 
@@ -16,7 +16,7 @@
 
 namespace fs = std::filesystem;
 
-namespace daggy {
+namespace daggy::daggyd {
   class Server
   {
   public:
@@ -64,4 +64,4 @@ namespace daggy {
     std::mutex runnerGuard_;
     std::unordered_map<DAGRunID, std::shared_ptr<DAGRunner>> runners_;
   };
-} // namespace daggy
+} // namespace daggy::daggyd
@@ -18,7 +18,7 @@
 
 using namespace Pistache;
 
-namespace daggy {
+namespace daggy::daggyd {
   void Server::init(size_t threads)
   {
     auto opts = Http::Endpoint::options()
@@ -305,14 +305,7 @@ namespace daggy {
       else {
        ss << ',';
       }
-      ss << '{' << R"("startTime":)"
-         << std::quoted(timePointToString(attempt.startTime)) << ','
-         << R"("stopTime":)"
-         << std::quoted(timePointToString(attempt.stopTime)) << ','
-         << R"("rc":)" << attempt.rc << ',' << R"("outputLog":)"
-         << std::quoted(attempt.outputLog) << ',' << R"("errorLog":)"
-         << std::quoted(attempt.errorLog) << ',' << R"("executorLog":)"
-         << std::quoted(attempt.executorLog) << '}';
+      ss << attemptRecordToJSON(attempt);
     }
     ss << ']';
   }
@@ -511,4 +504,4 @@ namespace daggy {
   {
     return true;
   }
-} // namespace daggy
+} // namespace daggy::daggyd
@@ -17,118 +17,6 @@ namespace rj = rapidjson;
 
 using namespace daggy;
 
-#ifdef DEBUG_HTTP
-static int my_trace(CURL *handle, curl_infotype type, char *data, size_t size,
-                    void *userp)
-{
-  const char *text;
-  (void)handle; /* prevent compiler warning */
-  (void)userp;
-
-  switch (type) {
-  case CURLINFO_TEXT:
-    fprintf(stderr, "== Info: %s", data);
-  default: /* in case a new one is introduced to shock us */
-    return 0;
-
-  case CURLINFO_HEADER_OUT:
-    text = "=> Send header";
-    break;
-  case CURLINFO_DATA_OUT:
-    text = "=> Send data";
-    break;
-  case CURLINFO_SSL_DATA_OUT:
-    text = "=> Send SSL data";
-    break;
-  case CURLINFO_HEADER_IN:
-    text = "<= Recv header";
-    break;
-  case CURLINFO_DATA_IN:
-    text = "<= Recv data";
-    break;
-  case CURLINFO_SSL_DATA_IN:
-    text = "<= Recv SSL data";
-    break;
-  }
-
-  std::cerr << "\n================== " << text
-            << " ==================" << std::endl
-            << data << std::endl;
-  return 0;
-}
-#endif
-
-enum HTTPCode : long
-{
-  Ok = 200,
-  Not_Found = 404
-};
-
-struct HTTPResponse
-{
-  HTTPCode code;
-  std::string body;
-};
-
-uint curlWriter(char *in, uint size, uint nmemb, std::stringstream *out)
-{
-  uint r;
-  r = size * nmemb;
-  out->write(in, r);
-  return r;
-}
-
-HTTPResponse REQUEST(const std::string &url, const std::string &payload = "",
-                     const std::string &method = "GET")
-{
-  HTTPResponse response;
-
-  CURL *curl;
-  CURLcode res;
-  struct curl_slist *headers = NULL;
-
-  curl_global_init(CURL_GLOBAL_ALL);
-
-  curl = curl_easy_init();
-  if (curl) {
-    std::stringstream buffer;
-
-#ifdef DEBUG_HTTP
-    curl_easy_setopt(curl, CURLOPT_DEBUGFUNCTION, my_trace);
-    curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
-#endif
-
-    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curlWriter);
-    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);
-
-    if (!payload.empty()) {
-      curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, payload.size());
-      curl_easy_setopt(curl, CURLOPT_POSTFIELDS, payload.c_str());
-      headers = curl_slist_append(headers, "Content-Type: Application/Json");
-    }
-    curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, method.c_str());
-    headers = curl_slist_append(headers, "Expect:");
-    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
-
-    res = curl_easy_perform(curl);
-
-    if (res != CURLE_OK) {
-      curl_easy_cleanup(curl);
-      throw std::runtime_error(std::string{"CURL Failed: "} +
-                               curl_easy_strerror(res));
-    }
-    curl_easy_cleanup(curl);
-
-    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response.code);
-    response.body = buffer.str();
-  }
-
-  curl_global_cleanup();
-
-  return response;
-}
-
 TEST_CASE("rest_endpoint", "[server_basic]")
 {
   std::stringstream ss;
@@ -138,7 +26,7 @@ TEST_CASE("rest_endpoint", "[server_basic]")
 
   const size_t nDAGRunners = 10, nWebThreads = 10;
 
-  daggy::Server server(listenSpec, logger, executor, nDAGRunners);
+  daggy::daggyd::Server server(listenSpec, logger, executor, nDAGRunners);
   server.init(nWebThreads);
   server.start();
 
@@ -147,13 +35,13 @@ TEST_CASE("rest_endpoint", "[server_basic]")
 
   SECTION("Ready Endpoint")
   {
-    auto response = REQUEST(baseURL + "/ready");
+    auto response = HTTP_REQUEST(baseURL + "/ready");
     REQUIRE(response.code == HTTPCode::Ok);
   }
 
   SECTION("Querying a non-existent dagrunid should fail ")
   {
-    auto response = REQUEST(baseURL + "/v1/dagrun/100");
+    auto response = HTTP_REQUEST(baseURL + "/v1/dagrun/100");
     REQUIRE(response.code != HTTPCode::Ok);
   }
 
@@ -175,7 +63,7 @@ TEST_CASE("rest_endpoint", "[server_basic]")
   // Submit, and get the runID
   daggy::DAGRunID runID = 0;
   {
-    auto response = REQUEST(baseURL + "/v1/dagrun/", dagRun, "POST");
+    auto response = HTTP_REQUEST(baseURL + "/v1/dagrun/", dagRun, "POST");
     REQUIRE(response.code == HTTPCode::Ok);
 
     rj::Document doc;
@@ -188,7 +76,7 @@ TEST_CASE("rest_endpoint", "[server_basic]")
 
   // Ensure our runID shows up in the list of running DAGs
   {
-    auto response = REQUEST(baseURL + "/v1/dagruns?all=1");
+    auto response = HTTP_REQUEST(baseURL + "/v1/dagruns?all=1");
     REQUIRE(response.code == HTTPCode::Ok);
 
     rj::Document doc;
@@ -217,8 +105,8 @@ TEST_CASE("rest_endpoint", "[server_basic]")
 
   // Ensure we can get one of our tasks
   {
-    auto response = REQUEST(baseURL + "/v1/dagrun/" + std::to_string(runID) +
-                            "/task/cat_0");
+    auto response = HTTP_REQUEST(baseURL + "/v1/dagrun/" +
+                                 std::to_string(runID) + "/task/cat_0");
     REQUIRE(response.code == HTTPCode::Ok);
 
     rj::Document doc;
@@ -233,7 +121,8 @@ TEST_CASE("rest_endpoint", "[server_basic]")
   // Wait until our DAG is complete
   bool complete = true;
   for (auto i = 0; i < 10; ++i) {
-    auto response = REQUEST(baseURL + "/v1/dagrun/" + std::to_string(runID));
+    auto response =
+        HTTP_REQUEST(baseURL + "/v1/dagrun/" + std::to_string(runID));
     REQUIRE(response.code == HTTPCode::Ok);
     rj::Document doc;
     daggy::checkRJParse(doc.Parse(response.body.c_str()));
@@ -281,7 +170,7 @@ TEST_CASE("Server cancels and resumes execution", "[server_resume]")
 
   const size_t nDAGRunners = 10, nWebThreads = 10;
 
-  daggy::Server server(listenSpec, logger, executor, nDAGRunners);
+  daggy::daggyd::Server server(listenSpec, logger, executor, nDAGRunners);
   server.init(nWebThreads);
   server.start();
 
@@ -304,7 +193,7 @@ TEST_CASE("Server cancels and resumes execution", "[server_resume]")
   // Submit, and get the runID
   daggy::DAGRunID runID;
   {
-    auto response = REQUEST(baseURL + "/v1/dagrun/", dagRunJSON, "POST");
+    auto response = HTTP_REQUEST(baseURL + "/v1/dagrun/", dagRunJSON, "POST");
     REQUIRE(response.code == HTTPCode::Ok);
 
     rj::Document doc;
@@ -319,7 +208,7 @@ TEST_CASE("Server cancels and resumes execution", "[server_resume]")
 
   // Stop the current run
   {
-    auto response = REQUEST(
+    auto response = HTTP_REQUEST(
         baseURL + "/v1/dagrun/" + std::to_string(runID) + "/state/KILLED", "",
         "PATCH");
     REQUIRE(response.code == HTTPCode::Ok);
@@ -342,7 +231,7 @@ TEST_CASE("Server cancels and resumes execution", "[server_resume]")
   {
     auto url = baseURL + "/v1/dagrun/" + std::to_string(runID) +
                "/task/sleep_B_0/state/QUEUED";
-    auto response = REQUEST(url, "", "PATCH");
+    auto response = HTTP_REQUEST(url, "", "PATCH");
     REQUIRE(response.code == HTTPCode::Ok);
     REQUIRE(logger.getTaskState(runID, "sleep_B_0") ==
             +daggy::RunState::QUEUED);
@@ -355,7 +244,7 @@ TEST_CASE("Server cancels and resumes execution", "[server_resume]")
   lstat("resume_touch_A", &s);
   auto preMTime = s.st_mtim.tv_sec;
 
-  auto response = REQUEST(
+  auto response = HTTP_REQUEST(
       baseURL + "/v1/dagrun/" + std::to_string(runID) + "/state/QUEUED", "",
       "PATCH");
 
daggyr/CMakeLists.txt | 3 (new file)

@@ -0,0 +1,3 @@
+add_subdirectory(libdaggyr)
+add_subdirectory(daggyr)
+add_subdirectory(tests)
daggyr/README.md | 68 (new file)

@@ -0,0 +1,68 @@
+# Daggy Runner
+
+`daggyr` is a REST server process that acts as a remote task executor.
+
+# Running it
+
+```bash
+daggyr                # That's it; will listen on 127.0.0.1:2504 and run with a local executor
+daggyr -d             # Daemonize
+
+daggyr --config FILE  # Run with a config file
+```
+
+# Capacity and Allocation
+
+On startup, a server's capacity is determined automatically. The capacities are:
+
+| Capacity  | Determined by                         | Default                     | Notes                            |
+|-----------|---------------------------------------|-----------------------------|----------------------------------|
+| cores     | `std::thread::hardware_concurrency()` | `max(1, max - 2)`           | A value of 0 will mean all cores |
+| memory_mb | `sysinfo.h`                           | `max(100, totalram * 0.75)` | `totalram` is converted to MB    |
+
+When a `daggyd` process is selecting a runner to send a task to, it will
+query the current capacities, and choose the runner that:
+
+- Can satisfy the requirements of the task
+- Has the lowest impact, which is the largest relative drop in available capacity across all capacities.
+
+For instance, if a job were submitted that requires 2 cores and 5g of memory,
+and three runners reported the following capacities:
+
+| Runner | free_cores | impact_cores | free_memory | impact_memory | max_impact |
+|--------|------------|--------------|-------------|---------------|------------|
+| 1      | 70         | 2.8%         | 20g         | 25.00%        | 25%        |
+| 2      | 4          | 50.0%        | 80g         | 6.25%         | 50%        |
+| 3      | 10         | 20.0%        | 30g         | 16.67%        | 20%        |
+
+Runner 3 would be selected. Even though it doesn't have the most memory
+or CPU capacity, allocating the job to it minimizes the impact to the
+overall availability.
+
+# Submission and Execution
+
+Tasks submitted to the runner will be executed with [cgroups](https://www.man7.org/linux/man-pages/man7/cgroups.7.html)
+to enforce limits.
+
+Jobs are submitted asynchronously, and rely on the client to poll for
+results using `GET /api/v1/task/:task_id` to get the resulting
+TaskAttempt.
+
+Runners are **stateless**, meaning that killing one will kill any
+running tasks, and any stored results will be lost.
+
+# Config Files
+
+```json
+{
+  "web-threads": 50,
+  "port": 2504,
+  "ip": "localhost",
+  "capacity_overrides": {
+    "cores": 10,
+    "memory_mb": 100
+  }
+}
+```
+
+Capacities can be overridden from the auto-discovered results.
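The allocation rule in this README reduces to "reject runners that can't fit the task, then minimize the worst relative capacity drop." Here is a minimal C++ sketch of that choice, assuming each runner reports its free cores and free memory the way `GET /v1/capacity` does; the struct and function names are illustrative, not the project's actual API:

```cpp
#include <algorithm>
#include <optional>
#include <string>
#include <vector>

// Illustrative capacity report; the field names are assumptions.
struct RunnerCapacity {
  std::string url;
  double freeCores;
  double freeMemoryMB;
};

// Pick the runner whose worst-case relative capacity drop is smallest.
std::optional<std::string> pickRunner(
    const std::vector<RunnerCapacity> &runners, double cores, double memoryMB)
{
  std::optional<std::string> best;
  double bestImpact = 2.0; // impacts are fractions in [0, 1]
  for (const auto &r : runners) {
    if (r.freeCores < cores || r.freeMemoryMB < memoryMB)
      continue; // cannot satisfy the task's requirements
    // "Impact" = largest relative drop across all capacities.
    double impact = std::max(cores / r.freeCores, memoryMB / r.freeMemoryMB);
    if (impact < bestImpact) {
      bestImpact = impact;
      best = r.url;
    }
  }
  return best; // empty if no runner can take the task
}
```

Fed the three-runner example above (2 cores, 5g of memory), this returns runner 3's URL: its impacts are 20% cores and 16.67% memory, so its max impact of 20% beats runner 1's 25% and runner 2's 50%, matching the table.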
daggyr/daggyr/CMakeLists.txt | 4 (new file)

@@ -0,0 +1,4 @@
+project(daggyr)
+file(GLOB SOURCES daggyr.cpp)
+add_executable(${PROJECT_NAME} ${SOURCES})
+target_link_libraries(${PROJECT_NAME} argparse libdaggyr libdaggy curl)
daggyr/daggyr/daggyr.cpp | 193 (new file)

@@ -0,0 +1,193 @@
+#include <rapidjson/document.h>
+#include <sys/stat.h>
+#include <sys/sysinfo.h>
+
+#include <argparse.hpp>
+#include <atomic>
+#include <csignal>
+#include <daggy/Serialization.hpp>
+#include <daggyr/Server.hpp>
+#include <fstream>
+#include <iostream>
+
+// Add executors here
+#include <daggy/executors/task/ForkingTaskExecutor.hpp>
+#include <daggy/executors/task/SlurmTaskExecutor.hpp>
+
+// Add loggers here
+#include <daggy/executors/task/TaskExecutor.hpp>
+#include <daggy/loggers/dag_run/DAGRunLogger.hpp>
+#include <daggy/loggers/dag_run/OStreamLogger.hpp>
+#include <daggy/loggers/dag_run/RedisLogger.hpp>
+
+namespace rj = rapidjson;
+
+static std::atomic<bool> running{true};
+
+void signalHandler(int signal)
+{
+  switch (signal) {
+  case SIGHUP:
+    break;
+  case SIGINT:
+  case SIGTERM:
+    running = false;
+    break;
+  default:
+    break;
+  }
+}
+
+void daemonize()
+{
+  pid_t pid;
+
+  struct sigaction newSigAction;
+  sigset_t newSigSet;
+
+  /* Check if parent process id is set */
+  if (getppid() == 1) {
+    return;
+  }
+
+  /* Set signal mask - signals we want to block */
+  sigemptyset(&newSigSet);
+  sigaddset(&newSigSet,
+            SIGCHLD); /* ignore child - i.e. we don't need to wait for it */
+  sigaddset(&newSigSet, SIGTSTP); /* ignore Tty stop signals */
+  sigaddset(&newSigSet, SIGTTOU); /* ignore Tty background writes */
+  sigaddset(&newSigSet, SIGTTIN); /* ignore Tty background reads */
+  sigprocmask(SIG_BLOCK, &newSigSet,
+              nullptr); /* Block the above specified signals */
+
+  /* Set up a signal handler */
+  newSigAction.sa_handler = signalHandler;
+  sigemptyset(&newSigAction.sa_mask);
+  newSigAction.sa_flags = 0;
+
+  /* Signals to handle */
+  sigaction(SIGHUP, &newSigAction, nullptr);  /* catch hangup signal */
+  sigaction(SIGTERM, &newSigAction, nullptr); /* catch term signal */
+  sigaction(SIGINT, &newSigAction, nullptr);  /* catch interrupt signal */
+
+  // Fork once
+  pid = fork();
+  if (pid < 0) {
+    exit(EXIT_FAILURE);
+  }
+  if (pid > 0) {
+    exit(EXIT_SUCCESS);
+  }
+
+  /* On success: The child process becomes session leader */
+  if (setsid() < 0) {
+    std::cerr << "Unable to setsid" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  /* Catch, ignore and handle signals */
+  signal(SIGCHLD, SIG_IGN);
+  signal(SIGHUP, SIG_IGN);
+
+  /* Fork off for the second time */
+  pid = fork();
+  if (pid < 0)
+    exit(EXIT_FAILURE);
+  if (pid > 0)
+    exit(EXIT_SUCCESS);
+
+  umask(0);
+
+  /* Change the working directory to the root directory */
+  /* or another appropriated directory */
+  auto rc = chdir("/");
+  (void)rc;
+
+  /* Close all open file descriptors */
+  for (int x = sysconf(_SC_OPEN_MAX); x >= 0; x--) {
+    close(x);
+  }
+}
+
+int main(int argc, char **argv)
+{
+  argparse::ArgumentParser args("Daggy");
+
+  args.add_argument("-v", "--verbose")
+      .default_value(false)
+      .implicit_value(true);
+  args.add_argument("-d", "--daemon").default_value(false).implicit_value(true);
+  args.add_argument("--config").default_value(std::string{});
+
+  try {
+    args.parse_args(argc, argv);
+  }
+  catch (std::exception &e) {
+    std::cout << "Error: " << e.what() << std::endl;
+    std::cout << args;
+    exit(1);
+  }
+
+  struct sysinfo systemInfo;
+
+  sysinfo(&systemInfo);
+
+  bool verbose = args.get<bool>("--verbose");
+  bool asDaemon = args.get<bool>("--daemon");
+  auto configFile = args.get<std::string>("--config");
+  std::string listenIP = "127.0.0.1";
+  int listenPort = 2504;
+  size_t webThreads = 50;
+  ssize_t maxCores = std::max(1U, std::thread::hardware_concurrency() - 2);
+  ssize_t maxMemoryMB =
+      std::max((systemInfo.totalram / (1024 * 1024) * 0.75), 100.0);
+
+  if (!configFile.empty()) {
+    std::ifstream ifh(configFile);
+    std::string config;
+    std::getline(ifh, config, '\0');
+    ifh.close();
+
+    rj::Document doc;
+    daggy::checkRJParse(doc.Parse(config.c_str()));
+
+    if (doc.HasMember("ip"))
+      listenIP = doc["ip"].GetString();
+    if (doc.HasMember("port"))
+      listenPort = doc["port"].GetInt();
+    if (doc.HasMember("web-threads"))
+      webThreads = doc["web-threads"].GetInt64();
+    if (doc.HasMember("capacity-overrides")) {
+      const auto &co = doc["capacity-overrides"];
+      if (co.HasMember("cores"))
+        maxCores = co["cores"].GetInt64();
+      if (co.HasMember("memoryMB"))
+        maxMemoryMB = co["memoryMB"].GetInt64();
+    }
+  }
+
+  if (verbose) {
+    std::cout << "Server running at http://" << listenIP << ':' << listenPort
+              << std::endl
+              << "Max Cores: " << maxCores << std::endl
+              << "Max Memory: " << maxMemoryMB << " MB" << std::endl
+              << "Max Web Clients: " << webThreads << std::endl
+              << std::endl
+              << "Ctrl-C to exit" << std::endl;
+  }
+
+  if (asDaemon) {
+    daemonize();
+  }
+
+  Pistache::Address listenSpec(listenIP, listenPort);
+  daggy::daggyr::Server server(listenSpec, maxCores, maxMemoryMB);
+  server.init(webThreads);
+  server.start();
+
+  running = true;
+  while (running) {
+    std::this_thread::sleep_for(std::chrono::seconds(30));
+  }
+  server.shutdown();
+}
daggyr/libdaggyr/CMakeLists.txt | 8 (new file)

@@ -0,0 +1,8 @@
+project(libdaggyr)
+
+add_library(${PROJECT_NAME} STATIC)
+
+target_include_directories(${PROJECT_NAME} PUBLIC include)
+target_link_libraries(${PROJECT_NAME} libdaggy stdc++fs)
+
+add_subdirectory(src)
daggyr/libdaggyr/include/daggyr/Server.hpp | 84 (new file)

@@ -0,0 +1,84 @@
+#pragma once
+
+#include <pistache/description.h>
+#include <pistache/endpoint.h>
+#include <pistache/http.h>
+
+#include <daggy/DAGRunner.hpp>
+#include <daggy/ThreadPool.hpp>
+#include <daggy/executors/task/DaggyRunnerTaskExecutor.hpp>
+#include <daggy/executors/task/ForkingTaskExecutor.hpp>
+#include <daggy/loggers/dag_run/DAGRunLogger.hpp>
+#include <filesystem>
+
+#define DAGGY_REST_HANDLER(func) \
+  void func(const Pistache::Rest::Request &request, \
+            Pistache::Http::ResponseWriter response);
+
+namespace fs = std::filesystem;
+using namespace daggy::executors::task::daggy_runner;
+
+namespace daggy::daggyr {
+
+  class Server
+  {
+  public:
+    Server(const Pistache::Address &listenSpec, ssize_t maxCores,
+           ssize_t maxMemoryMB);
+    ~Server();
+
+    Server &setSSLCertificates(const fs::path &cert, const fs::path &key);
+
+    void init(size_t threads = 1);
+
+    void start();
+
+    uint16_t getPort() const;
+
+    void shutdown();
+
+    static void validateTask(const Task &task);
+
+  private:
+    void createDescription();
+
+    bool handleAuth(const Pistache::Rest::Request &request);
+
+    DAGGY_REST_HANDLER(handleReady);
+    DAGGY_REST_HANDLER(handleGetCapacity);
+    DAGGY_REST_HANDLER(handleRunTask);
+    DAGGY_REST_HANDLER(handleGetTask);
+    DAGGY_REST_HANDLER(handleStopTask);
+    DAGGY_REST_HANDLER(handleValidateTask);
+
+    Pistache::Http::Endpoint endpoint_;
+    Pistache::Rest::Description desc_;
+    Pistache::Rest::Router router_;
+
+    executors::task::ForkingTaskExecutor executor_;
+
+    struct TaskRecord
+    {
+      RunState state;
+      AttemptRecord attempt;
+    };
+
+    std::mutex capacityGuard_;
+    Capacity maxCapacity_;
+    Capacity curCapacity_;
+
+    std::mutex pendingGuard_;
+
+    struct PendingJob
+    {
+      std::future<AttemptRecord> fut;
+      Capacity resourcesUsed;
+    };
+
+    std::unordered_map<std::pair<DAGRunID, std::string>, PendingJob> pending_;
+
+    std::mutex resultsGuard_;
+    std::unordered_map<std::pair<DAGRunID, std::string>, AttemptRecord>
+        results_;
+  };
+} // namespace daggy::daggyr
daggyr/libdaggyr/src/CMakeLists.txt | 3 (new file)

@@ -0,0 +1,3 @@
+target_sources(${PROJECT_NAME} PRIVATE
+  Server.cpp
+)
daggyr/libdaggyr/src/Server.cpp | 259 (new file)

@@ -0,0 +1,259 @@
+#include <enum.h>
+
+#include <daggy/Serialization.hpp>
+#include <daggy/executors/task/DaggyRunnerTaskExecutor.hpp>
+#include <daggyr/Server.hpp>
+#include <iomanip>
+#include <mutex>
+#include <sstream>
+#include <stdexcept>
+#include <thread>
+#include <utility>
+
+#define REQ_RESPONSE(code, msg) \
+  std::stringstream ss; \
+  ss << R"({"message": )" << std::quoted(msg) << "}"; \
+  response.send(Pistache::Http::Code::code, ss.str()); \
+  return;
+
+using namespace Pistache;
+
+namespace daggy::daggyr {
+  void Server::init(size_t threads)
+  {
+    auto opts = Http::Endpoint::options()
+                    .threads(threads)
+                    .flags(Pistache::Tcp::Options::ReuseAddr |
+                           Pistache::Tcp::Options::ReusePort)
+                    .maxRequestSize(102400)
+                    .maxResponseSize(102400);
+    endpoint_.init(opts);
+    createDescription();
+  }
+
+  Server::Server(const Pistache::Address &listenSpec, ssize_t maxCores,
+                 ssize_t maxMemoryMB)
+      : endpoint_(listenSpec)
+      , desc_("Daggy Runner API", "0.1")
+      , executor_(maxCores)
+      , maxCapacity_{maxCores, maxMemoryMB}
+      , curCapacity_{maxCores, maxMemoryMB}
+  {
+  }
+
+  Server::~Server()
+  {
+    shutdown();
+  }
+
+  void Server::start()
+  {
+    router_.initFromDescription(desc_);
+
+    endpoint_.setHandler(router_.handler());
+    endpoint_.serveThreaded();
+  }
+
+  Server &Server::setSSLCertificates(const fs::path &cert, const fs::path &key)
+  {
+    endpoint_.useSSL(cert, key);
+    return *this;
+  }
+
+  void Server::shutdown()
+  {
+    endpoint_.shutdown();
+  }
+
+  uint16_t Server::getPort() const
+  {
+    return endpoint_.getPort();
+  }
+
+  void Server::createDescription()
+  {
+    desc_.info().license("MIT", "https://opensource.org/licenses/MIT");
+
+    desc_.schemes(Rest::Scheme::Http)
+        .basePath("/v1")
+        .produces(MIME(Application, Json))
+        .consumes(MIME(Application, Json));
+
+    desc_.route(desc_.get("/ready"))
+        .bind(&Server::handleReady, this)
+        .response(Http::Code::Ok, "Response to the /ready call")
+        .hide();
+
+    auto versionPath = desc_.path("/v1");
+
+    versionPath.route(desc_.post("/validate"))
+        .bind(&Server::handleValidateTask, this)
+        .produces(MIME(Application, Json))
+        .response(Http::Code::Ok, "Validate a task");
+
+    versionPath.route(desc_.post("/task/:runID/:taskName"))
+        .bind(&Server::handleRunTask, this)
+        .produces(MIME(Application, Json))
+        .response(Http::Code::Ok, "Run a task");
+
+    versionPath.route(desc_.get("/task/:runID/:taskName"))
+        .bind(&Server::handleGetTask, this)
+        .produces(MIME(Application, Json))
+        .response(Http::Code::Ok,
+                  "Get the state and potentially the AttemptRecord of a task");
+
+    versionPath.route(desc_.del("/task/:runID/:taskName"))
+        .bind(&Server::handleStopTask, this)
+        .produces(MIME(Application, Json))
+        .response(Http::Code::Ok, "Stop a task");
+
+    versionPath.route(desc_.get("/capacity"))
+        .bind(&Server::handleGetCapacity, this)
+        .produces(MIME(Application, Json))
+        .response(Http::Code::Ok, "Get capacities of worker");
+  }
+
+  void Server::handleValidateTask(const Pistache::Rest::Request &request,
+                                  Pistache::Http::ResponseWriter response)
+  {
+    try {
+      auto task = taskFromJSON("sample_task", request.body());
+      daggy::executors::task::daggy_runner::validateTaskParameters(task.job);
+    }
+    catch (std::exception &e) {
+      REQ_RESPONSE(Not_Acceptable, e.what());
+    }
+    REQ_RESPONSE(Ok, "Task is valid");
+  }
+
+  void Server::handleRunTask(const Pistache::Rest::Request &request,
+                             Pistache::Http::ResponseWriter response)
+  {
+    if (!handleAuth(request))
+      return;
+
+    auto runID = request.param(":runID").as<DAGRunID>();
+    auto taskName = request.param(":taskName").as<std::string>();
+
+    Capacity resourcesUsed;
+    Task task;
+    try {
+      task = taskFromJSON(taskName, request.body());
+      resourcesUsed = capacityFromTask(task);
+    }
+    catch (std::exception &e) {
+      REQ_RESPONSE(Not_Acceptable, e.what());
+    }
+
+    {
+      std::lock_guard<std::mutex> lock(capacityGuard_);
+      curCapacity_.cores -= resourcesUsed.cores;
+      curCapacity_.memoryMB -= resourcesUsed.memoryMB;
+    }
+
+    {
+      std::lock_guard<std::mutex> lock(pendingGuard_);
+      pending_.emplace(
+          std::make_pair(runID, taskName),
+          PendingJob{.fut = executor_.execute(runID, taskName, task),
+                     .resourcesUsed = resourcesUsed});
+    }
+
+    response.send(Pistache::Http::Code::Ok, "");
+  }
+
+  void Server::handleGetTask(const Pistache::Rest::Request &request,
+                             Pistache::Http::ResponseWriter response)
+  {
+    if (!handleAuth(request))
+      return;
+
+    auto runID = request.param(":runID").as<DAGRunID>();
+    auto taskName = request.param(":taskName").as<std::string>();
+
+    auto taskID = std::make_pair(runID, taskName);
+
+    std::string payload;
+
+    bool found = false;
+    {
+      std::lock_guard<std::mutex> lock(pendingGuard_);
+      auto it = pending_.find(taskID);
+      if (it != pending_.end()) {
+        // poll it
+        if (it->second.fut.valid() and
+            it->second.fut.wait_for(1ms) == std::future_status::ready) {
+          auto attempt = it->second.fut.get();
+          {
+            std::lock_guard<std::mutex> rlock(resultsGuard_);
+            results_.emplace(taskID, attempt);
+          }
+          {
+            std::lock_guard<std::mutex> rlock(capacityGuard_);
+            curCapacity_.cores += it->second.resourcesUsed.cores;
+            curCapacity_.memoryMB += it->second.resourcesUsed.memoryMB;
+          }
+        }
+        else {
+          payload = R"({ "state": "RUNNING" })";
+          found = true;
+        }
+      }
+    }
+
+    if (!found) {
+      std::lock_guard<std::mutex> lock(resultsGuard_);
+      auto it = results_.find(taskID);
+      if (it == results_.end()) {
+        REQ_RESPONSE(Not_Found, "No such task");
+      }
+
+      payload = R"({ "state": "COMPLETED", "attempt": )" +
+                attemptRecordToJSON(it->second) + "}";
+    }
+    response.send(Pistache::Http::Code::Ok, payload);
+  }
+
+  void Server::handleStopTask(const Pistache::Rest::Request &request,
+                              Pistache::Http::ResponseWriter response)
+  {
+    if (!handleAuth(request))
+      return;
+
+    auto runID = request.param(":runID").as<DAGRunID>();
+    auto taskName = request.param(":taskName").as<std::string>();
+
+    executor_.stop(runID, taskName);
+
+    REQ_RESPONSE(Ok, "");
+  }
+
+  void Server::handleGetCapacity(const Pistache::Rest::Request &request,
+                                 Pistache::Http::ResponseWriter response)
+  {
+    std::string payload;
+    {
+      std::lock_guard<std::mutex> lock(capacityGuard_);
+      payload = R"({ "current": )" + capacityToJSON(curCapacity_) +
+                R"(, "total": )" + capacityToJSON(maxCapacity_) + "}";
+    }
+
+    response.send(Pistache::Http::Code::Ok, payload);
+  }
+
+  void Server::handleReady(const Pistache::Rest::Request &request,
+                           Pistache::Http::ResponseWriter response)
+  {
+    response.send(Pistache::Http::Code::Ok, R"({ "msg": "Ready for tasks!"})");
+  }
+
+  /*
+   * handleAuth will check any auth methods and handle any responses in the
+   * case of failed auth. If it returns false, callers should cease handling
+   * the response
+   */
+  bool Server::handleAuth(const Pistache::Rest::Request &request)
+  {
+    return true;
+  }
+} // namespace daggy::daggyr
daggyr/tests/CMakeLists.txt | 9 (new file)

@@ -0,0 +1,9 @@
+project(daggyr_tests)
+
+add_executable(${PROJECT_NAME} main.cpp
+  # unit tests
+  unit_server.cpp
+)
+target_link_libraries(${PROJECT_NAME} libdaggyr libdaggy stdc++fs Catch2::Catch2)
+
+add_test(${PROJECT_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME})
daggyr/tests/main.cpp | 15 (new file)

@@ -0,0 +1,15 @@
+#include <iostream>
+
+#include "daggy/DAG.hpp"
+
+#define CATCH_CONFIG_MAIN
+
+#include <catch2/catch.hpp>
+
+TEST_CASE("Sanity tests", "[sanity]")
+{
+  REQUIRE(1 == 1);
+}
+
+// compile and run
+// g++ -std=c++17 -o test test.cpp && ./test
daggyr/tests/unit_server.cpp | 172 (new file)

@@ -0,0 +1,172 @@
+#include <curl/curl.h>
+#include <rapidjson/document.h>
+#include <sys/stat.h>
+
+#include <catch2/catch.hpp>
+#include <daggy/Serialization.hpp>
+#include <daggy/executors/task/ForkingTaskExecutor.hpp>
+#include <daggy/executors/task/NoopTaskExecutor.hpp>
+#include <daggy/loggers/dag_run/OStreamLogger.hpp>
+#include <daggyr/Server.hpp>
+#include <filesystem>
+#include <iostream>
+#include <thread>
+
+namespace rj = rapidjson;
+
+using namespace daggy;
+
+TEST_CASE("rest_endpoint", "[server_basic]")
+{
+  std::stringstream ss;
+  Pistache::Address listenSpec("localhost", Pistache::Port(0));
+
+  const ssize_t maxCores = 10, maxMemoryMB = 1000;
+
+  daggyr::Server server(listenSpec, maxCores, maxMemoryMB);
+  server.init(10);
+  server.start();
+
+  const std::string host = "localhost:";
+  const std::string baseURL = host + std::to_string(server.getPort());
+
+  SECTION("Ready Endpoint")
+  {
+    auto response = HTTP_REQUEST(baseURL + "/ready");
+    REQUIRE(response.code == HTTPCode::Ok);
+  }
+
+  SECTION("Querying a non-existent task should yield a 404")
+  {
+    auto response = HTTP_REQUEST(baseURL + "/v1/task/100/sample_name");
+    REQUIRE(response.code == HTTPCode::Not_Found);
+  }
+
+  SECTION("Task Missing Cores should Fail")
+  {
+    std::string taskSpec =
+        R"({ "job": { "command": [ "/usr/bin/touch", "dagrun_{{FILE}}" ]}, "memoryMB": 100 })";
+
+    auto response =
+        HTTP_REQUEST(baseURL + "/v1/task/0/sample_task", taskSpec, "POST");
+    REQUIRE(response.code == HTTPCode::Not_Acceptable);
+  }
+
+  SECTION("Task Missing MemoryMB should Fail")
+  {
+    std::string taskSpec =
+        R"({ "job": { "command": [ "/usr/bin/touch", "dagrun_{{FILE}}" ]}, "cores": 100 })";
+
+    auto response =
+        HTTP_REQUEST(baseURL + "/v1/task/0/sample_task", taskSpec, "POST");
+    REQUIRE(response.code == HTTPCode::Not_Acceptable);
+  }
+
+  SECTION("Task submission and get result")
+  {
+    std::string taskSpec =
+        R"({ "job": { "command": [ "/usr/bin/echo", "hello", "world" ], "cores": "1", "memoryMB": "100" } })";
+
+    // Submit
+    {
+      auto response =
+          HTTP_REQUEST(baseURL + "/v1/task/0/sample_task", taskSpec, "POST");
+      REQUIRE(response.code == HTTPCode::Ok);
+    }
+
+    while (true) {
+      auto [code, doc] = JSON_HTTP_REQUEST(baseURL + "/v1/task/0/sample_task");
+      REQUIRE(doc.IsObject());
+      REQUIRE(doc.HasMember("state"));
+
+      std::string state = doc["state"].GetString();
+      if (state != "COMPLETED") {
+        std::this_thread::sleep_for(250ms);
+      }
+      else {
+        REQUIRE(doc.HasMember("attempt"));
+        auto attempt = attemptRecordFromJSON(doc["attempt"]);
+
+        REQUIRE(attempt.rc == 0);
+        REQUIRE(attempt.outputLog == "hello world\n");
+        break;
+      }
+    }
+  }
+
+  SECTION("Task capacity changes")
+  {
+    std::string taskSpec =
+        R"({ "job": { "command": [ "/usr/bin/sleep", "5" ], "cores": "1", "memoryMB": "100" } })";
+
+    auto getCapacity = [&]() -> daggy::executors::task::daggy_runner::Capacity {
+      daggy::executors::task::daggy_runner::Capacity cap;
+      auto [code, doc] = JSON_HTTP_REQUEST(baseURL + "/v1/capacity");
+      REQUIRE(doc.IsObject());
+      REQUIRE(doc.HasMember("current"));
+      const auto &cur = doc["current"];
+      REQUIRE(cur.IsObject());
+      REQUIRE(cur.HasMember("cores"));
+      REQUIRE(cur.HasMember("memoryMB"));
+
+      cap.cores = cur["cores"].GetInt64();
+      cap.memoryMB = cur["memoryMB"].GetInt64();
+
+      return cap;
+    };
+
+    auto preCap = getCapacity();
+
+    // Submit
+    {
+      auto response =
+          HTTP_REQUEST(baseURL + "/v1/task/0/sample_task", taskSpec, "POST");
+      REQUIRE(response.code == HTTPCode::Ok);
+    }
+
+    auto postCap = getCapacity();
+
+    REQUIRE(postCap.cores == preCap.cores - 1);
+    REQUIRE(postCap.memoryMB == preCap.memoryMB - 100);
+
+    // Ensure the current job is running
+    {
+      auto [code, doc] = JSON_HTTP_REQUEST(baseURL + "/v1/task/0/sample_task");
+      REQUIRE(doc.IsObject());
+      REQUIRE(doc.HasMember("state"));
+      REQUIRE(doc["state"] != "COMPLETED");
+    }
+
+    // Stop it
+    {
+      auto [code, doc] =
+          JSON_HTTP_REQUEST(baseURL + "/v1/task/0/sample_task", "", "DELETE");
+      REQUIRE(code == HTTPCode::Ok);
+    }
+
+    // Grab it and ensure it was killed
+    while (true) {
+      auto response = HTTP_REQUEST(baseURL + "/v1/task/0/sample_task");
+
+      REQUIRE(response.code == HTTPCode::Ok);
+      rj::Document doc;
+      daggy::checkRJParse(doc.Parse(response.body.c_str()));
+      REQUIRE(doc.IsObject());
+      REQUIRE(doc.HasMember("state"));
+
+      std::string state = doc["state"].GetString();
+      if (state != "COMPLETED") {
+        std::this_thread::sleep_for(250ms);
+      }
+      else {
+        REQUIRE(doc.HasMember("attempt"));
+        auto attempt = attemptRecordFromJSON(doc["attempt"]);
+
+        REQUIRE(attempt.rc != 0);
+        break;
+      }
+    }
+  }
+
+  server.shutdown();
+}
@@ -11,7 +11,7 @@ IF (DAGGY_ENABLE_REDIS)
 endif ()
 
 target_include_directories(${PROJECT_NAME} PUBLIC include)
-target_link_libraries(${PROJECT_NAME} pistache pthread rapidjson better-enums)
+target_link_libraries(${PROJECT_NAME} pistache curl pthread rapidjson better-enums)
 
 add_subdirectory(src)
 add_subdirectory(tests)
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <curl/curl.h>
 #include <rapidjson/document.h>
 
 #include <string>
@@ -12,6 +13,8 @@
 #include "daggy/executors/task/TaskExecutor.hpp"
 #include "daggy/loggers/dag_run/DAGRunLogger.hpp"
 
+namespace rj = rapidjson;
+
 namespace daggy {
   using TaskDAG = DAG<std::string, Task>;
 
@@ -40,4 +43,48 @@ namespace daggy {
   void updateDAGFromTasks(TaskDAG &dag, const TaskSet &tasks);
 
   std::ostream &operator<<(std::ostream &os, const TimePoint &tp);
+
+  // HTTP helpers
+  enum HTTPCode : long
+  {
+    Ok = 200,
+    Not_Found = 404,
+    Not_Acceptable = 406
+  };
+
+  struct HTTPResponse
+  {
+    HTTPCode code;
+    std::string body;
+  };
+
+  HTTPResponse HTTP_REQUEST(const std::string &url,
+                            const std::string &payload = "",
+                            const std::string &method = "GET",
+                            bool trace = false);
+
+  std::pair<HTTPCode, rj::Document> JSON_HTTP_REQUEST(
+      const std::string &url, const std::string &payload = "",
+      const std::string &method = "GET", bool trace = false);
 } // namespace daggy
+
+template <typename T>
+void hash_combine(std::size_t &seed, T const &key)
+{
+  std::hash<T> hasher;
+  seed ^= hasher(key) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+namespace std {
+  template <typename T1, typename T2>
+  struct hash<std::pair<T1, T2>>
+  {
+    std::size_t operator()(std::pair<T1, T2> const &p) const
+    {
+      std::size_t seed(0);
+      ::hash_combine(seed, p.first);
+      ::hash_combine(seed, p.second);
+      return seed;
+    }
+  };
+} // namespace std
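A note on the `hash_combine` helper and the `std::hash<std::pair>` specialization added above: the standard library provides no hash for `std::pair`, and both `daggyr`'s `pending_`/`results_` maps and the executor's `runningTasks_` map are keyed by `(DAGRunID, taskName)` pairs. A self-contained sketch of the same pattern; the map contents are invented for illustration:

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>

// Boost-style hash_combine: mixes a key's hash into an accumulated seed.
template <typename T>
void hash_combine(std::size_t &seed, T const &key)
{
  std::hash<T> hasher;
  seed ^= hasher(key) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

namespace std {
  template <typename T1, typename T2>
  struct hash<std::pair<T1, T2>>
  {
    std::size_t operator()(std::pair<T1, T2> const &p) const
    {
      std::size_t seed(0);
      ::hash_combine(seed, p.first);
      ::hash_combine(seed, p.second);
      return seed;
    }
  };
} // namespace std

int main()
{
  // Without the specialization above, this map would not compile:
  // std::pair has no std::hash by default.
  std::unordered_map<std::pair<std::size_t, std::string>, int> attempts;
  attempts[{7, "cat_0"}] = 1;
  std::cout << attempts.at({7, "cat_0"}) << '\n'; // prints 1
}
```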
@@ -0,0 +1,69 @@
+#pragma once
+
+#include <rapidjson/document.h>
+
+#include "TaskExecutor.hpp"
+
+namespace rj = rapidjson;
+
+namespace daggy::executors::task {
+
+  namespace daggy_runner {
+    struct Capacity
+    {
+      ssize_t cores;
+      ssize_t memoryMB;
+    };
+
+    std::string capacityToJSON(const Capacity &cap);
+    Capacity capacityFromJSON(const rj::Value &spec);
+    Capacity capacityFromTask(const Task &task);
+
+    void validateTaskParameters(const ConfigValues &job);
+  } // namespace daggy_runner
+
+  class DaggyRunnerTaskExecutor : public TaskExecutor
+  {
+  public:
+    using Command = std::vector<std::string>;
+
+    DaggyRunnerTaskExecutor();
+    ~DaggyRunnerTaskExecutor() override;
+
+    // Validates the job to ensure that all required values are set and are of
+    // the right type.
+    bool validateTaskParameters(const ConfigValues &job) override;
+
+    std::vector<ConfigValues> expandTaskParameters(
+        const ConfigValues &job, const ConfigValues &expansionValues) override;
+
+    // Runs the task
+    std::future<AttemptRecord> execute(DAGRunID runID,
+                                       const std::string &taskName,
+                                       const Task &task) override;
+
+    bool stop(DAGRunID runID, const std::string &taskName) override;
+
+    void addRunner(const std::string &url);
+
+  private:
+    void monitor();
+
+    struct RunningTask
+    {
+      std::promise<AttemptRecord> prom;
+      DAGRunID runID;
+      std::string taskName;
+      std::string runnerURL;
+    };
+
+    // Resolves jobs through polling
+    std::atomic<bool> running_;
+    std::thread monitorWorker_;
+
+    std::unordered_set<std::string> runners_;
+    std::mutex rtGuard_;
+    std::unordered_map<std::pair<DAGRunID, std::string>, RunningTask>
+        runningTasks_;
+  };
+} // namespace daggy::executors::task
@@ -5,6 +5,10 @@
 #include "TaskExecutor.hpp"
 
 namespace daggy::executors::task {
+  namespace forking_executor {
+    void validateTaskParameters(const ConfigValues &job);
+  }
+
   class ForkingTaskExecutor : public TaskExecutor
   {
   public:
@@ -153,6 +153,7 @@ namespace daggy {
|
|||||||
for (auto it = params.MemberBegin(); it != params.MemberEnd(); ++it) {
|
for (auto it = params.MemberBegin(); it != params.MemberEnd(); ++it) {
|
||||||
if (!it->name.IsString())
|
if (!it->name.IsString())
|
||||||
throw std::runtime_error("job key must be a string.");
|
throw std::runtime_error("job key must be a string.");
|
||||||
|
|
||||||
if (it->value.IsArray()) {
|
if (it->value.IsArray()) {
|
||||||
std::vector<std::string> values;
|
std::vector<std::string> values;
|
||||||
for (size_t i = 0; i < it->value.Size(); ++i) {
|
for (size_t i = 0; i < it->value.Size(); ++i) {
|
||||||
@@ -160,10 +161,13 @@ namespace daggy {
|
|||||||
}
|
}
|
||||||
task.job.insert_or_assign(it->name.GetString(), values);
|
task.job.insert_or_assign(it->name.GetString(), values);
|
||||||
}
|
}
|
||||||
else {
|
else if (it->value.IsString()) {
|
||||||
task.job.insert_or_assign(it->name.GetString(),
|
task.job.insert_or_assign(it->name.GetString(),
|
||||||
it->value.GetString());
|
it->value.GetString());
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
throw std::runtime_error("Value in parameters is not a string");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
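The effect of this change: each member of a task's parameter object must now be either a single string or an array of strings, and anything else is rejected up front instead of being read blindly. An annotated illustration (the comments are not part of the JSON, and the key names are invented):

    {
      "input": "data.csv",        // accepted: single string
      "chunks": ["a", "b", "c"],  // accepted: array of strings
      "retries": 3                // rejected: "Value in parameters is not a string"
    }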
@@ -5,6 +5,53 @@

 using namespace std::chrono_literals;

+static int http_trace(CURL *handle, curl_infotype type, char *data, size_t size,
+                      void *userp)
+{
+    const char *text;
+    (void)handle; /* prevent compiler warning */
+    (void)userp;
+    (void)size;
+
+    switch (type) {
+    case CURLINFO_TEXT:
+        fprintf(stderr, "== Info: %s", data);
+        /* fall through */
+    default: /* in case a new one is introduced to shock us */
+        return 0;
+
+    case CURLINFO_HEADER_OUT:
+        text = "=> Send header";
+        break;
+    case CURLINFO_DATA_OUT:
+        text = "=> Send data";
+        break;
+    case CURLINFO_SSL_DATA_OUT:
+        text = "=> Send SSL data";
+        break;
+    case CURLINFO_HEADER_IN:
+        text = "<= Recv header";
+        break;
+    case CURLINFO_DATA_IN:
+        text = "<= Recv data";
+        break;
+    case CURLINFO_SSL_DATA_IN:
+        text = "<= Recv SSL data";
+        break;
+    }
+
+    std::cerr << "\n================== " << text
+              << " ==================" << std::endl
+              << data << std::endl;
+    return 0;
+}
+
+// libcurl write callback: uses size_t (not a platform typedef) to match the
+// signature CURLOPT_WRITEFUNCTION expects, and returns the bytes consumed.
+size_t curlWriter(char *in, size_t size, size_t nmemb, std::stringstream *out)
+{
+    size_t r = size * nmemb;
+    out->write(in, r);
+    return r;
+}
+
 namespace daggy {
     std::string globalSub(std::string string, const std::string &pattern,
                           const std::string &replacement)
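curlWriter follows libcurl's write-callback contract: it is handed size * nmemb bytes and must return the number it consumed, or libcurl aborts the transfer with CURLE_WRITE_ERROR. A minimal standalone check of that contract, with no network involved (the names here are illustrative):

    #include <cassert>
    #include <sstream>

    // Same shape as curlWriter above: consume everything we are handed.
    static size_t writer(char *in, size_t size, size_t nmemb, std::stringstream *out)
    {
        const size_t r = size * nmemb;
        out->write(in, r);
        return r;
    }

    int main()
    {
        std::stringstream body;
        char chunk[] = "{\"state\":\"COMPLETED\"}";
        assert(writer(chunk, 1, sizeof chunk - 1, &body) == sizeof chunk - 1);
        assert(body.str() == "{\"state\":\"COMPLETED\"}");
    }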
@@ -168,4 +215,67 @@ namespace daggy {
         os << tp.time_since_epoch().count() << std::endl;
         return os;
     }

+    HTTPResponse HTTP_REQUEST(const std::string &url, const std::string &payload,
+                              const std::string &method, bool trace)
+    {
+        HTTPResponse response{.code = HTTPCode::Ok, .body = ""};
+
+        CURL *curl;
+        CURLcode res;
+        struct curl_slist *headers = NULL;
+
+        curl_global_init(CURL_GLOBAL_ALL);
+
+        curl = curl_easy_init();
+        if (curl) {
+            std::stringstream buffer;
+
+            curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
+            curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curlWriter);
+            curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);
+
+            if (trace) {
+                curl_easy_setopt(curl, CURLOPT_DEBUGFUNCTION, http_trace);
+                curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
+            }
+
+            if (!payload.empty()) {
+                curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, payload.size());
+                curl_easy_setopt(curl, CURLOPT_POSTFIELDS, payload.c_str());
+                headers = curl_slist_append(headers, "Content-Type: application/json");
+            }
+            curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, method.c_str());
+            headers = curl_slist_append(headers, "Expect:");
+            curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
+
+            res = curl_easy_perform(curl);
+
+            if (res != CURLE_OK) {
+                curl_slist_free_all(headers);
+                curl_easy_cleanup(curl);
+                throw std::runtime_error(std::string{"CURL Failed: "} +
+                                         curl_easy_strerror(res));
+            }
+
+            // Read the response code *before* tearing the handle down;
+            // curl_easy_getinfo on a cleaned-up handle is a use-after-free.
+            long httpCode = 0;
+            curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &httpCode);
+            response.code = static_cast<HTTPCode>(httpCode);
+            response.body = buffer.str();
+
+            curl_slist_free_all(headers);
+            curl_easy_cleanup(curl);
+        }
+
+        curl_global_cleanup();
+
+        return response;
+    }
+
+    std::pair<HTTPCode, rj::Document> JSON_HTTP_REQUEST(
+        const std::string &url, const std::string &payload,
+        const std::string &method, bool trace)
+    {
+        // Forward the trace flag instead of silently dropping it.
+        auto response = HTTP_REQUEST(url, payload, method, trace);
+
+        rj::Document doc;
+        checkRJParse(doc.Parse(response.body.c_str()));
+        return std::make_pair(response.code, std::move(doc));
+    }
+
 } // namespace daggy
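A sketch of how these helpers are used by the new executor below, assuming a runner agent at a placeholder URL that answers `/v1/capacity` with `current`/`total` capacity objects, and assuming the header declares defaults for the payload/method/trace arguments:

    // Query a runner's capacity; a transport failure surfaces as an exception.
    try {
        const auto &[code, doc] = daggy::JSON_HTTP_REQUEST(
            "http://localhost:8080/v1/capacity", "", "GET", false);
        if (code == daggy::HTTPCode::Ok && doc.HasMember("current")) {
            std::cerr << "cores free: " << doc["current"]["cores"].GetInt64() << '\n';
        }
    }
    catch (const std::runtime_error &e) {
        std::cerr << "runner unreachable: " << e.what() << '\n'; // CURL failure path
    }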
@@ -2,4 +2,5 @@ target_sources(${PROJECT_NAME} PRIVATE
     SlurmTaskExecutor.cpp
     NoopTaskExecutor.cpp
     ForkingTaskExecutor.cpp
+    DaggyRunnerTaskExecutor.cpp
 )
libdaggy/src/executors/task/DaggyRunnerTaskExecutor.cpp (new file, 227 lines)
@@ -0,0 +1,227 @@
#include <algorithm>
#include <array>
#include <iomanip>

#include <daggy/Serialization.hpp>
#include <daggy/Utilities.hpp>
#include <daggy/executors/task/DaggyRunnerTaskExecutor.hpp>
#include <daggy/executors/task/ForkingTaskExecutor.hpp>

using namespace daggy::executors::task;
using namespace daggy::executors::task::daggy_runner;
using namespace daggy;

namespace daggy::executors::task::daggy_runner {
std::string capacityToJSON(const Capacity &cap)
{
    return R"({ "cores": )" + std::to_string(cap.cores) + R"(, "memoryMB": )" +
           std::to_string(cap.memoryMB) + "}";
}

Capacity capacityFromJSON(const rj::Value &spec)
{
    Capacity cap{.cores = 0, .memoryMB = 0};

    if (!spec.IsObject()) {
        throw std::runtime_error("Capacity is not an object");
    }

    if (spec.HasMember("cores")) {
        if (!spec["cores"].IsNumber()) {
            throw std::runtime_error("cores member of Capacity is not an integer");
        }
        cap.cores = spec["cores"].GetInt64();
    }

    if (spec.HasMember("memoryMB")) {
        if (!spec["memoryMB"].IsNumber()) {
            throw std::runtime_error(
                "memoryMB member of Capacity is not an integer");
        }
        cap.memoryMB = spec["memoryMB"].GetInt64();
    }

    return cap;
}

Capacity capacityFromTask(const Task &task)
{
    Capacity cap{.cores = 0, .memoryMB = 0};

    cap.cores = std::stoll(std::get<std::string>(task.job.at("cores")));
    cap.memoryMB = std::stoll(std::get<std::string>(task.job.at("memoryMB")));

    return cap;
}

void validateTaskParameters(const daggy::ConfigValues &job)
{
    forking_executor::validateTaskParameters(job);

    const std::array<std::string, 2> fields{"cores", "memoryMB"};

    for (const auto &field : fields) {
        if (job.count(field) == 0)
            throw std::runtime_error("Missing required job parameter " + field);

        const auto &val = job.at(field);

        if (!std::holds_alternative<std::string>(val))
            throw std::runtime_error(field + " in capacity is not a string");

        try {
            std::stoll(std::get<std::string>(val));
        }
        catch (std::exception &) {
            throw std::runtime_error(field + " in capacity is not an integer");
        }
    }
}
} // namespace daggy::executors::task::daggy_runner

DaggyRunnerTaskExecutor::DaggyRunnerTaskExecutor()
    : running_(true)
    , monitorWorker_(&DaggyRunnerTaskExecutor::monitor, this)
{
}

DaggyRunnerTaskExecutor::~DaggyRunnerTaskExecutor()
{
    running_ = false;
    monitorWorker_.join();
}

// Validates the job to ensure that all required values are set and are of
// the right type.
bool DaggyRunnerTaskExecutor::validateTaskParameters(const ConfigValues &job)
{
    daggy_runner::validateTaskParameters(job);

    return true;
}

std::vector<ConfigValues> DaggyRunnerTaskExecutor::expandTaskParameters(
    const ConfigValues &job, const ConfigValues &expansionValues)
{
    std::vector<ConfigValues> newValues;

    auto command =
        (job.count("command") == 0 ? Command{}
                                   : std::get<Command>(job.at("command")));

    auto environment = (job.count("environment") == 0
                            ? Command{}
                            : std::get<Command>(job.at("environment")));

    // Interpolate command and environment together, then split the result
    // back apart at the original command length.
    Command both(command);
    std::copy(environment.begin(), environment.end(), std::back_inserter(both));

    for (const auto &parts : interpolateValues(both, expansionValues)) {
        ConfigValues newCommand{job};
        newCommand["command"] =
            Command(parts.begin(), parts.begin() + command.size());
        newCommand["environment"] =
            Command(parts.begin() + command.size(), parts.end());
        newValues.emplace_back(newCommand);
    }

    return newValues;
}

// Runs the task.
std::future<AttemptRecord> DaggyRunnerTaskExecutor::execute(
    DAGRunID runID, const std::string &taskName, const Task &task)
{
    auto taskUsed = capacityFromTask(task);

    // Get the capacities for all the runners. Capacities for a runner can be
    // negative, meaning that it is currently oversubscribed; in that case,
    // score against its total capacity instead.
    std::vector<std::pair<std::string, double>> impacts;
    for (const auto &runner : runners_) {
        try {
            const auto &[code, doc] = JSON_HTTP_REQUEST(runner + "/v1/capacity");
            if (code != HTTPCode::Ok) {
                continue;
            }

            auto curCap = capacityFromJSON(doc["current"]);
            auto totCap = capacityFromJSON(doc["total"]);

            ssize_t cores = curCap.cores < 0 ? totCap.cores : curCap.cores;
            ssize_t memoryMB =
                curCap.memoryMB < 0 ? totCap.memoryMB : curCap.memoryMB;

            if (cores == 0 || memoryMB == 0) {
                continue; // no usable capacity reported
            }

            // Cast before dividing: integer division here would truncate
            // every impact to zero and make the selection arbitrary.
            double impact =
                std::max(static_cast<double>(taskUsed.cores) / cores,
                         static_cast<double>(taskUsed.memoryMB) / memoryMB);
            impacts.emplace_back(runner, impact);
        }
        catch (const std::exception &) {
            continue;
        }
    }

    if (impacts.empty())
        throw std::runtime_error("No runners available for execution");

    // Pick the runner this task impacts the least.
    auto cit = impacts.begin();
    for (auto it = impacts.begin(); it != impacts.end(); ++it) {
        if (it->second < cit->second)
            cit = it;
    }

    RunningTask rt{
        .prom{}, .runID = runID, .taskName = taskName, .runnerURL = cit->first};

    auto fut = rt.prom.get_future();

    std::lock_guard<std::mutex> lock(rtGuard_);
    runningTasks_.emplace(std::make_pair(runID, taskName), std::move(rt));

    return fut;
}

bool DaggyRunnerTaskExecutor::stop(DAGRunID runID, const std::string &taskName)
{
    // Remote cancellation is not implemented yet; report success.
    return true;
}

void DaggyRunnerTaskExecutor::addRunner(const std::string &url)
{
    runners_.insert(url);
}

void DaggyRunnerTaskExecutor::monitor()
{
    while (running_) {
        {
            std::vector<std::pair<DAGRunID, std::string>> resolvedJobs;

            std::lock_guard<std::mutex> lock(rtGuard_);
            for (auto &[taskID, task] : runningTasks_) {
                try {
                    const auto &[code, json] = JSON_HTTP_REQUEST(
                        task.runnerURL + "/v1/task/" +
                        std::to_string(taskID.first) + "/" + taskID.second);
                    if (code != HTTPCode::Ok) {
                        AttemptRecord record{
                            .rc = -1,
                            .executorLog = "Unable to query runner for progress"};
                        task.prom.set_value(std::move(record));
                        resolvedJobs.emplace_back(taskID);
                        continue;
                    }

                    if (json["state"] == "COMPLETED") {
                        task.prom.set_value(attemptRecordFromJSON(json["attempt"]));
                        resolvedJobs.emplace_back(taskID);
                    }
                }
                catch (std::runtime_error &) {
                    continue;
                }
            }

            // Erase resolved tasks only after the loop above: extracting from
            // runningTasks_ while range-iterating it invalidates the iterator.
            for (const auto &tid : resolvedJobs) {
                runningTasks_.extract(tid);
            }
        }
        // Sleep outside the locked scope so execute() is not starved.
        std::this_thread::sleep_for(std::chrono::seconds(1));
    }
}
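The selection rule in execute() reduces to impact = max(cores needed / cores free, memory needed / memory free), with the smallest impact winning. A toy check of that arithmetic, with invented capacities:

    #include <algorithm>
    #include <cassert>

    int main()
    {
        // Task wants 2 cores and 2048 MB.
        const double taskCores = 2, taskMemMB = 2048;

        // Runner A: 4 cores, 4096 MB free  -> impact = max(0.5, 0.5)    = 0.5
        // Runner B: 16 cores, 8192 MB free -> impact = max(0.125, 0.25) = 0.25
        const double impactA = std::max(taskCores / 4, taskMemMB / 4096);
        const double impactB = std::max(taskCores / 16, taskMemMB / 8192);

        assert(impactB < impactA); // runner B is selected
    }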
@@ -9,6 +9,30 @@

 using namespace daggy::executors::task;

+namespace daggy::executors::task::forking_executor {
+void validateTaskParameters(const daggy::ConfigValues &job)
+{
+    // command or commandString is required
+    if (job.count("command")) {
+        if (!std::holds_alternative<daggy::Command>(job.at("command")))
+            throw std::runtime_error(R"(command must be an array of strings)");
+    }
+    else {
+        if (job.count("commandString") == 0) {
+            throw std::runtime_error(
+                R"(command or commandString must be defined.)");
+        }
+        if (!std::holds_alternative<std::string>(job.at("commandString")))
+            throw std::runtime_error(R"(commandString must be a string)");
+    }
+
+    if (job.count("environment")) {
+        if (!std::holds_alternative<daggy::Command>(job.at("environment")))
+            throw std::runtime_error(R"(environment must be an array of strings)");
+    }
+}
+} // namespace daggy::executors::task::forking_executor
+
 std::string slurp(int fd)
 {
     std::string result;
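Assuming ConfigValues is the map from parameter name to a string / vector-of-strings variant that it is used as throughout libdaggy, the rules above play out like this (values illustrative):

    daggy::ConfigValues ok{
        {"command", daggy::Command{"sleep", "5"}},
        {"environment", daggy::Command{"FOO=bar"}},
    };
    forking_executor::validateTaskParameters(ok); // passes

    // commandString present but holding an array, not a string:
    daggy::ConfigValues bad{{"commandString", daggy::Command{"oops"}}};
    forking_executor::validateTaskParameters(bad); // throws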
@@ -190,23 +214,7 @@ daggy::AttemptRecord ForkingTaskExecutor::runTask(const Task &task,

 bool ForkingTaskExecutor::validateTaskParameters(const ConfigValues &job)
 {
-    // command or commandString is required
-    if (job.count("command")) {
-        if (!std::holds_alternative<Command>(job.at("command")))
-            throw std::runtime_error(R"(command must be an array of strings)");
-    }
-    else {
-        if (job.count("commandString") == 0) {
-            throw std::runtime_error(R"(command or commandString must be defined.)");
-        }
-        if (!std::holds_alternative<std::string>(job.at("commandString")))
-            throw std::runtime_error(R"(commandString must be a string)");
-    }
-
-    if (job.count("environment")) {
-        if (!std::holds_alternative<Command>(job.at("environment")))
-            throw std::runtime_error(R"(environment must be an array of strings)");
-    }
-
+    forking_executor::validateTaskParameters(job);
+
     return true;
 }
|
|||||||
Reference in New Issue
Block a user