commit b06b11cbb5d09c6d091551e39767cd3316f88376 Author: Ian Roddis <gitlab@ie2r.com> Date: Tue Oct 5 11:57:37 2021 -0300 Fixing failing unit test commit fe2a43a19b2a16a9aedd9e95e71e672935ecaeb1 Author: Ian Roddis <gitlab@ie2r.com> Date: Tue Oct 5 11:54:01 2021 -0300 Adding endpoints and updating documentation commit 46e0deeefb8b06291ae5e2d6b8ec4749c5b0ea6f Author: Ian Roddis <gitlab@ie2r.com> Date: Tue Oct 5 11:49:43 2021 -0300 Completing unit tests and relevant fixes commit e0569f370624844feee6aae4708bfe683f4156cf Author: Ian Roddis <gitlab@ie2r.com> Date: Mon Oct 4 17:30:59 2021 -0300 Adding in gcc tsan for debug builds to help with race conditions, fixing many of those, and fixing really crummy assumption about how futures worked that will speed up task execution by a ton. commit c748a4f592e1ada5546908be5281d04f4749539d Author: Ian Roddis <gitlab@ie2r.com> Date: Mon Oct 4 10:14:43 2021 -0300 Checkpointing work that seems to have resolved the race condition commit 7a79f2943e0d50545d976a28b4b379340a90dded Author: Ian Roddis <gitlab@ie2r.com> Date: Wed Sep 29 09:27:07 2021 -0300 Completing the rough-in for DAG killing / pausing / resuming commit 4cf8d81d5f6fcf4a7dd83d8fca3e23f153aa8acb Author: Ian Roddis <gitlab@ie2r.com> Date: Tue Sep 28 14:53:50 2021 -0300 Adding dagrunner unit tests, adding a resetRunning method to resume commit 54e2c1f9f5e7d5b339d71be024e0e390c4d2bf61 Author: Ian Roddis <gitlab@ie2r.com> Date: Tue Sep 28 14:45:57 2021 -0300 Refactoring runDAG into DAGRunner commit 682be7a11e2fae850e1bc3e207628d2335768c2b Author: Ian Roddis <gitlab@ie2r.com> Date: Tue Sep 28 14:34:43 2021 -0300 Adding DAGRunner class to replace Utilities::runDAG, making Slurm cancellation rc agree with SIGKILL commit 4171b3a6998791abfc71b04f8de1ae93c4f90a78 Author: Ian Roddis <gitlab@ie2r.com> Date: Tue Sep 28 14:14:17 2021 -0300 Adding unit tests for stopping jobs to slurm commit dc0b1ff26a5d98471164132d35bb8a552cc75ff8 Author: Ian Roddis <gitlab@ie2r.com> Date: Tue Sep 28 
14:04:15 2021 -0300 Adding in stop method for task executors commit e752b44f55113be54392bcbb5c3d6f251d673cfa Author: Ian Roddis <gitlab@ie2r.com> Date: Tue Sep 28 12:32:06 2021 -0300 Adding additional tests for loggers commit f0773d5a84a422738fc17c9277a2b735a21a3d04 Author: Ian Roddis <gitlab@ie2r.com> Date: Tue Sep 28 12:29:21 2021 -0300 Unit tests pass commit 993ff2810de2d53dc6a59ab53d620fecf152d4a0 Author: Ian Roddis <gitlab@ie2r.com> Date: Tue Sep 28 12:24:34 2021 -0300 Adding handling for new routes, still need to add tests for new routes commit 676623b14e45759872a2dbcbc98f6a744e022a71 Author: Ian Roddis <gitlab@ie2r.com> Date: Tue Sep 28 12:12:43 2021 -0300 Adding handling for new routes, still need to add tests for new routes commit b9edb6ba291eb064f4c459a308ea6912fba9fa02 Author: Ian Roddis <gitlab@ie2r.com> Date: Mon Sep 27 11:59:14 2021 -0300 Defining new endpoints, fixing dag resumption code, adding PAUSED state, refactoring DAGSpec and adding deserializer
383 lines
11 KiB
C++
383 lines
11 KiB
C++
#include <curl/curl.h>
|
|
#include <pistache/client.h>
|
|
#include <rapidjson/document.h>
|
|
#include <sys/stat.h>
|
|
|
|
#include <catch2/catch.hpp>
|
|
#include <daggy/Serialization.hpp>
|
|
#include <daggy/Server.hpp>
|
|
#include <daggy/executors/task/ForkingTaskExecutor.hpp>
|
|
#include <daggy/executors/task/NoopTaskExecutor.hpp>
|
|
#include <daggy/loggers/dag_run/OStreamLogger.hpp>
|
|
#include <filesystem>
|
|
#include <iostream>
|
|
#include <thread>
|
|
|
|
namespace rj = rapidjson;
|
|
|
|
using namespace daggy;
|
|
|
|
#ifdef DEBUG_HTTP
// libcurl debug callback (CURLOPT_DEBUGFUNCTION): prints curl's
// informational messages verbatim and pretty-prints header/data traffic
// to stderr.  Only compiled in when DEBUG_HTTP is defined.
static int my_trace(CURL *handle, curl_infotype type, char *data, size_t size,
                    void *userp)
{
  (void)handle; /* prevent compiler warning */
  (void)userp;

  // Informational text is already formatted by curl; emit it as-is.
  if (type == CURLINFO_TEXT) {
    fprintf(stderr, "== Info: %s", data);
    return 0;
  }

  const char *label = nullptr;
  switch (type) {
  case CURLINFO_HEADER_OUT:
    label = "=> Send header";
    break;
  case CURLINFO_DATA_OUT:
    label = "=> Send data";
    break;
  case CURLINFO_SSL_DATA_OUT:
    label = "=> Send SSL data";
    break;
  case CURLINFO_HEADER_IN:
    label = "<= Recv header";
    break;
  case CURLINFO_DATA_IN:
    label = "<= Recv data";
    break;
  case CURLINFO_SSL_DATA_IN:
    label = "<= Recv SSL data";
    break;
  default: /* in case a new one is introduced to shock us */
    return 0;
  }

  std::cerr << "\n================== " << label
            << " ==================" << std::endl
            << data << std::endl;
  return 0;
}
#endif
|
|
|
|
// Subset of HTTP status codes these tests assert on.  The underlying type
// is `long` because a pointer to a value of this enum is handed to
// curl_easy_getinfo(CURLINFO_RESPONSE_CODE), which writes a `long`.
enum HTTPCode : long
{
  Ok = 200,
  Not_Found = 404
};
|
|
|
|
// Result of an HTTP request performed by REQUEST(): the status code and
// the raw response body.
struct HTTPResponse
{
  HTTPCode code;     // HTTP status code (written via CURLINFO_RESPONSE_CODE)
  std::string body;  // response body accumulated by the write callback
};
|
|
|
|
// libcurl write callback (CURLOPT_WRITEFUNCTION): appends a received chunk
// to the std::stringstream registered via CURLOPT_WRITEDATA.
//
// @param in    pointer to the received bytes (not NUL-terminated)
// @param size  size of one element
// @param nmemb number of elements
// @param out   destination stream (CURLOPT_WRITEDATA)
// @return number of bytes consumed; returning anything other than
//         size*nmemb would make curl abort the transfer.
//
// BUGFIX: curl declares this callback with size_t parameters and return;
// the previous `uint` signature mismatched the expected prototype and
// could truncate large transfers on 64-bit platforms.
size_t curlWriter(char *in, size_t size, size_t nmemb, std::stringstream *out)
{
  const size_t total = size * nmemb;
  out->write(in, static_cast<std::streamsize>(total));
  return total;
}
|
|
|
|
// Perform a synchronous HTTP request and return the status code and body.
//
// @param url     full URL, e.g. "localhost:8080/ready"
// @param payload request body; when non-empty it is sent with a
//                "Content-Type: Application/Json" header
// @param method  HTTP verb ("GET", "POST", "PATCH", ...)
// @return status code and response body
// @throws std::runtime_error if the curl handle cannot be created or the
//         transfer fails
HTTPResponse REQUEST(const std::string &url, const std::string &payload = "",
                     const std::string &method = "GET")
{
  HTTPResponse response{};

  struct curl_slist *headers = NULL;

  curl_global_init(CURL_GLOBAL_ALL);

  CURL *curl = curl_easy_init();
  // BUGFIX: the original silently returned an uninitialized response when
  // curl_easy_init() failed.
  if (!curl) {
    curl_global_cleanup();
    throw std::runtime_error("CURL Failed: could not create easy handle");
  }

  std::stringstream buffer;

#ifdef DEBUG_HTTP
  curl_easy_setopt(curl, CURLOPT_DEBUGFUNCTION, my_trace);
  curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
#endif

  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curlWriter);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);

  if (!payload.empty()) {
    // CURLOPT_POSTFIELDSIZE expects a long, not a size_t.
    curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE,
                     static_cast<long>(payload.size()));
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, payload.c_str());
    headers = curl_slist_append(headers, "Content-Type: Application/Json");
  }
  curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, method.c_str());
  // Suppress curl's automatic "Expect: 100-continue" handshake.
  headers = curl_slist_append(headers, "Expect:");
  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);

  CURLcode res = curl_easy_perform(curl);

  if (res != CURLE_OK) {
    // BUGFIX: free the header list on the error path too (it was leaked).
    curl_slist_free_all(headers);
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    throw std::runtime_error(std::string{"CURL Failed: "} +
                             curl_easy_strerror(res));
  }

  // BUGFIX: query the handle *before* curl_easy_cleanup(); the original
  // called curl_easy_getinfo() on an already-destroyed handle, so the
  // response code was garbage.
  curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response.code);
  response.body = buffer.str();

  curl_slist_free_all(headers);
  curl_easy_cleanup(curl);

  curl_global_cleanup();

  return response;
}
|
|
|
|
// End-to-end exercise of the REST API: readiness probe, lookup of a
// non-existent run, and a full DAG-run submission / status-polling cycle.
TEST_CASE("rest_endpoint", "[server_basic]")
{
  std::stringstream ss;
  daggy::executors::task::ForkingTaskExecutor executor(10);
  daggy::loggers::dag_run::OStreamLogger logger(ss);
  // Port 0 => let the OS pick a free port; we read it back below.
  Pistache::Address listenSpec("localhost", Pistache::Port(0));

  const size_t nDAGRunners = 10, nWebThreads = 10;

  daggy::Server server(listenSpec, logger, executor, nDAGRunners);
  server.init(nWebThreads);
  server.start();

  const std::string host = "localhost:";
  const std::string baseURL = host + std::to_string(server.getPort());

  SECTION("Ready Endpoint")
  {
    auto response = REQUEST(baseURL + "/ready");
    REQUIRE(response.code == HTTPCode::Ok);
  }

  SECTION("Querying a non-existent dagrunid should fail ")
  {
    auto response = REQUEST(baseURL + "/v1/dagrun/100");
    REQUIRE(response.code != HTTPCode::Ok);
  }

  SECTION("Simple DAGRun Submission")
  {
    std::string dagRun = R"({
      "tag": "unit_server",
      "parameters": { "FILE": [ "A", "B" ] },
      "tasks": {
        "touch": { "job": { "command": [ "/usr/bin/touch", "dagrun_{{FILE}}" ]} },
        "cat": { "job": { "command": [ "/usr/bin/cat", "dagrun_A", "dagrun_B" ]},
                 "parents": [ "touch" ]
        }
      }
    })";

    auto dagSpec = daggy::dagFromJSON(dagRun);

    // Submit, and get the runID
    daggy::DAGRunID runID = 0;
    {
      auto response = REQUEST(baseURL + "/v1/dagrun/", dagRun, "POST");
      REQUIRE(response.code == HTTPCode::Ok);

      rj::Document doc;
      daggy::checkRJParse(doc.Parse(response.body.c_str()));
      REQUIRE(doc.IsObject());
      REQUIRE(doc.HasMember("runID"));

      runID = doc["runID"].GetUint64();
    }

    // Ensure our runID shows up in the list of running DAGs
    {
      auto response = REQUEST(baseURL + "/v1/dagruns?all=1");
      REQUIRE(response.code == HTTPCode::Ok);

      rj::Document doc;
      daggy::checkRJParse(doc.Parse(response.body.c_str()));
      REQUIRE(doc.IsArray());
      REQUIRE(doc.Size() >= 1);

      // Ensure that our DAG is in the list and matches our given DAGRunID
      bool found = false;
      const auto &runs = doc.GetArray();
      for (size_t i = 0; i < runs.Size(); ++i) {
        const auto &run = runs[i];
        REQUIRE(run.IsObject());
        REQUIRE(run.HasMember("tag"));
        REQUIRE(run.HasMember("runID"));

        std::string runName = run["tag"].GetString();
        if (runName == "unit_server") {
          REQUIRE(run["runID"].GetUint64() == runID);
          found = true;
          break;
        }
      }
      REQUIRE(found);
    }

    // Ensure we can get one of our tasks
    {
      auto response = REQUEST(baseURL + "/v1/dagrun/" + std::to_string(runID) +
                              "/task/cat_0");
      REQUIRE(response.code == HTTPCode::Ok);

      rj::Document doc;
      daggy::checkRJParse(doc.Parse(response.body.c_str()));

      REQUIRE_NOTHROW(daggy::taskFromJSON("cat", doc));
      auto task = daggy::taskFromJSON("cat", doc);

      REQUIRE(task == dagSpec.tasks.at("cat"));
    }

    // Poll (up to ~10s) until our DAG reports all tasks COMPLETED.
    bool complete = false;
    for (auto i = 0; i < 10; ++i) {
      auto response = REQUEST(baseURL + "/v1/dagrun/" + std::to_string(runID));
      REQUIRE(response.code == HTTPCode::Ok);
      rj::Document doc;
      daggy::checkRJParse(doc.Parse(response.body.c_str()));
      REQUIRE(doc.IsObject());

      REQUIRE(doc.HasMember("taskStates"));
      const auto &taskStates = doc["taskStates"].GetObject();

      // Expanded DAG: touch_0, touch_1 (FILE = A, B) and cat_0.
      REQUIRE(taskStates.MemberCount() == 3);

      complete = true;
      for (auto it = taskStates.MemberBegin(); it != taskStates.MemberEnd();
           ++it) {
        std::string state = it->value.GetString();
        if (state != "COMPLETED") {
          complete = false;
          break;
        }
      }
      if (complete)
        break;
      std::this_thread::sleep_for(std::chrono::seconds(1));
    }
    REQUIRE(complete);

    std::this_thread::sleep_for(std::chrono::seconds(2));
    for (const auto &pth : std::vector<fs::path>{"dagrun_A", "dagrun_B"}) {
      REQUIRE(fs::exists(pth));
      fs::remove(pth);
    }
  }

  // BUGFIX: shut the server down when the test case ends (the resume test
  // below already does this); otherwise the web threads outlive the test.
  server.shutdown();
}
|
|
|
|
// Exercises KILL / task-state patch / resume: kills a running DAG mid-flight,
// re-queues the errored task, resumes the run, and verifies that already
// completed tasks are not re-executed.
TEST_CASE("Server cancels and resumes execution", "[server_resume]")
{
  std::stringstream ss;
  daggy::executors::task::ForkingTaskExecutor executor(10);
  daggy::loggers::dag_run::OStreamLogger logger(ss);
  // Port 0 => let the OS pick a free port; we read it back below.
  Pistache::Address listenSpec("localhost", Pistache::Port(0));

  const size_t nDAGRunners = 10, nWebThreads = 10;

  daggy::Server server(listenSpec, logger, executor, nDAGRunners);
  server.init(nWebThreads);
  server.start();

  const std::string host = "localhost:";
  const std::string baseURL = host + std::to_string(server.getPort());

  SECTION("Cancel / Resume DAGRun")
  {
    std::string dagRunJSON = R"({
      "tag": "unit_server",
      "tasks": {
        "touch_A": { "job": { "command": [ "/usr/bin/touch", "resume_touch_a" ]}, "children": ["touch_C"] },
        "sleep_B": { "job": { "command": [ "/usr/bin/sleep", "3" ]}, "children": ["touch_C"] },
        "touch_C": { "job": { "command": [ "/usr/bin/touch", "resume_touch_c" ]} }
      }
    })";

    auto dagSpec = daggy::dagFromJSON(dagRunJSON);

    // Submit, and get the runID.
    // BUGFIX: runID was previously uninitialized before the POST succeeded.
    daggy::DAGRunID runID = 0;
    {
      auto response = REQUEST(baseURL + "/v1/dagrun/", dagRunJSON, "POST");
      REQUIRE(response.code == HTTPCode::Ok);

      rj::Document doc;
      daggy::checkRJParse(doc.Parse(response.body.c_str()));
      REQUIRE(doc.IsObject());
      REQUIRE(doc.HasMember("runID"));

      runID = doc["runID"].GetUint64();
    }

    // Give touch_A time to finish and sleep_B time to start.
    std::this_thread::sleep_for(1s);

    // Stop the current run
    {
      auto response = REQUEST(
          baseURL + "/v1/dagrun/" + std::to_string(runID) + "/state/KILLED", "",
          "PATCH");
      REQUIRE(response.code == HTTPCode::Ok);
      REQUIRE(logger.getDAGRunState(runID) == +daggy::RunState::KILLED);
    }

    // Verify that the run still exists
    {
      auto dagRun = logger.getDAGRun(runID);
      REQUIRE(dagRun.taskRunStates.at("touch_A_0") ==
              +daggy::RunState::COMPLETED);
      REQUIRE(fs::exists("resume_touch_a"));

      REQUIRE(dagRun.taskRunStates.at("sleep_B_0") ==
              +daggy::RunState::ERRORED);
      REQUIRE(dagRun.taskRunStates.at("touch_C_0") == +daggy::RunState::QUEUED);
    }

    // Set the errored task state back to QUEUED so the run can resume.
    {
      auto url = baseURL + "/v1/dagrun/" + std::to_string(runID) +
                 "/task/sleep_B_0/state/QUEUED";
      auto response = REQUEST(url, "", "PATCH");
      REQUIRE(response.code == HTTPCode::Ok);
      REQUIRE(logger.getTaskState(runID, "sleep_B_0") ==
              +daggy::RunState::QUEUED);
    }

    // Resume
    {
      struct stat s;

      // BUGFIX: the DAG creates "resume_touch_a" (lowercase); the original
      // lstat'ed the non-existent "resume_touch_A", ignored the error, and
      // then read an uninitialized struct stat.
      REQUIRE(lstat("resume_touch_a", &s) == 0);
      auto preMTime = s.st_mtim.tv_sec;

      auto response = REQUEST(
          baseURL + "/v1/dagrun/" + std::to_string(runID) + "/state/QUEUED", "",
          "PATCH");
      // BUGFIX: the resume response code was never checked.
      REQUIRE(response.code == HTTPCode::Ok);

      // Wait for run to complete (sleep_B takes 3s)
      std::this_thread::sleep_for(5s);
      REQUIRE(logger.getDAGRunState(runID) == +daggy::RunState::COMPLETED);

      REQUIRE(fs::exists("resume_touch_c"));
      REQUIRE(fs::exists("resume_touch_a"));

      for (const auto &[taskName, task] : dagSpec.tasks) {
        REQUIRE(logger.getTaskState(runID, taskName + "_0") ==
                +daggy::RunState::COMPLETED);
      }

      // Ensure "touch_A" wasn't run again: its mtime must be unchanged.
      REQUIRE(lstat("resume_touch_a", &s) == 0);
      auto postMTime = s.st_mtim.tv_sec;
      REQUIRE(preMTime == postMTime);

      // Clean up the files the DAG created (mirrors the basic-server test).
      for (const auto &pth :
           std::vector<fs::path>{"resume_touch_a", "resume_touch_c"}) {
        fs::remove(pth);
      }
    }
  }

  server.shutdown();
}
|