Adding support for remote execution daemons.

Squashed commit of the following: commit 69d5ef7a256b86a86d46e5ae374c00fded1497ea Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 12:15:55 2021 -0400 Updating readme commit 94a9f676d0f9cc0b55cdc18c4927eaea40d82c77 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 12:05:36 2021 -0400 Fixing serialization of attempt records when querying entire dag commit 945e5f90b24abf07c9af1bc4c6bbcb33e93b8069 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 11:37:59 2021 -0400 Compiles cleanly... commit 8b23e46081d47fb80dc1a2d998fc6dc4bbf301a8 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 10:43:03 2021 -0400 Adding in missing source file to cmake build list commit 6d10d9791206e2bc15788beadeea580b8e43a853 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 10:41:43 2021 -0400 Adding new executors commit 42a2c67f4d6ae99df95d917c8621d78cd99837a1 Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 10:27:14 2021 -0400 Fixing missing curl cmake dependency commit 394bc4c5d51ecee7bf14712f719c8bf7e97fb0fa Author: Ian Roddis <tech@kinesin.ca> Date: Thu Dec 16 10:21:58 2021 -0400 Fixing missing curl cmake dependency commit dd9efc8e7e7770ea1bcbccb70a1af9cfcff0414c Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 17:15:38 2021 -0400 Checkpointing progress commit 3b3b55d6037bb96e46de6763f486f4ecb92fe6a0 Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 14:21:18 2021 -0400 updating readme commit 303027c11452941b2a0c0d1b04ac5942e79efd74 Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 14:17:16 2021 -0400 Namespacing daggyd Adding more error checking around deserialization of parameters Adding tests for runner agent commit c592eaeba12e2a449bae401e8c1d9ed236416d52 Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 11:20:21 2021 -0400 Checkpointing work commit fb1862d1cefe2b53a98659cce3c8c73d88bf5d84 Author: Ian Roddis <tech@kinesin.ca> Date: Wed Dec 15 09:52:29 2021 -0400 Copying daggyd for daggyr template, adding in basic routes
2021-12-16 12:16:12 -04:00
parent 14d0ef4a3f
commit 8d00621908
26 changed files with 1373 additions and 160 deletions
--- a/daggyr/CMakeLists.txt
+++ b/daggyr/CMakeLists.txt
@@ -0,0 +1,3 @@
+# Build the runner library, the daggyr executable, and the runner's tests.
+add_subdirectory(libdaggyr)
+add_subdirectory(daggyr)
+add_subdirectory(tests)
--- a/daggyr/README.md
+++ b/daggyr/README.md
@@ -0,0 +1,68 @@
+# Daggy Runner
+
+`daggyr` is a REST server process that acts as a remote task executor.
+
+# Running it
+
+```bash
+daggyr    # That's it, will listen on 127.0.0.1:2504 , and run with a local executor
+daggyr -d # Daemonize
+
+daggyr --config FILE # Run with a config file
+```
+
+# Capacity and Allocation
+
+On startup, a server's capacity is determined automatically. The capacities are:
+
+| Capacity  | Determined by                         | Default                     | Notes                            |
+|-----------|---------------------------------------|-----------------------------|----------------------------------|
+| cores     | `std::thread::hardware_concurrency()` | `max(1, max - 2)`           | A value of 0 will mean all cores |
+| memory_mb | `sysinfo.h`                           | `max(100, totalram * 0.75)` | `totalram` is converted to MB    |
+
+When a `daggyd` process is selecting a runner to send a task to, it will
+query the current capacities, and choose the runner that:
+
+- Can satisfy the requirements of the task
+- Has the lowest impact, where a runner's impact is its largest relative drop in available capacity across all capacity types (`max_impact` in the table below).
+
+For instance, if a job were submitted that requires 2 cores and 5g of memory,
+and three runners reported the following capacities:
+
+| Runner | free_cores | impact_cores | free_memory | impact_memory | max_impact |
+|--------|------------|--------------|-------------|---------------|------------|
+| 1      | 70         | 2.9%         | 20g         | 25.00%        | 25%        |
+| 2      | 4          | 50.0%        | 80g         | 6.25%         | 50%        |
+| 3      | 10         | 20.0%        | 30g         | 16.67%        | 20%        |
+
+Runner 3 would be selected. Even though it doesn't have the most memory
+or CPU capacity, allocating the job to it minimizes the impact to the
+overall availability.
+
+# Submission and Execution
+
+Tasks submitted to the runner will be executed with [cgroups](https://www.man7.org/linux/man-pages/man7/cgroups.7.html)
+to enforce limits.
+
+Jobs are submitted asynchronously, and rely on the client to poll
+`GET /v1/task/:runID/:taskName` to get the task's state and, once it
+has completed, the resulting AttemptRecord.
+
+Runners are **stateless**, meaning that killing one will kill any
+running tasks and any stored results will be lost.
+
+# Config Files
+
+```json
+{
+  "web-threads": 50,
+  "port":  2504,
+  "ip": "localhost",
+  "capacity-overrides": {
+    "cores": 10,
+    "memoryMB": 100
+  }
+}
+```
+
+Capacities can be overridden from the auto-discovered results.
--- a/daggyr/daggyr/CMakeLists.txt
+++ b/daggyr/daggyr/CMakeLists.txt
@@ -0,0 +1,4 @@
+# The daggyr executable: built from daggyr.cpp and linked against the runner
+# library (libdaggyr), the core library (libdaggy), argparse and curl.
+project(daggyr)
+file(GLOB SOURCES daggyr.cpp)
+add_executable(${PROJECT_NAME} ${SOURCES})
+target_link_libraries(${PROJECT_NAME} argparse libdaggyr libdaggy curl)
--- a/daggyr/daggyr/daggyr.cpp
+++ b/daggyr/daggyr/daggyr.cpp
@@ -0,0 +1,193 @@
+#include <fcntl.h>
+#include <rapidjson/document.h>
+#include <sys/stat.h>
+#include <sys/sysinfo.h>
+#include <unistd.h>
+
+#include <argparse.hpp>
+#include <atomic>
+#include <csignal>
+#include <daggy/Serialization.hpp>
+#include <daggyr/Server.hpp>
+#include <fstream>
+#include <iostream>
+
+// Add executors here
+#include <daggy/executors/task/ForkingTaskExecutor.hpp>
+#include <daggy/executors/task/SlurmTaskExecutor.hpp>
+
+// Add loggers here
+#include <daggy/executors/task/TaskExecutor.hpp>
+#include <daggy/loggers/dag_run/DAGRunLogger.hpp>
+#include <daggy/loggers/dag_run/OStreamLogger.hpp>
+#include <daggy/loggers/dag_run/RedisLogger.hpp>
+
+namespace rj = rapidjson;
+
+static std::atomic<bool> running{true};
+
+/// Handler shared by every signal this process catches.
+/// SIGINT and SIGTERM clear the `running` flag; SIGHUP and any other signal
+/// are deliberately left without effect.
+void signalHandler(int signal)
+{
+  const bool wantsShutdown = (signal == SIGINT) || (signal == SIGTERM);
+  if (wantsShutdown) {
+    running = false;
+  }
+}
+
+/// Detach the process from its controlling terminal using the classic
+/// double-fork sequence.
+///
+/// Returns immediately (doing nothing) when the parent process id is already
+/// 1. Otherwise the calling process and the intermediate child exit, and only
+/// the grandchild returns from this function with:
+///  - SIGCHLD/SIGTSTP/SIGTTOU/SIGTTIN blocked,
+///  - `signalHandler` installed for SIGTERM and SIGINT,
+///  - SIGCHLD and SIGHUP set to SIG_IGN,
+///  - a umask of 0 and "/" as the working directory,
+///  - every inherited descriptor closed, and descriptors 0-2 re-attached to
+///    /dev/null.
+void daemonize()
+{
+  pid_t pid;
+
+  struct sigaction newSigAction;
+  sigset_t newSigSet;
+
+  /* Check if parent process id is set */
+  if (getppid() == 1) {
+    return;
+  }
+
+  /* Set signal mask - signals we want to block */
+  sigemptyset(&newSigSet);
+  sigaddset(&newSigSet,
+            SIGCHLD); /* ignore child - i.e. we don't need to wait for it */
+  sigaddset(&newSigSet, SIGTSTP); /* ignore Tty stop signals */
+  sigaddset(&newSigSet, SIGTTOU); /* ignore Tty background writes */
+  sigaddset(&newSigSet, SIGTTIN); /* ignore Tty background reads */
+  sigprocmask(SIG_BLOCK, &newSigSet,
+              nullptr); /* Block the above specified signals */
+
+  /* Set up a signal handler */
+  newSigAction.sa_handler = signalHandler;
+  sigemptyset(&newSigAction.sa_mask);
+  newSigAction.sa_flags = 0;
+
+  /* Signals to handle */
+  sigaction(SIGHUP, &newSigAction, nullptr);  /* catch hangup signal */
+  sigaction(SIGTERM, &newSigAction, nullptr); /* catch term signal */
+  sigaction(SIGINT, &newSigAction, nullptr);  /* catch interrupt signal */
+
+  // Fork once
+  pid = fork();
+  if (pid < 0) {
+    exit(EXIT_FAILURE);
+  }
+  if (pid > 0) {
+    exit(EXIT_SUCCESS);
+  }
+
+  /* On success: The child process becomes session leader */
+  if (setsid() < 0) {
+    std::cerr << "Unable to setsid" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  /* Catch, ignore and handle signals. This replaces the SIGHUP handler that
+   * was installed above. */
+  /* NOTE(review): with SIGCHLD set to SIG_IGN the kernel reaps children on
+   * its own; confirm the task executor does not depend on waitpid() to obtain
+   * exit codes when running daemonized. */
+  signal(SIGCHLD, SIG_IGN);
+  signal(SIGHUP, SIG_IGN);
+
+  /* Fork off for the second time*/
+  pid = fork();
+  if (pid < 0)
+    exit(EXIT_FAILURE);
+  if (pid > 0)
+    exit(EXIT_SUCCESS);
+
+  umask(0);
+
+  /* Change the working directory to the root directory */
+  /* or another appropriated directory */
+  auto rc = chdir("/");
+  (void)rc;
+
+  /* Close all open file descriptors. sysconf() reports the per-process limit
+   * (one past the highest usable descriptor) as a long, or -1 when the limit
+   * is indeterminate; 1024 is only a fallback guess for that case. */
+  long openMax = sysconf(_SC_OPEN_MAX);
+  if (openMax < 0) {
+    openMax = 1024;
+  }
+  for (long fd = openMax - 1; fd >= 0; --fd) {
+    close(static_cast<int>(fd));
+  }
+
+  /* Re-attach stdin/stdout/stderr to /dev/null. Leaving 0-2 closed would let
+   * the next descriptors opened by the process (e.g. the listening socket)
+   * take those numbers and receive anything written to std::cout/std::cerr. */
+  const int devNull = open("/dev/null", O_RDWR);
+  if (devNull >= 0) {
+    dup2(devNull, STDIN_FILENO);
+    dup2(devNull, STDOUT_FILENO);
+    dup2(devNull, STDERR_FILENO);
+    if (devNull > STDERR_FILENO) {
+      close(devNull);
+    }
+  }
+}
+
+/// Entry point of the daggyr runner.
+///
+/// Parses the command line (`-v/--verbose`, `-d/--daemon`, `--config FILE`),
+/// derives default core and memory capacities from the host, applies any
+/// settings found in the JSON config file, optionally daemonizes, then starts
+/// the REST server and idles until `running` is cleared.
+///
+/// Returns 1 when the arguments cannot be parsed or the config file cannot be
+/// opened or is not a JSON object.
+int main(int argc, char **argv)
+{
+  argparse::ArgumentParser args("Daggy");
+
+  args.add_argument("-v", "--verbose")
+      .default_value(false)
+      .implicit_value(true);
+  args.add_argument("-d", "--daemon").default_value(false).implicit_value(true);
+  args.add_argument("--config").default_value(std::string{});
+
+  try {
+    args.parse_args(argc, argv);
+  }
+  catch (const std::exception &e) {
+    std::cout << "Error: " << e.what() << std::endl;
+    std::cout << args;
+    return 1;
+  }
+
+  bool verbose         = args.get<bool>("--verbose");
+  bool asDaemon        = args.get<bool>("--daemon");
+  auto configFile      = args.get<std::string>("--config");
+  std::string listenIP = "127.0.0.1";
+  int listenPort       = 2504;
+  size_t webThreads    = 50;
+
+  // Default cores: leave two hardware threads free, but always offer at least
+  // one. hardware_concurrency() is unsigned and may report 0 or 1, so the
+  // subtraction is only performed when it cannot wrap around.
+  const unsigned int hwThreads = std::thread::hardware_concurrency();
+  ssize_t maxCores = hwThreads > 2 ? static_cast<ssize_t>(hwThreads - 2) : 1;
+
+  // Default memory: 75% of total RAM, never below 100 MB. totalram is counted
+  // in units of mem_unit bytes. If sysinfo() fails the 100 MB floor is used.
+  ssize_t maxMemoryMB = 100;
+  struct sysinfo systemInfo;
+  if (sysinfo(&systemInfo) == 0) {
+    const double totalMB = static_cast<double>(systemInfo.totalram) *
+                           static_cast<double>(systemInfo.mem_unit) /
+                           (1024.0 * 1024.0);
+    maxMemoryMB = static_cast<ssize_t>(std::max(totalMB * 0.75, 100.0));
+  }
+
+  if (!configFile.empty()) {
+    std::ifstream ifh(configFile);
+    if (!ifh) {
+      std::cerr << "Error: unable to open config file " << configFile
+                << std::endl;
+      return 1;
+    }
+    // Reads the whole file (up to the first NUL byte) into one string
+    std::string config;
+    std::getline(ifh, config, '\0');
+    ifh.close();
+
+    rj::Document doc;
+    daggy::checkRJParse(doc.Parse(config.c_str()));
+    if (!doc.IsObject()) {
+      std::cerr << "Error: config file " << configFile
+                << " must contain a JSON object" << std::endl;
+      return 1;
+    }
+
+    if (doc.HasMember("ip"))
+      listenIP = doc["ip"].GetString();
+    if (doc.HasMember("port"))
+      listenPort = doc["port"].GetInt();
+    if (doc.HasMember("web-threads"))
+      webThreads = doc["web-threads"].GetInt64();
+
+    // Both the key spellings originally parsed here and the ones shown in the
+    // README are accepted.
+    for (const char *section : {"capacity-overrides", "capacity_overrides"}) {
+      if (!doc.HasMember(section) || !doc[section].IsObject())
+        continue;
+      const auto &co = doc[section];
+      if (co.HasMember("cores"))
+        maxCores = co["cores"].GetInt64();
+      for (const char *key : {"memoryMB", "memory_mb"}) {
+        if (co.HasMember(key))
+          maxMemoryMB = co[key].GetInt64();
+      }
+    }
+  }
+
+  if (verbose) {
+    std::cout << "Server running at http://" << listenIP << ':' << listenPort
+              << std::endl
+              << "Max Cores: " << maxCores << std::endl
+              << "Max Memory: " << maxMemoryMB << " MB" << std::endl
+              << "Max Web Clients: " << webThreads << std::endl
+              << std::endl
+              << "Ctrl-C to exit" << std::endl;
+  }
+
+  if (asDaemon) {
+    daemonize();
+  }
+
+  Pistache::Address listenSpec(listenIP, listenPort);
+  daggy::daggyr::Server server(listenSpec, maxCores, maxMemoryMB);
+  server.init(webThreads);
+  server.start();
+
+  // `running` starts out true and is only ever cleared by signalHandler, so it
+  // is not reset here: doing so could discard a shutdown request that arrived
+  // while the server was starting.
+  while (running) {
+    std::this_thread::sleep_for(std::chrono::seconds(30));
+  }
+  server.shutdown();
+}
--- a/daggyr/libdaggyr/CMakeLists.txt
+++ b/daggyr/libdaggyr/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Static library for the runner; its sources are added by the src
+# subdirectory and its public headers live under include/.
+project(libdaggyr)
+
+add_library(${PROJECT_NAME} STATIC)
+
+target_include_directories(${PROJECT_NAME} PUBLIC include)
+target_link_libraries(${PROJECT_NAME} libdaggy stdc++fs)
+
+add_subdirectory(src)
--- a/daggyr/libdaggyr/include/daggyr/Server.hpp
+++ b/daggyr/libdaggyr/include/daggyr/Server.hpp
@@ -0,0 +1,84 @@
+#pragma once
+
+#include <pistache/description.h>
+#include <pistache/endpoint.h>
+#include <pistache/http.h>
+
+#include <daggy/DAGRunner.hpp>
+#include <daggy/ThreadPool.hpp>
+#include <daggy/executors/task/DaggyRunnerTaskExecutor.hpp>
+#include <daggy/executors/task/ForkingTaskExecutor.hpp>
+#include <daggy/loggers/dag_run/DAGRunLogger.hpp>
+#include <filesystem>
+
+// Declares a REST handler member function named `func` taking the Pistache
+// request and response writer. The expansion already ends in ';', so the ';'
+// written after each use is a redundant empty declaration.
+#define DAGGY_REST_HANDLER(func)                    \
+  void func(const Pistache::Rest::Request &request, \
+            Pistache::Http::ResponseWriter response);
+
+namespace fs = std::filesystem;
+using namespace daggy::executors::task::daggy_runner;
+
+namespace daggy::daggyr {
+
+  /// REST server that accepts tasks over HTTP, runs them through a
+  /// ForkingTaskExecutor, and tracks how much of its core/memory capacity is
+  /// currently reserved by submitted tasks.
+  class Server
+  {
+  public:
+    /// @param listenSpec  Address the HTTP endpoint is constructed with.
+    /// @param maxCores    Core capacity; also passed to the executor.
+    /// @param maxMemoryMB Memory capacity in MB.
+    Server(const Pistache::Address &listenSpec, ssize_t maxCores,
+           ssize_t maxMemoryMB);
+    /// Calls shutdown().
+    ~Server();
+
+    /// Enable SSL on the endpoint with the given certificate and key files.
+    /// Returns *this so calls can be chained.
+    Server &setSSLCertificates(const fs::path &cert, const fs::path &key);
+
+    /// Configure the endpoint with `threads` worker threads and register the
+    /// routes.
+    void init(size_t threads = 1);
+
+    /// Install the routes on the endpoint and begin serving in the background.
+    void start();
+
+    /// Port reported by the endpoint.
+    uint16_t getPort() const;
+
+    /// Shut the endpoint down.
+    void shutdown();
+
+    // NOTE(review): no definition of this function is present in Server.cpp.
+    static void validateTask(const Task &task);
+
+  private:
+    /// Populate desc_ with the API metadata and routes.
+    void createDescription();
+
+    /// Authentication hook; when it returns false the caller must stop
+    /// handling the request.
+    bool handleAuth(const Pistache::Rest::Request &request);
+
+    DAGGY_REST_HANDLER(handleReady);
+    DAGGY_REST_HANDLER(handleGetCapacity);
+    DAGGY_REST_HANDLER(handleRunTask);
+    DAGGY_REST_HANDLER(handleGetTask);
+    DAGGY_REST_HANDLER(handleStopTask);
+    DAGGY_REST_HANDLER(handleValidateTask);
+
+    // Declaration order below is also the order the constructor's initializer
+    // list follows.
+    Pistache::Http::Endpoint endpoint_;
+    Pistache::Rest::Description desc_;
+    Pistache::Rest::Router router_;
+
+    // Runs the submitted tasks
+    executors::task::ForkingTaskExecutor executor_;
+
+    // Not referenced anywhere in Server.cpp.
+    struct TaskRecord
+    {
+      RunState state;
+      AttemptRecord attempt;
+    };
+
+    // capacityGuard_ is held whenever curCapacity_ is read or modified
+    std::mutex capacityGuard_;
+    Capacity maxCapacity_;  // total capacity, set at construction
+    Capacity curCapacity_;  // capacity not currently reserved by tasks
+
+    // pendingGuard_ protects pending_
+    std::mutex pendingGuard_;
+
+    // A submitted task: the future yielding its AttemptRecord and the
+    // capacity that was reserved for it.
+    struct PendingJob
+    {
+      std::future<AttemptRecord> fut;
+      Capacity resourcesUsed;
+    };
+
+    // Submitted tasks, keyed by (run id, task name)
+    std::unordered_map<std::pair<DAGRunID, std::string>, PendingJob> pending_;
+
+    // resultsGuard_ protects results_, the attempt records of finished tasks
+    // keyed by (run id, task name)
+    std::mutex resultsGuard_;
+    std::unordered_map<std::pair<DAGRunID, std::string>, AttemptRecord>
+        results_;
+  };
+}  // namespace daggy::daggyr
--- a/daggyr/libdaggyr/src/CMakeLists.txt
+++ b/daggyr/libdaggyr/src/CMakeLists.txt
@@ -0,0 +1,3 @@
+# Source files compiled into the enclosing project's target.
+target_sources(${PROJECT_NAME} PRIVATE
+    Server.cpp
+)
--- a/daggyr/libdaggyr/src/Server.cpp
+++ b/daggyr/libdaggyr/src/Server.cpp
@@ -0,0 +1,259 @@
+#include <enum.h>
+
+#include <daggy/Serialization.hpp>
+#include <daggy/executors/task/DaggyRunnerTaskExecutor.hpp>
+#include <daggyr/Server.hpp>
+#include <iomanip>
+#include <mutex>
+#include <sstream>
+#include <stdexcept>
+#include <thread>
+#include <utility>
+
+#define REQ_RESPONSE(code, msg)                        \
+  std::stringstream ss;                                \
+  ss << R"({"message": )" << std::quoted(msg) << "}";  \
+  response.send(Pistache::Http::Code::code, ss.str()); \
+  return;
+
+using namespace Pistache;
+
+namespace daggy::daggyr {
+  /// Configure the HTTP endpoint (worker thread count, address/port reuse
+  /// flags, and a 102400-byte limit on request and response sizes), then
+  /// build the route description.
+  void Server::init(size_t threads)
+  {
+    const auto socketFlags =
+        Pistache::Tcp::Options::ReuseAddr | Pistache::Tcp::Options::ReusePort;
+
+    auto endpointOptions = Http::Endpoint::options();
+    endpointOptions.threads(threads)
+        .flags(socketFlags)
+        .maxRequestSize(102400)
+        .maxResponseSize(102400);
+
+    endpoint_.init(endpointOptions);
+    createDescription();
+  }
+
+  /// Construct a server bound to `listenSpec`. `maxCores` sizes the executor
+  /// and, together with `maxMemoryMB`, seeds both the total and the currently
+  /// available capacity.
+  Server::Server(const Pistache::Address &listenSpec, ssize_t maxCores,
+                 ssize_t maxMemoryMB)
+    : endpoint_(listenSpec)
+    , desc_("Daggy Runner API", "0.1")
+    , executor_(maxCores)
+    , maxCapacity_{maxCores, maxMemoryMB}
+    , curCapacity_{maxCores, maxMemoryMB}
+  {
+  }
+
+  /// Shuts the endpoint down on destruction.
+  /// NOTE(review): callers in this change also call shutdown() explicitly
+  /// before destruction, so the endpoint is shut down twice — confirm that is
+  /// safe for Pistache.
+  Server::~Server()
+  {
+    shutdown();
+  }
+
+  /// Build the router from the route description, attach it to the endpoint
+  /// and start serving via serveThreaded().
+  void Server::start()
+  {
+    router_.initFromDescription(desc_);
+
+    endpoint_.setHandler(router_.handler());
+    endpoint_.serveThreaded();
+  }
+
+  /// Enable SSL on the endpoint using the given certificate and key paths.
+  /// Returns *this so the call can be chained.
+  Server &Server::setSSLCertificates(const fs::path &cert, const fs::path &key)
+  {
+    endpoint_.useSSL(cert, key);
+    return *this;
+  }
+
+  /// Shut down the HTTP endpoint. Also invoked by the destructor.
+  void Server::shutdown()
+  {
+    endpoint_.shutdown();
+  }
+
+  /// Port the endpoint reports. The unit test binds to port 0 and uses this
+  /// to learn which port to connect to.
+  uint16_t Server::getPort() const
+  {
+    return endpoint_.getPort();
+  }
+
+  /// Fill in the API description: license, scheme, MIME types and every
+  /// route with the handler bound to it.
+  ///
+  /// Routes registered here:
+  ///   GET    /ready                       -> handleReady
+  ///   POST   /v1/validate                 -> handleValidateTask
+  ///   POST   /v1/task/:runID/:taskName    -> handleRunTask
+  ///   GET    /v1/task/:runID/:taskName    -> handleGetTask
+  ///   DELETE /v1/task/:runID/:taskName    -> handleStopTask
+  ///   GET    /v1/capacity                 -> handleGetCapacity
+  void Server::createDescription()
+  {
+    desc_.info().license("MIT", "https://opensource.org/licenses/MIT");
+
+    desc_.schemes(Rest::Scheme::Http)
+        .basePath("/v1")
+        .produces(MIME(Application, Json))
+        .consumes(MIME(Application, Json));
+
+    // /ready is registered on the description itself, outside the /v1 path
+    desc_.route(desc_.get("/ready"))
+        .bind(&Server::handleReady, this)
+        .response(Http::Code::Ok, "Response to the /ready call")
+        .hide();
+
+    // Everything below is registered under the /v1 path
+    auto versionPath = desc_.path("/v1");
+
+    versionPath.route(desc_.post("/validate"))
+        .bind(&Server::handleValidateTask, this)
+        .produces(MIME(Application, Json))
+        .response(Http::Code::Ok, "Validate a task");
+
+    versionPath.route(desc_.post("/task/:runID/:taskName"))
+        .bind(&Server::handleRunTask, this)
+        .produces(MIME(Application, Json))
+        .response(Http::Code::Ok, "Run a task");
+
+    versionPath.route(desc_.get("/task/:runID/:taskName"))
+        .bind(&Server::handleGetTask, this)
+        .produces(MIME(Application, Json))
+        .response(Http::Code::Ok,
+                  "Get the state and potentially the AttemptRecord of a task");
+
+    versionPath.route(desc_.del("/task/:runID/:taskName"))
+        .bind(&Server::handleStopTask, this)
+        .produces(MIME(Application, Json))
+        .response(Http::Code::Ok, "Stop a task");
+
+    versionPath.route(desc_.get("/capacity"))
+        .bind(&Server::handleGetCapacity, this)
+        .produces(MIME(Application, Json))
+        .response(Http::Code::Ok, "Get capacities of worker");
+  }
+
+  /// POST /v1/validate — parse the request body as a task and validate its
+  /// job parameters. Replies Not_Acceptable with the exception text when
+  /// either step throws, otherwise Ok.
+  void Server::handleValidateTask(const Pistache::Rest::Request &request,
+                                  Pistache::Http::ResponseWriter response)
+  {
+    bool accepted = true;
+    std::string reason;
+
+    try {
+      auto candidate = taskFromJSON("sample_task", request.body());
+      daggy::executors::task::daggy_runner::validateTaskParameters(
+          candidate.job);
+    }
+    catch (std::exception &err) {
+      accepted = false;
+      reason   = err.what();
+    }
+
+    if (!accepted) {
+      REQ_RESPONSE(Not_Acceptable, reason);
+    }
+    REQ_RESPONSE(Ok, "Task is valid");
+  }
+
+  /// POST /v1/task/:runID/:taskName — parse the task in the request body,
+  /// hand it to the executor and reserve the capacity it declares.
+  ///
+  /// Replies Not_Acceptable when the body cannot be turned into a task and a
+  /// capacity, or when the same (runID, taskName) has been submitted and its
+  /// result has not been collected yet. Replies Ok with an empty body once the
+  /// task has been handed to the executor; the outcome is retrieved later
+  /// through handleGetTask.
+  void Server::handleRunTask(const Pistache::Rest::Request &request,
+                             Pistache::Http::ResponseWriter response)
+  {
+    if (!handleAuth(request))
+      return;
+
+    auto runID    = request.param(":runID").as<DAGRunID>();
+    auto taskName = request.param(":taskName").as<std::string>();
+
+    Capacity resourcesUsed;
+    Task task;
+    try {
+      task          = taskFromJSON(taskName, request.body());
+      resourcesUsed = capacityFromTask(task);
+    }
+    catch (std::exception &e) {
+      REQ_RESPONSE(Not_Acceptable, e.what());
+    }
+
+    // NOTE(review): a task is accepted even when it asks for more than
+    // curCapacity_ currently holds, so the counters can go negative — confirm
+    // that callers are expected to consult /v1/capacity before submitting.
+
+    {
+      std::lock_guard<std::mutex> lock(pendingGuard_);
+      const auto taskID = std::make_pair(runID, taskName);
+
+      // emplace() does nothing when the key already exists, which would throw
+      // away the new future while still charging capacity. Refuse a duplicate
+      // whose future has not been consumed, and clear out an entry whose
+      // future was already consumed so the task can be run again.
+      auto existing = pending_.find(taskID);
+      if (existing != pending_.end()) {
+        if (existing->second.fut.valid()) {
+          REQ_RESPONSE(Not_Acceptable, "Task already submitted");
+        }
+        pending_.erase(existing);
+      }
+
+      pending_.emplace(
+          taskID,
+          PendingJob{.fut           = executor_.execute(runID, taskName, task),
+                     .resourcesUsed = resourcesUsed});
+
+      // Charge capacity only after the task is recorded, so a throwing
+      // execute() cannot leak a reservation. Locks are taken in the same
+      // order as in handleGetTask (pending first, then capacity).
+      std::lock_guard<std::mutex> capacityLock(capacityGuard_);
+      curCapacity_.cores -= resourcesUsed.cores;
+      curCapacity_.memoryMB -= resourcesUsed.memoryMB;
+    }
+
+    response.send(Pistache::Http::Code::Ok, "");
+  }
+
+  /// GET /v1/task/:runID/:taskName — report the state of a submitted task.
+  ///
+  /// Replies with `{ "state": "RUNNING" }` while the task's future is not
+  /// ready. Once it is ready the attempt record is moved into results_, the
+  /// capacity reserved for the task is released, the entry is removed from
+  /// pending_, and this and every later call reply with
+  /// `{ "state": "COMPLETED", "attempt": ... }`. Replies Not_Found when the
+  /// task is neither pending nor has a stored result.
+  ///
+  /// Stored results are never removed by any code in this file.
+  void Server::handleGetTask(const Pistache::Rest::Request &request,
+                             Pistache::Http::ResponseWriter response)
+  {
+    if (!handleAuth(request))
+      return;
+
+    auto runID    = request.param(":runID").as<DAGRunID>();
+    auto taskName = request.param(":taskName").as<std::string>();
+
+    const auto taskID = std::make_pair(runID, taskName);
+
+    bool stillRunning = false;
+    {
+      std::lock_guard<std::mutex> lock(pendingGuard_);
+      auto it = pending_.find(taskID);
+      if (it != pending_.end()) {
+        auto &job = it->second;
+        if (!job.fut.valid()) {
+          // Nothing left to collect from this entry; drop it so it cannot
+          // mask the stored result.
+          pending_.erase(it);
+        }
+        else if (job.fut.wait_for(1ms) != std::future_status::ready) {
+          stillRunning = true;
+        }
+        else {
+          // Take the job out of pending_ before collecting it. Leaving the
+          // entry behind would make later polls see a consumed future and
+          // report RUNNING forever.
+          PendingJob finished = std::move(job);
+          pending_.erase(it);
+          {
+            std::lock_guard<std::mutex> capacityLock(capacityGuard_);
+            curCapacity_.cores += finished.resourcesUsed.cores;
+            curCapacity_.memoryMB += finished.resourcesUsed.memoryMB;
+          }
+
+          auto attempt = finished.fut.get();
+          // Overwrite any record left by an earlier run of the same task
+          std::lock_guard<std::mutex> resultsLock(resultsGuard_);
+          results_.insert_or_assign(taskID, std::move(attempt));
+        }
+      }
+    }
+
+    if (stillRunning) {
+      response.send(Pistache::Http::Code::Ok, R"({ "state": "RUNNING" })");
+      return;
+    }
+
+    std::string payload;
+    bool known = false;
+    {
+      std::lock_guard<std::mutex> lock(resultsGuard_);
+      auto it = results_.find(taskID);
+      if (it != results_.end()) {
+        known   = true;
+        payload = R"({ "state": "COMPLETED", "attempt": )" +
+                  attemptRecordToJSON(it->second) + "}";
+      }
+    }
+
+    if (!known) {
+      REQ_RESPONSE(Not_Found, "No such task");
+    }
+    response.send(Pistache::Http::Code::Ok, payload);
+  }
+
+  /// DELETE /v1/task/:runID/:taskName — ask the executor to stop the named
+  /// task, then reply Ok with an empty message.
+  void Server::handleStopTask(const Pistache::Rest::Request &request,
+                              Pistache::Http::ResponseWriter response)
+  {
+    if (!handleAuth(request)) {
+      return;
+    }
+
+    const auto runID    = request.param(":runID").as<DAGRunID>();
+    const auto taskName = request.param(":taskName").as<std::string>();
+    executor_.stop(runID, taskName);
+
+    REQ_RESPONSE(Ok, "");
+  }
+
+  /// GET /v1/capacity — reply with a JSON object holding the currently
+  /// available capacity under "current" and the configured maximum under
+  /// "total".
+  void Server::handleGetCapacity(const Pistache::Rest::Request &request,
+                                 Pistache::Http::ResponseWriter response)
+  {
+    std::string body = R"({ "current": )";
+    {
+      // Both values are serialized under the capacity lock
+      std::lock_guard<std::mutex> guard(capacityGuard_);
+      body += capacityToJSON(curCapacity_);
+      body += R"(, "total": )";
+      body += capacityToJSON(maxCapacity_);
+    }
+    body += "}";
+
+    response.send(Pistache::Http::Code::Ok, body);
+  }
+
+  /// GET /ready — liveness probe; always replies Ok with a fixed JSON message.
+  /// The request itself is not inspected.
+  void Server::handleReady(const Pistache::Rest::Request &request,
+                           Pistache::Http::ResponseWriter response)
+  {
+    response.send(Pistache::Http::Code::Ok, R"({ "msg": "Ready for tasks!"})");
+  }
+
+  /*
+   * handleAuth will check any auth methods and handle any responses in the
+   * case of failed auth. If it returns false, callers should cease handling
+   * the response.
+   *
+   * No auth method is implemented yet: the request is ignored and every call
+   * returns true.
+   */
+  bool Server::handleAuth(const Pistache::Rest::Request &request)
+  {
+    return true;
+  }
+}  // namespace daggy::daggyr
--- a/daggyr/tests/CMakeLists.txt
+++ b/daggyr/tests/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Catch2 test executable for the runner, registered as a test via add_test.
+project(daggyr_tests)
+
+add_executable(${PROJECT_NAME} main.cpp
+        # unit tests
+        unit_server.cpp
+        )
+target_link_libraries(${PROJECT_NAME} libdaggyr libdaggy stdc++fs Catch2::Catch2)
+
+add_test(${PROJECT_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME})
--- a/daggyr/tests/main.cpp
+++ b/daggyr/tests/main.cpp
@@ -0,0 +1,15 @@
+#include <iostream>
+
+#include "daggy/DAG.hpp"
+
+#define CATCH_CONFIG_MAIN
+
+#include <catch2/catch.hpp>
+
+// Trivial always-true check: confirms the test binary builds and the Catch2
+// runner executes.
+TEST_CASE("Sanity tests", "[sanity]")
+{
+  REQUIRE(1 == 1);
+}
+
+// compile and run
+// g++ -std=c++17 -o test test.cpp  && ./test
--- a/daggyr/tests/unit_server.cpp
+++ b/daggyr/tests/unit_server.cpp
@@ -0,0 +1,172 @@
+#include <curl/curl.h>
+#include <rapidjson/document.h>
+#include <sys/stat.h>
+
+#include <catch2/catch.hpp>
+#include <daggy/Serialization.hpp>
+#include <daggy/executors/task/ForkingTaskExecutor.hpp>
+#include <daggy/executors/task/NoopTaskExecutor.hpp>
+#include <daggy/loggers/dag_run/OStreamLogger.hpp>
+#include <daggyr/Server.hpp>
+#include <filesystem>
+#include <iostream>
+#include <thread>
+
+namespace rj = rapidjson;
+
+using namespace daggy;
+
+// End-to-end test of the runner's REST API: starts a real Server on an
+// OS-assigned port and drives it over HTTP. Catch2 re-runs the setup above the
+// SECTIONs for each section, so every section gets a fresh server.
+TEST_CASE("rest_endpoint", "[server_basic]")
+{
+  std::stringstream ss;
+  // Port 0: the actual port is read back through server.getPort() below
+  Pistache::Address listenSpec("localhost", Pistache::Port(0));
+
+  const ssize_t maxCores = 10, maxMemoryMB = 1000;
+
+  daggyr::Server server(listenSpec, maxCores, maxMemoryMB);
+  server.init(10);
+  server.start();
+
+  const std::string host    = "localhost:";
+  const std::string baseURL = host + std::to_string(server.getPort());
+
+  SECTION("Ready Endpoint")
+  {
+    auto response = HTTP_REQUEST(baseURL + "/ready");
+    REQUIRE(response.code == HTTPCode::Ok);
+  }
+
+  SECTION("Querying a non-existent task should yield a 404")
+  {
+    auto response = HTTP_REQUEST(baseURL + "/v1/task/100/sample_name");
+    REQUIRE(response.code == HTTPCode::Not_Found);
+  }
+
+  SECTION("Task Missing Cores should Fail")
+  {
+    // NOTE(review): here "memoryMB" sits outside "job" and is a number, while
+    // the accepted spec further down puts both values inside "job" as strings
+    // — so this payload may be rejected for more than the missing cores.
+    std::string taskSpec =
+        R"({ "job": { "command": [ "/usr/bin/touch", "dagrun_{{FILE}}" ]}, "memoryMB": 100 })";
+
+    auto response =
+        HTTP_REQUEST(baseURL + "/v1/task/0/sample_task", taskSpec, "POST");
+    REQUIRE(response.code == HTTPCode::Not_Acceptable);
+  }
+
+  SECTION("Task Missing MemoryMB should Fail")
+  {
+    // NOTE(review): same placement caveat as above, for "cores".
+    std::string taskSpec =
+        R"({ "job": { "command": [ "/usr/bin/touch", "dagrun_{{FILE}}" ]}, "cores": 100 })";
+
+    auto response =
+        HTTP_REQUEST(baseURL + "/v1/task/0/sample_task", taskSpec, "POST");
+    REQUIRE(response.code == HTTPCode::Not_Acceptable);
+  }
+
+  SECTION("Task submission and get result")
+  {
+    std::string taskSpec =
+        R"({ "job": { "command": [ "/usr/bin/echo", "hello", "world" ], "cores": "1", "memoryMB": "100" } })";
+
+    // Submit
+    {
+      auto response =
+          HTTP_REQUEST(baseURL + "/v1/task/0/sample_task", taskSpec, "POST");
+      REQUIRE(response.code == HTTPCode::Ok);
+    }
+
+    // Poll until the task reports COMPLETED. There is no timeout, so the test
+    // hangs if that state is never reached.
+    while (true) {
+      auto [code, doc] = JSON_HTTP_REQUEST(baseURL + "/v1/task/0/sample_task");
+      REQUIRE(doc.IsObject());
+      REQUIRE(doc.HasMember("state"));
+
+      std::string state = doc["state"].GetString();
+      if (state != "COMPLETED") {
+        std::this_thread::sleep_for(250ms);
+      }
+      else {
+        REQUIRE(doc.HasMember("attempt"));
+        auto attempt = attemptRecordFromJSON(doc["attempt"]);
+
+        REQUIRE(attempt.rc == 0);
+        REQUIRE(attempt.outputLog == "hello world\n");
+        break;
+      }
+    }
+  }
+
+  SECTION("Task capacity changes")
+  {
+    // A 5 second sleep keeps the task alive long enough to observe the
+    // reserved capacity and to stop it
+    std::string taskSpec =
+        R"({ "job": { "command": [ "/usr/bin/sleep", "5" ], "cores": "1", "memoryMB": "100" } })";
+
+    // Fetches /v1/capacity and returns its "current" object as a Capacity
+    auto getCapacity = [&]() -> daggy::executors::task::daggy_runner::Capacity {
+      daggy::executors::task::daggy_runner::Capacity cap;
+      auto [code, doc] = JSON_HTTP_REQUEST(baseURL + "/v1/capacity");
+      REQUIRE(doc.IsObject());
+      REQUIRE(doc.HasMember("current"));
+      const auto &cur = doc["current"];
+      REQUIRE(cur.IsObject());
+      REQUIRE(cur.HasMember("cores"));
+      REQUIRE(cur.HasMember("memoryMB"));
+
+      cap.cores    = cur["cores"].GetInt64();
+      cap.memoryMB = cur["memoryMB"].GetInt64();
+
+      return cap;
+    };
+
+    auto preCap = getCapacity();
+
+    // Submit
+    {
+      auto response =
+          HTTP_REQUEST(baseURL + "/v1/task/0/sample_task", taskSpec, "POST");
+      REQUIRE(response.code == HTTPCode::Ok);
+    }
+
+    auto postCap = getCapacity();
+
+    // Submitting must reserve exactly what the task asked for
+    REQUIRE(postCap.cores == preCap.cores - 1);
+    REQUIRE(postCap.memoryMB == preCap.memoryMB - 100);
+
+    // Ensure the current job is running
+    {
+      auto [code, doc] = JSON_HTTP_REQUEST(baseURL + "/v1/task/0/sample_task");
+      REQUIRE(doc.IsObject());
+      REQUIRE(doc.HasMember("state"));
+      REQUIRE(doc["state"] != "COMPLETED");
+    }
+
+    // Stop it
+    {
+      auto [code, doc] =
+          JSON_HTTP_REQUEST(baseURL + "/v1/task/0/sample_task", "", "DELETE");
+      REQUIRE(code == HTTPCode::Ok);
+    }
+
+    // Grab it and ensure it was killed (again polling without a timeout)
+    while (true) {
+      auto response = HTTP_REQUEST(baseURL + "/v1/task/0/sample_task");
+
+      REQUIRE(response.code == HTTPCode::Ok);
+      rj::Document doc;
+      daggy::checkRJParse(doc.Parse(response.body.c_str()));
+      REQUIRE(doc.IsObject());
+      REQUIRE(doc.HasMember("state"));
+
+      std::string state = doc["state"].GetString();
+      if (state != "COMPLETED") {
+        std::this_thread::sleep_for(250ms);
+      }
+      else {
+        REQUIRE(doc.HasMember("attempt"));
+        auto attempt = attemptRecordFromJSON(doc["attempt"]);
+
+        // A stopped task is expected to report a non-zero return code
+        REQUIRE(attempt.rc != 0);
+        break;
+      }
+    }
+  }
+
+  server.shutdown();
+}