Large re-organization to split daggyd away from the core libdaggy.
This paves the way for implementing daggys and other utilities. Squashed commit of the following: commit 1f77239ab3c9e44d190eef94531a39501c8c4dfe Author: Ian Roddis <gitlab@ie2r.com> Date: Mon Oct 18 16:25:02 2021 -0300 Adding README, stdout support for daggyd logging commit c2c237224e84a3be68aaa597ce98af1365e74a13 Author: Ian Roddis <gitlab@ie2r.com> Date: Mon Oct 18 16:10:29 2021 -0300 removing old daggyd commit cfea2baf61ca10c535801c5a391d2d525a1a2d04 Author: Ian Roddis <gitlab@ie2r.com> Date: Mon Oct 18 16:10:09 2021 -0300 Moving tests into their sub-project folders commit e41ca42069bea1db16dd76b6684a3f692fef6b15 Author: Ian Roddis <gitlab@ie2r.com> Date: Mon Oct 18 15:57:40 2021 -0300 Splitting out daggyd from libdaggy commit be97b146c1d2446f5c03cb78707e921f18c60bd8 Author: Ian Roddis <gitlab@ie2r.com> Date: Mon Oct 18 15:56:55 2021 -0300 Splitting out daggyd from libdaggy commit cb61e140e9d6d8832d61fb7037fd4c0ff6edad00 Author: Ian Roddis <gitlab@ie2r.com> Date: Mon Oct 18 15:49:47 2021 -0300 moving daggy to libdaggy
This commit is contained in:
1
libdaggy/src/executors/CMakeLists.txt
Normal file
1
libdaggy/src/executors/CMakeLists.txt
Normal file
@@ -0,0 +1 @@
|
||||
# Descend into the task/ subdirectory and process its CMakeLists.txt.
add_subdirectory(task)
|
||||
5
libdaggy/src/executors/task/CMakeLists.txt
Normal file
5
libdaggy/src/executors/task/CMakeLists.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
# Attach the task executor implementations in this directory to the
# ${PROJECT_NAME} target as private sources.
target_sources(${PROJECT_NAME}
  PRIVATE
    SlurmTaskExecutor.cpp
    NoopTaskExecutor.cpp
    ForkingTaskExecutor.cpp
)
|
||||
229
libdaggy/src/executors/task/ForkingTaskExecutor.cpp
Normal file
229
libdaggy/src/executors/task/ForkingTaskExecutor.cpp
Normal file
@@ -0,0 +1,229 @@
|
||||
#include <fcntl.h>
|
||||
#include <poll.h>
|
||||
#include <unistd.h>
|
||||
#include <wait.h>
|
||||
|
||||
#include <daggy/Utilities.hpp>
|
||||
#include <daggy/executors/task/ForkingTaskExecutor.hpp>
|
||||
#include <iomanip>
|
||||
|
||||
using namespace daggy::executors::task;
|
||||
|
||||
/// Read everything that is currently available on `fd` and return it.
///
/// The descriptor is polled with a 1ms timeout before every read, so the call
/// returns as soon as no more input is pending rather than blocking until
/// end-of-file.
///
/// @param fd  Descriptor to read from.
/// @return    The bytes read; empty when nothing was pending, on end-of-file,
///            or when poll()/read() failed.
std::string slurp(int fd)
{
  std::string result;

  constexpr size_t BUFFER_SIZE = 4096;
  char buffer[BUFFER_SIZE];

  struct pollfd pfd{};
  pfd.fd = fd;
  pfd.events = POLLIN;

  while (true) {
    pfd.revents = 0;
    const int ready = poll(&pfd, 1, 1);
    if (ready < 0) {
      // A signal interrupting the wait is not a reason to give up.
      if (errno == EINTR)
        continue;
      break;
    }
    if (ready == 0 || !(pfd.revents & POLLIN))
      break;

    const ssize_t bytes = read(fd, buffer, BUFFER_SIZE);
    if (bytes > 0) {
      result.append(buffer, static_cast<size_t>(bytes));
      continue;
    }
    if (bytes < 0 && errno == EINTR)
      continue;

    // bytes == 0 is end-of-file; bytes < 0 is a read error. A negative count
    // must never reach append(), where it would convert to a huge length.
    break;
  }

  return result;
}
|
||||
|
||||
/// Construct the executor, initialising the `tp_` member with `nThreads`.
ForkingTaskExecutor::ForkingTaskExecutor(size_t nThreads) : tp_(nThreads) {}
|
||||
|
||||
/// Drop every entry from the task-control table while holding its mutex.
ForkingTaskExecutor::~ForkingTaskExecutor()
{
  const std::lock_guard<std::mutex> guard(taskControlsGuard_);
  taskControls_.clear();
}
|
||||
|
||||
/// Ask a task to stop by clearing its control flag.
///
/// The flag is looked up under the key "<runID>_<taskName>". A task with no
/// entry is left alone.
///
/// @return Always true, whether or not a matching entry was found.
bool ForkingTaskExecutor::stop(DAGRunID runID, const std::string &taskName)
{
  std::string controlKey = std::to_string(runID) + "_" + taskName;

  std::lock_guard<std::mutex> guard(taskControlsGuard_);
  if (auto entry = taskControls_.find(controlKey);
      entry != taskControls_.end()) {
    entry->second = false;
  }
  return true;
}
|
||||
|
||||
/// Queue `task` on `tp_` and return the future for its attempt record.
///
/// A control flag is registered under "<runID>_<taskName>" (initially true)
/// and handed to runTask() by reference; stop() clears it. The queued job
/// removes the entry again once runTask() has returned.
std::future<daggy::AttemptRecord> ForkingTaskExecutor::execute(
    DAGRunID runID, const std::string &taskName, const Task &task)
{
  std::string key = std::to_string(runID) + "_" + taskName;

  std::lock_guard<std::mutex> guard(taskControlsGuard_);
  auto &keepRunning = taskControls_.emplace(key, true).first->second;

  return tp_.addTask([this, task, &keepRunning, key]() {
    auto record = runTask(task, keepRunning);

    // The flag is no longer needed once the attempt has finished.
    std::lock_guard<std::mutex> cleanupGuard(taskControlsGuard_);
    taskControls_.extract(key);
    return record;
  });
}
|
||||
|
||||
/// Fork a child, exec the task's command in it, and collect the outcome.
///
/// The command comes from `commandString` (split with std::quoted) when that
/// key is present, otherwise from the `command` array. The optional
/// `environment` array becomes the child's environment. The child's stdout and
/// stderr are captured through two pipes read by helper threads.
///
/// The parent checks every 250ms whether the child has exited or `running` has
/// been cleared. When `running` is cleared the child is sent SIGKILL until the
/// signal can no longer be delivered.
///
/// @param task     Task whose `job` values describe what to run.
/// @param running  Flag cleared by stop() to request that the child be killed.
/// @return Attempt record with start/stop time, captured output, and `rc` set
///         to the child's `si_status`, or -1 when no exit was collected.
/// @throws std::runtime_error if a pipe cannot be created or fork() fails.
daggy::AttemptRecord ForkingTaskExecutor::runTask(const Task &task,
                                                  std::atomic<bool> &running)
{
  AttemptRecord rec;

  rec.startTime = Clock::now();

  // Populate the command
  Command command;
  if (task.job.count("commandString")) {
    std::stringstream ss;
    ss << std::get<std::string>(task.job.at("commandString"));
    std::string tok;
    while (ss >> std::quoted(tok)) {
      command.push_back(tok);
    }
  }
  else {
    const auto &cmd = std::get<Command>(task.job.at("command"));
    std::copy(cmd.begin(), cmd.end(), std::back_inserter(command));
  }

  // exec wants NULL-terminated arrays of mutable C strings; the pointers
  // borrow from `command` / `environment`, which outlive the fork.
  std::vector<char *> argv;
  std::transform(
      command.begin(), command.end(), std::back_inserter(argv),
      [](const std::string &s) { return const_cast<char *>(s.c_str()); });
  argv.push_back(nullptr);

  // Populate the environment
  auto environment = (task.job.count("environment") == 0
                          ? std::vector<std::string>{}
                          : std::get<Command>(task.job.at("environment")));
  std::vector<char *> envp;
  std::transform(
      environment.begin(), environment.end(), std::back_inserter(envp),
      [](const std::string &s) { return const_cast<char *>(s.c_str()); });
  envp.push_back(nullptr);

  // Create the pipes, releasing whatever was already opened on failure.
  int stdoutPipe[2];
  if (pipe2(stdoutPipe, O_DIRECT) != 0)
    throw std::runtime_error("Unable to create pipe for stdout");

  int stderrPipe[2];
  if (pipe2(stderrPipe, O_DIRECT) != 0) {
    close(stdoutPipe[0]);
    close(stdoutPipe[1]);
    throw std::runtime_error("Unable to create pipe for stderr");
  }

  pid_t child = fork();
  if (child < 0) {
    close(stdoutPipe[0]);
    close(stdoutPipe[1]);
    close(stderrPipe[0]);
    close(stderrPipe[1]);
    throw std::runtime_error("Unable to fork child");
  }
  else if (child == 0) { // child
    while ((dup2(stdoutPipe[1], STDOUT_FILENO) == -1) && (errno == EINTR)) {
    }
    while ((dup2(stderrPipe[1], STDERR_FILENO) == -1) && (errno == EINTR)) {
    }
    close(stdoutPipe[0]);
    close(stderrPipe[0]);
    char **env = (envp.empty() ? nullptr : envp.data());
    auto res = execvpe(argv[0], argv.data(), env);

    // Only reached when exec failed. Capture errno before any stream I/O can
    // overwrite it, and leave with _exit() so the forked copy does not run
    // the parent's atexit handlers or static destructors.
    const int execErrno = errno;
    std::cout << res << std::endl;
    _exit(execErrno);
  }

  std::atomic<bool> reading = true;
  std::thread stdoutReader([&]() {
    while (reading)
      rec.outputLog.append(slurp(stdoutPipe[0]));
  });
  std::thread stderrReader([&]() {
    while (reading)
      rec.errorLog.append(slurp(stderrPipe[0]));
  });

  // Zero-initialised so si_pid is well defined even when the loop below never
  // runs because a stop was requested before we got here.
  siginfo_t childInfo{};
  while (running) {
    childInfo.si_pid = 0;
    waitid(P_PID, child, &childInfo, WEXITED | WNOHANG);
    if (childInfo.si_pid > 0) {
      break;
    }
    std::this_thread::sleep_for(250ms);
  }

  if (!running) {
    rec.executorLog = "Killed";
    // Send the kills until pid is dead
    while (kill(child, SIGKILL) != -1) {
      // Need to collect the child to avoid a zombie process
      waitid(P_PID, child, &childInfo, WEXITED | WNOHANG);
      std::this_thread::sleep_for(50ms);
    }
  }

  reading = false;

  rec.stopTime = Clock::now();
  if (childInfo.si_pid > 0) {
    rec.rc = childInfo.si_status;
  }
  else {
    rec.rc = -1;
  }

  stdoutReader.join();
  stderrReader.join();

  // A reader may have seen `reading` go false before picking up the child's
  // last writes; collect whatever is still sitting in the pipes.
  rec.outputLog.append(slurp(stdoutPipe[0]));
  rec.errorLog.append(slurp(stderrPipe[0]));

  // Release all four descriptors, including the parent's copies of the write
  // ends.
  close(stdoutPipe[0]);
  close(stdoutPipe[1]);
  close(stderrPipe[0]);
  close(stderrPipe[1]);

  return rec;
}
|
||||
|
||||
bool ForkingTaskExecutor::validateTaskParameters(const ConfigValues &job)
|
||||
{
|
||||
// command or commandString is required
|
||||
if (job.count("command")) {
|
||||
if (!std::holds_alternative<Command>(job.at("command")))
|
||||
throw std::runtime_error(R"(command must be an array of strings)");
|
||||
}
|
||||
else {
|
||||
if (job.count("commandString") == 0) {
|
||||
throw std::runtime_error(R"(command or commandString must be defined.)");
|
||||
}
|
||||
if (!std::holds_alternative<std::string>(job.at("commandString")))
|
||||
throw std::runtime_error(R"(commandString must be a string)");
|
||||
}
|
||||
|
||||
if (job.count("environment")) {
|
||||
if (!std::holds_alternative<Command>(job.at("environment")))
|
||||
throw std::runtime_error(R"(environment must be an array of strings)");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<daggy::ConfigValues> ForkingTaskExecutor::expandTaskParameters(
|
||||
const ConfigValues &job, const ConfigValues &expansionValues)
|
||||
{
|
||||
std::vector<ConfigValues> newValues;
|
||||
|
||||
const auto command = std::get<Command>(job.at("command"));
|
||||
for (const auto &expandedCommand :
|
||||
interpolateValues(command, expansionValues)) {
|
||||
ConfigValues newCommand{job};
|
||||
newCommand.at("command") = expandedCommand;
|
||||
newValues.emplace_back(newCommand);
|
||||
}
|
||||
|
||||
return newValues;
|
||||
}
|
||||
51
libdaggy/src/executors/task/NoopTaskExecutor.cpp
Normal file
51
libdaggy/src/executors/task/NoopTaskExecutor.cpp
Normal file
@@ -0,0 +1,51 @@
|
||||
#include <daggy/Utilities.hpp>
|
||||
#include <daggy/executors/task/NoopTaskExecutor.hpp>
|
||||
|
||||
namespace daggy::executors::task {
|
||||
/// Return an already-completed attempt without running anything.
///
/// The record has identical start and stop times, rc 0, and carries
/// `taskName` in all three log fields.
std::future<daggy::AttemptRecord> NoopTaskExecutor::execute(
    DAGRunID runID, const std::string &taskName, const Task &task)
{
  const auto now = Clock::now();
  AttemptRecord record{.startTime = now,
                       .stopTime = now,
                       .rc = 0,
                       .executorLog = taskName,
                       .outputLog = taskName,
                       .errorLog = taskName};

  std::promise<daggy::AttemptRecord> done;
  done.set_value(record);
  return done.get_future();
}
|
||||
|
||||
bool NoopTaskExecutor::validateTaskParameters(const ConfigValues &job)
|
||||
{
|
||||
auto it = job.find("command");
|
||||
if (it == job.end())
|
||||
throw std::runtime_error(R"(job does not have a "command" argument)");
|
||||
if (!std::holds_alternative<Command>(it->second))
|
||||
throw std::runtime_error(
|
||||
R"(taskParameter's "command" must be an array of strings)");
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<daggy::ConfigValues> NoopTaskExecutor::expandTaskParameters(
|
||||
const ConfigValues &job, const ConfigValues &expansionValues)
|
||||
{
|
||||
std::vector<ConfigValues> newValues;
|
||||
|
||||
const auto command = std::get<Command>(job.at("command"));
|
||||
for (const auto &expandedCommand :
|
||||
interpolateValues(command, expansionValues)) {
|
||||
ConfigValues newCommand{job};
|
||||
newCommand.at("command") = expandedCommand;
|
||||
newValues.emplace_back(newCommand);
|
||||
}
|
||||
|
||||
return newValues;
|
||||
}
|
||||
|
||||
/// Stop request for the no-op executor: both arguments are ignored and the
/// call always reports success.
bool NoopTaskExecutor::stop(DAGRunID runID, const std::string &taskName)
{
  constexpr bool stopped = true;
  return stopped;
}
|
||||
|
||||
} // namespace daggy::executors::task
|
||||
347
libdaggy/src/executors/task/SlurmTaskExecutor.cpp
Normal file
347
libdaggy/src/executors/task/SlurmTaskExecutor.cpp
Normal file
@@ -0,0 +1,347 @@
|
||||
#include <iomanip>
|
||||
#include <iterator>
|
||||
#include <mutex>
|
||||
#include <stdexcept>
|
||||
#ifdef DAGGY_ENABLE_SLURM
|
||||
#include <slurm/slurm.h>
|
||||
#include <string.h>
|
||||
#include <sys/resource.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include <csignal>
|
||||
#include <cstdlib>
|
||||
#include <daggy/Utilities.hpp>
|
||||
#include <daggy/executors/task/SlurmTaskExecutor.hpp>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <random>
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
namespace daggy::executors::task {
|
||||
/// Generate a random tag of `nChars` characters drawn from [0-9a-zA-Z].
///
/// The generator is thread_local: execute() calls this before it takes its
/// mutex, so a single generator shared by every thread would be mutated
/// concurrently without synchronisation.
///
/// @param nChars  Length of the tag (default 6); 0 yields an empty string.
/// @return The generated tag.
std::string getUniqueTag(size_t nChars = 6)
{
  static constexpr char alphabet[] =
      "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
  // sizeof counts the trailing NUL, hence -2 for the last valid index (61).
  static constexpr int lastIndex = sizeof(alphabet) - 2;

  thread_local std::mt19937 rng(std::random_device{}());
  std::uniform_int_distribution<int> dist(0, lastIndex);

  std::string result(nChars, '\0');
  for (char &c : result) {
    c = alphabet[dist(rng)];
  }
  return result;
}
|
||||
|
||||
/// Move the contents of file `fn` into `dest`, then delete the file.
///
/// When `fn` does not exist nothing happens and `dest` is left untouched.
void readAndClean(const fs::path &fn, std::string &dest)
{
  if (fs::exists(fn)) {
    std::string contents;
    {
      // Scoped so the stream is closed before the file is removed.
      std::ifstream input(fn);
      contents.assign(std::istreambuf_iterator<char>{input},
                      std::istreambuf_iterator<char>{});
    }
    fs::remove_all(fn);

    dest.swap(contents);
  }
}
|
||||
|
||||
/// Start the monitor thread and export the SLURM_* submission variables.
///
/// Sets SLURM_PRIO_PROCESS, SLURM_SUBMIT_DIR, SLURM_SUBMIT_HOST and
/// SLURM_UMASK in the process environment from the current priority, working
/// directory, host name and umask.
SlurmTaskExecutor::SlurmTaskExecutor()
  : running_(true)
  , monitorWorker_(&SlurmTaskExecutor::monitor, this)
{
  const std::string priority = std::to_string(getpriority(PRIO_PROCESS, 0));
  const std::string submitDir = fs::current_path().string();

  // gethostname() need not NUL-terminate a truncated name. The extra byte is
  // never written to, so the find() below always succeeds.
  const size_t MAX_HOSTNAME_LENGTH = 50;
  std::string submitHost(MAX_HOSTNAME_LENGTH + 1, '\0');
  gethostname(submitHost.data(), MAX_HOSTNAME_LENGTH);
  submitHost.resize(submitHost.find('\0'));

  uint32_t mask = umask(0);
  umask(mask); // Restore the old mask

  // Render the mask as a leading 0 followed by three octal digits.
  std::stringstream ss;
  ss << '0' << uint32_t{((mask >> 6) & 07)} << uint32_t{((mask >> 3) & 07)}
     << uint32_t{(mask & 07)};
  const std::string umaskValue = ss.str();

  // setenv() copies its arguments. putenv() would keep pointing at these
  // locals after they are destroyed.
  setenv("SLURM_PRIO_PROCESS", priority.c_str(), 1);
  setenv("SLURM_SUBMIT_DIR", submitDir.c_str(), 1);
  setenv("SLURM_SUBMIT_HOST", submitHost.c_str(), 1);
  setenv("SLURM_UMASK", umaskValue.c_str(), 1);
}
|
||||
|
||||
/// Stop the monitor thread, then fail every job that is still tracked.
///
/// Each remaining promise is completed with rc -1 and the executor log
/// "executor killed" before the job table is emptied.
SlurmTaskExecutor::~SlurmTaskExecutor()
{
  running_ = false;
  monitorWorker_.join();

  // Resolve the remaining futures
  std::lock_guard<std::mutex> guard(promiseGuard_);
  for (auto &entry : runningJobs_) {
    entry.second.prom.set_value(
        AttemptRecord{.rc = -1, .executorLog = "executor killed"});
  }
  runningJobs_.clear();
}
|
||||
|
||||
// Validates the job to ensure that all required values are set and are of
|
||||
// the right type,
|
||||
bool SlurmTaskExecutor::validateTaskParameters(const ConfigValues &job)
|
||||
{
|
||||
const std::unordered_set<std::string> requiredFields{
|
||||
"minCPUs", "minMemoryMB", "minTmpDiskMB", "priority",
|
||||
"timeLimitSeconds", "userID", "workDir", "tmpDir"};
|
||||
|
||||
for (const auto &requiredField : requiredFields) {
|
||||
if (job.count(requiredField) == 0) {
|
||||
throw std::runtime_error("Missing field " + requiredField);
|
||||
}
|
||||
}
|
||||
|
||||
// Require command or commandString
|
||||
if (job.count("command") + job.count("commandString") == 0)
|
||||
throw std::runtime_error(
|
||||
"Either command or commandString must be specified");
|
||||
|
||||
if (job.count("environment")) {
|
||||
if (!std::holds_alternative<Command>(job.at("environment")))
|
||||
throw std::runtime_error(R"(environment must be an array of strings)");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<ConfigValues> SlurmTaskExecutor::expandTaskParameters(
|
||||
const ConfigValues &job, const ConfigValues &expansionValues)
|
||||
{
|
||||
std::vector<ConfigValues> newValues;
|
||||
|
||||
const auto command = std::get<Command>(job.at("command"));
|
||||
for (const auto &expandedCommand :
|
||||
interpolateValues(command, expansionValues)) {
|
||||
ConfigValues newCommand{job};
|
||||
newCommand.at("command") = expandedCommand;
|
||||
newValues.emplace_back(newCommand);
|
||||
}
|
||||
|
||||
return newValues;
|
||||
}
|
||||
|
||||
/// Submit `task` to Slurm as a batch job and return a future for its result.
///
/// The command comes from `commandString` (split with std::quoted) when that
/// key is present, otherwise from the `command` array. stdout and stderr are
/// directed to "<taskName>_<random tag>.stdout/.stderr" under `tmpDir`. The
/// submitted job is recorded in `runningJobs_`; monitor() completes the
/// promise later.
///
/// @param runID     Run the task belongs to; stored so stop() can find the job.
/// @param taskName  Task name; used as the Slurm job name and for stop().
/// @param task      Task whose `job` values configure the submission.
/// @return Future that receives the attempt record.
/// @throws std::runtime_error when slurm_submit_batch_job() reports an error.
std::future<AttemptRecord> SlurmTaskExecutor::execute(
    DAGRunID runID, const std::string &taskName, const Task &task)
{
  const auto &job = task.job;
  const auto uniqueTaskName = taskName + "_" + getUniqueTag(6);

  fs::path tmpDir = std::get<std::string>(job.at("tmpDir"));
  std::string stdoutFile = (tmpDir / (uniqueTaskName + ".stdout")).string();
  std::string stderrFile = (tmpDir / (uniqueTaskName + ".stderr")).string();
  std::string workDir = std::get<std::string>(job.at("workDir"));

  // Populate the command
  Command command;
  if (job.count("commandString")) {
    std::stringstream ss;
    ss << std::get<std::string>(job.at("commandString"));
    std::string tok;
    while (ss >> std::quoted(tok)) {
      command.push_back(tok);
    }
  }
  else {
    const auto &cmd = std::get<Command>(job.at("command"));
    std::copy(cmd.begin(), cmd.end(), std::back_inserter(command));
  }

  // Convert command to argc / argv. The vector starts with one null slot and
  // ends with a null terminator; the pointers borrow from `command`.
  std::vector<char *> argv{nullptr};
  std::transform(
      command.begin(), command.end(), std::back_inserter(argv),
      [](const std::string &s) { return const_cast<char *>(s.c_str()); });
  argv.push_back(nullptr);

  // The environment always starts with one empty entry, followed by the
  // task's own entries when given.
  std::vector<std::string> env{""};
  if (const auto it = job.find("environment"); it != job.end()) {
    const auto &environment = std::get<Command>(it->second);
    std::copy(environment.begin(), environment.end(),
              std::back_inserter(env));
  }
  std::vector<char *> envp;
  std::transform(
      env.begin(), env.end(), std::back_inserter(envp),
      [](const std::string &s) { return const_cast<char *>(s.c_str()); });

  char script[] = "#!/bin/bash\n$@\n";
  char stdinFile[] = "/dev/null";

  // taken from slurm
  int error_code;
  job_desc_msg_t jd;
  submit_response_msg_t *resp_msg;

  slurm_init_job_desc_msg(&jd);
  jd.contiguous = 1;
  jd.name = const_cast<char *>(taskName.c_str());
  jd.min_cpus = std::stoi(std::get<std::string>(job.at("minCPUs")));

  jd.pn_min_memory = std::stoi(std::get<std::string>(job.at("minMemoryMB")));
  jd.pn_min_tmp_disk =
      std::stoi(std::get<std::string>(job.at("minTmpDiskMB")));
  jd.priority = std::stoi(std::get<std::string>(job.at("priority")));
  jd.shared = 0;
  // NOTE(review): Slurm appears to document job_desc_msg_t::time_limit in
  // minutes, while the value comes from "timeLimitSeconds" — confirm the unit.
  jd.time_limit =
      std::stoi(std::get<std::string>(job.at("timeLimitSeconds")));
  jd.min_nodes = 1;
  jd.user_id = std::stoi(std::get<std::string>(job.at("userID")));
  jd.argv = argv.data();
  jd.argc = argv.size();
  // TODO figure out the script to run
  jd.script = script;
  jd.std_in = stdinFile;
  jd.std_err = const_cast<char *>(stderrFile.c_str());
  jd.std_out = const_cast<char *>(stdoutFile.c_str());
  jd.work_dir = const_cast<char *>(workDir.c_str());

  jd.env_size = envp.size();
  jd.environment = envp.data();

  error_code = slurm_submit_batch_job(&jd, &resp_msg);
  if (error_code) {
    std::stringstream ss;
    ss << "Unable to submit slurm job: " << slurm_strerror(error_code);
    throw std::runtime_error(ss.str());
  }

  // Only the job id is needed from the response. The optional user message is
  // not recorded anywhere, so it is no longer streamed (it may be null).
  uint32_t jobID = resp_msg->job_id;
  slurm_free_submit_response_response_msg(resp_msg);

  std::lock_guard<std::mutex> lock(promiseGuard_);
  Job newJob{.prom{},
             .stdoutFile = stdoutFile,
             .stderrFile = stderrFile,
             .runID = runID,
             .taskName = taskName};
  auto fut = newJob.prom.get_future();
  runningJobs_.emplace(jobID, std::move(newJob));

  return fut;
}
|
||||
|
||||
/// Kill the Slurm job that was submitted for (`runID`, `taskName`).
///
/// The job table is scanned under its mutex for the first matching entry. If
/// one is found, SIGKILL is sent through slurm_kill_job() after the lock has
/// been released.
///
/// @return Always true, including when no matching job is tracked.
bool SlurmTaskExecutor::stop(DAGRunID runID, const std::string &taskName)
{
  // 0 doubles as "no match found".
  size_t target = 0;
  {
    std::lock_guard<std::mutex> guard(promiseGuard_);
    for (const auto &entry : runningJobs_) {
      const auto &info = entry.second;
      if (info.runID == runID and info.taskName == taskName) {
        target = entry.first;
        break;
      }
    }
  }

  if (target != 0) {
    // Send the kill message to slurm
    slurm_kill_job(target, SIGKILL, KILL_HURRY);
  }
  return true;
}
|
||||
|
||||
void SlurmTaskExecutor::monitor()
|
||||
{
|
||||
std::unordered_set<size_t> resolvedJobs;
|
||||
while (running_) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(promiseGuard_);
|
||||
for (auto &[jobID, job] : runningJobs_) {
|
||||
job_info_msg_t *jobStatus;
|
||||
int error_code =
|
||||
slurm_load_job(&jobStatus, jobID, SHOW_ALL | SHOW_DETAIL);
|
||||
if (error_code != SLURM_SUCCESS)
|
||||
continue;
|
||||
|
||||
uint32_t idx = jobStatus->record_count;
|
||||
if (idx == 0)
|
||||
continue;
|
||||
idx--;
|
||||
const slurm_job_info_t &jobInfo = jobStatus->job_array[idx];
|
||||
AttemptRecord record;
|
||||
switch (jobInfo.job_state) {
|
||||
case JOB_PENDING:
|
||||
case JOB_SUSPENDED:
|
||||
case JOB_RUNNING:
|
||||
continue;
|
||||
// Job has finished
|
||||
case JOB_COMPLETE: /* completed execution successfully */
|
||||
record.rc = jobInfo.exit_code;
|
||||
break;
|
||||
case JOB_FAILED: /* completed execution unsuccessfully */
|
||||
record.rc = jobInfo.exit_code;
|
||||
record.executorLog = "Script errored.\n";
|
||||
break;
|
||||
case JOB_CANCELLED: /* cancelled by user */
|
||||
record.rc = 9; // matches SIGKILL
|
||||
record.executorLog = "Job cancelled by user.\n";
|
||||
break;
|
||||
case JOB_TIMEOUT: /* terminated on reaching time limit */
|
||||
record.rc = jobInfo.exit_code;
|
||||
record.executorLog = "Job exceeded time limit.\n";
|
||||
break;
|
||||
case JOB_NODE_FAIL: /* terminated on node failure */
|
||||
record.rc = jobInfo.exit_code;
|
||||
record.executorLog = "Node failed during execution\n";
|
||||
break;
|
||||
case JOB_PREEMPTED: /* terminated due to preemption */
|
||||
record.rc = jobInfo.exit_code;
|
||||
record.executorLog = "Job terminated due to pre-emption.\n";
|
||||
break;
|
||||
case JOB_BOOT_FAIL: /* terminated due to node boot failure */
|
||||
record.rc = jobInfo.exit_code;
|
||||
record.executorLog =
|
||||
"Job failed to run due to failure of compute node to "
|
||||
"boot.\n";
|
||||
break;
|
||||
case JOB_DEADLINE: /* terminated on deadline */
|
||||
record.rc = jobInfo.exit_code;
|
||||
record.executorLog = "Job terminated due to deadline.\n";
|
||||
break;
|
||||
case JOB_OOM: /* experienced out of memory error */
|
||||
record.rc = jobInfo.exit_code;
|
||||
record.executorLog = "Job terminated due to out-of-memory.\n";
|
||||
break;
|
||||
}
|
||||
slurm_free_job_info_msg(jobStatus);
|
||||
|
||||
readAndClean(job.stdoutFile, record.outputLog);
|
||||
readAndClean(job.stderrFile, record.errorLog);
|
||||
|
||||
job.prom.set_value(std::move(record));
|
||||
resolvedJobs.insert(jobID);
|
||||
}
|
||||
|
||||
for (const auto &jobID : resolvedJobs) {
|
||||
runningJobs_.extract(jobID);
|
||||
}
|
||||
}
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::seconds(1));
|
||||
}
|
||||
}
|
||||
} // namespace daggy::executors::task
|
||||
#endif
|
||||
Reference in New Issue
Block a user