Making DaggyTaskRunner block until a runner is available
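The change removes the queue-and-dispatch path (QueuedTask, queuedTasks_, dispatchQueuedTasks() and its dispatchWorker_ thread) and instead has execute() block on a condition variable (runnersCV_) until some registered runner reports enough free cores and memory; that capacity is reserved under runnersGuard_, the task is POSTed to the chosen runner, and monitor() returns the capacity and notifies waiters as tasks finish. Below is a minimal, self-contained sketch of that blocking pattern only; BlockingDispatcher, acquire(), release(), addRunner(), and the runner URL are illustrative stand-ins, not the project's actual API.

#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>

struct Capacity {
  int cores = 0;
  int memoryMB = 0;
};

class BlockingDispatcher {
public:
  // Register a runner and wake anyone blocked waiting for capacity.
  void addRunner(const std::string &url, Capacity cap) {
    std::lock_guard<std::mutex> lock(runnersGuard_);
    runners_[url] = cap;
    runnersCV_.notify_all();
  }

  // Block the caller until some runner can hold `needed`, then reserve it.
  std::string acquire(const Capacity &needed) {
    std::unique_lock<std::mutex> lock(runnersGuard_);
    std::string chosen;
    runnersCV_.wait(lock, [&] {
      for (auto &[runner, cap] : runners_) {
        if (cap.cores >= needed.cores && cap.memoryMB >= needed.memoryMB) {
          chosen = runner;
          return true;
        }
      }
      return false;
    });
    runners_[chosen].cores -= needed.cores;
    runners_[chosen].memoryMB -= needed.memoryMB;
    return chosen;
  }

  // Called when a task finishes: give the capacity back and wake a waiter.
  void release(const std::string &runner, const Capacity &used) {
    std::lock_guard<std::mutex> lock(runnersGuard_);
    runners_[runner].cores += used.cores;
    runners_[runner].memoryMB += used.memoryMB;
    runnersCV_.notify_one();
  }

private:
  std::mutex runnersGuard_;
  std::condition_variable runnersCV_;
  std::unordered_map<std::string, Capacity> runners_;
};

int main() {
  BlockingDispatcher d;

  // acquire() blocks here: no runner with spare capacity is known yet.
  std::thread submitter([&] {
    auto runner = d.acquire(Capacity{2, 4096});
    std::cout << "dispatched to " << runner << "\n";
    d.release(runner, Capacity{2, 4096});
  });

  std::this_thread::sleep_for(std::chrono::milliseconds(100));
  d.addRunner("http://runner-a:8080", Capacity{4, 8192});  // unblocks acquire()
  submitter.join();
}

One consequence of the new design is that execute() now blocks its caller until capacity appears (the condition-variable wait has no timeout), whereas the old dispatchQueuedTasks() thread polled every 50 ms and simply left the task queued.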
@@ -142,6 +142,9 @@ namespace daggy::daggyr {
   auto runID = request.param(":runID").as<DAGRunID>();
   auto taskName = request.param(":taskName").as<std::string>();
 
+  std::cout << "Received request for " << runID << " / " << taskName
+            << std::endl;
+
   Capacity resourcesUsed;
   Task task;
   try {
@@ -57,7 +57,6 @@ namespace daggy::executors::task {
 
   private:
    void monitor();
-   void dispatchQueuedTasks();
 
    struct RunningTask
    {
@@ -68,31 +67,16 @@
      daggy_runner::Capacity resources;
    };
 
-   struct QueuedTask
-   {
-     Task task;
-     RunningTask rt;
-   };
-
-   std::mutex queuedGuard_;
-   std::condition_variable queuedCV_;
-   std::deque<QueuedTask> queuedTasks_;
-
    // Resolves jobs through polling
    std::atomic<bool> running_;
    bool promptTask_;
    std::thread monitorWorker_;
-   std::thread dispatchWorker_;
 
-   struct RunnerCapacity
-   {
-     daggy_runner::Capacity current;
-     daggy_runner::Capacity total;
-   };
-   RunnerCapacity getRunnerCapacity(const std::string &runnerURL);
+   daggy_runner::Capacity getRunnerCapacity(const std::string &runnerURL);
 
    std::mutex runnersGuard_;
-   std::unordered_map<std::string, RunnerCapacity> runners_;
+   std::condition_variable runnersCV_;
+   std::unordered_map<std::string, daggy_runner::Capacity> runners_;
 
    std::mutex rtGuard_;
    std::unordered_map<std::pair<DAGRunID, std::string>, RunningTask>
@@ -80,7 +80,6 @@ DaggyRunnerTaskExecutor::DaggyRunnerTaskExecutor()
   : running_(true)
   , promptTask_(false)
   , monitorWorker_(&DaggyRunnerTaskExecutor::monitor, this)
-  , dispatchWorker_(&DaggyRunnerTaskExecutor::dispatchQueuedTasks, this)
 {
 }
 
@@ -88,7 +87,6 @@ DaggyRunnerTaskExecutor::~DaggyRunnerTaskExecutor()
 {
   running_ = false;
   monitorWorker_.join();
-  dispatchWorker_.join();
 }
 
 std::string DaggyRunnerTaskExecutor::description() const
@@ -153,79 +151,44 @@ TaskFuture DaggyRunnerTaskExecutor::execute(DAGRunID runID,
                                             const Task &task)
 {
   auto taskUsed = capacityFromTask(task);
-  QueuedTask qt{.task = task,
-                .rt{.fut = std::make_shared<Future<AttemptRecord>>(),
-                    .runID = runID,
-                    .taskName = taskName,
-                    .resources = taskUsed}};
-  auto fut = qt.rt.fut;
+
+  std::string exe_runner;
+  Capacity *exe_capacity;
+
+  // Block until a host is found
+  std::unique_lock<std::mutex> lock(runnersGuard_);
+  // Wait for a host to be available
+  runnersCV_.wait(lock, [&] {
+    for (auto &[runner, capacity] : runners_) {
+      if (capacity.cores >= taskUsed.cores and
+          capacity.memoryMB >= taskUsed.memoryMB) {
+        exe_runner = runner;
+        exe_capacity = &capacity;
+        return true;
+      }
+    }
+    return false;
+  });
+  exe_capacity->cores -= taskUsed.cores;
+  exe_capacity->memoryMB -= taskUsed.memoryMB;
+
+  std::stringstream ss;
+  ss << exe_runner << "/v1/task/" << runID << "/" << taskName;
+  auto url = ss.str();
+
+  const auto response = HTTP_REQUEST(url, taskToJSON(task), "POST");
+
+  RunningTask rt{.fut = std::make_shared<Future<AttemptRecord>>(),
+                 .runID = runID,
+                 .taskName = taskName,
+                 .resources = taskUsed};
+
+  auto fut = rt.fut;
   {
-    std::lock_guard<std::mutex> lock(queuedGuard_);
-    queuedTasks_.emplace_back(std::move(qt));
-  }
-  promptTask_ = true;
-  queuedCV_.notify_one();
-  return fut;
-}
-
-void DaggyRunnerTaskExecutor::dispatchQueuedTasks()
-{
-  while (running_) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(50));
-    std::vector<std::string> runners;
-    std::optional<QueuedTask> oqt;
-    {
-      // Wait for either a new task, or an existing task to finish
-      std::unique_lock<std::mutex> lock(queuedGuard_);
-      queuedCV_.wait(lock, [&] { return !running_ or !queuedTasks_.empty(); });
-      promptTask_ = false;
-      // Check to see if there's a worker available
-      if (queuedTasks_.empty())
-        continue;
-      const auto &fqt = queuedTasks_.front();
-      std::lock_guard<std::mutex> rlock(runnersGuard_);
-      for (auto &[runner, caps] : runners_) {
-        if (caps.total.cores == 0) {
-          caps = getRunnerCapacity(runner);
-        }
-        if (fqt.rt.resources.cores <= caps.current.cores and
-            fqt.rt.resources.memoryMB <= caps.current.memoryMB) {
-          runners.push_back(runner);
-        }
-      }
-
-      if (runners.empty())
-        continue;
-
-      oqt.emplace(std::move(queuedTasks_.front()));
-      queuedTasks_.pop_front();
-    }
-
-    auto &qt = oqt.value();
-
-    for (const auto &runner : runners) {
-      std::stringstream ss;
-      ss << runner << "/v1/task/" << qt.rt.runID << "/" << qt.rt.taskName;
-      auto url = ss.str();
-
-      const auto response = HTTP_REQUEST(url, taskToJSON(qt.task), "POST");
-      if (response.code != HTTPCode::Ok) {
-        std::cout << response.code << " : " << response.body << std::endl;
-        continue;
-      }
-
-      // Subtract the capacity from the runner
-      std::lock_guard<std::mutex> rlock(runnersGuard_);
-      auto &cur = runners_.at(runner).current;
-      cur.cores -= qt.rt.resources.cores;
-      cur.memoryMB -= qt.rt.resources.memoryMB;
-      break;
-    }
-
     std::lock_guard<std::mutex> lock(rtGuard_);
-    runningTasks_.emplace(std::make_pair(qt.rt.runID, qt.rt.taskName),
-                          std::move(qt.rt));
+    runningTasks_.emplace(std::make_pair(runID, taskName), std::move(rt));
   }
+  return fut;
 }
 
 bool DaggyRunnerTaskExecutor::stop(DAGRunID runID, const std::string &taskName)
@@ -233,18 +196,16 @@ bool DaggyRunnerTaskExecutor::stop(DAGRunID runID, const std::string &taskName)
   return true;
 }
 
-DaggyRunnerTaskExecutor::RunnerCapacity
-DaggyRunnerTaskExecutor::getRunnerCapacity(const std::string &runnerURL)
+daggy_runner::Capacity DaggyRunnerTaskExecutor::getRunnerCapacity(
+    const std::string &runnerURL)
 {
   // Try and get the capacity
   const auto &[code, doc] = JSON_HTTP_REQUEST(runnerURL + "/v1/capacity");
   if (code != HTTPCode::Ok) {
-    return RunnerCapacity{};
+    return Capacity{};
   }
 
-  return DaggyRunnerTaskExecutor::RunnerCapacity{
-      .current = capacityFromJSON(doc["current"]),
-      .total = capacityFromJSON(doc["total"])};
+  return capacityFromJSON(doc["total"]);
 }
 
 void DaggyRunnerTaskExecutor::addRunner(const std::string &url)
@@ -255,7 +216,7 @@ void DaggyRunnerTaskExecutor::addRunner(const std::string &url)
 
 void DaggyRunnerTaskExecutor::monitor()
 {
-  std::unordered_map<std::string, RunnerCapacity> runners;
+  std::unordered_map<std::string, Capacity> runners;
 
   while (running_) {
     std::this_thread::sleep_for(std::chrono::milliseconds(250));
@@ -305,14 +266,14 @@ void DaggyRunnerTaskExecutor::monitor()
                                  task["taskName"].GetString());
         auto it = taskResources.find(tid);
         if (it != taskResources.end()) {
-          caps.current.cores += it->second.cores;
-          caps.current.memoryMB += it->second.memoryMB;
+          caps.cores += it->second.cores;
+          caps.memoryMB += it->second.memoryMB;
         }
 
         auto attempt = attemptRecordFromJSON(task["attempt"]);
         resolvedJobs.emplace(tid, attemptRecordFromJSON(task["attempt"]));
         promptTask_ = true;
-        queuedCV_.notify_one();
+        runnersCV_.notify_one();
       }
     }
   }
@@ -119,13 +119,6 @@ void SSHTaskExecutor::monitor()
       if (attempt.rc == 255) {
         --rt.sshRetries;
         if (rt.sshRetries > 0) {
-          /*
-          std::cout << "Resubmitting: " << rt.sshRetries;
-          for (const auto &i : std::get<std::vector<std::string>>(
-                   rt.task.job.at("command")))
-            std::cout << " " << i;
-          std::cout << std::endl;
-          */
           rt.feFuture = fe_.execute(rt.runID, rt.taskName, rt.task);
           continue;
         }