Fix a number of scaling issues:

- A missed file-descriptor close made ForkingTaskExecutor
  silently die after running out of FDs
- Tightened lock scopes to prevent HTTP timeouts
- Simplified the thread pool

Ian Roddis
2022-01-10 13:02:10 -04:00
parent efd4078f70
commit 53308c063d
8 changed files with 96 additions and 140 deletions

View File

@@ -116,6 +116,7 @@ void daemonize()
 }
 
 namespace dl = daggy::loggers::dag_run;
+std::ofstream ofh;
 
 std::unique_ptr<dl::DAGRunLogger> loggerFactory(const rj::Value &config)
 {
@@ -138,7 +139,7 @@ std::unique_ptr<dl::DAGRunLogger> loggerFactory(const rj::Value &config)
     if (fn == "-")
         return std::make_unique<dl::OStreamLogger>(std::cout);
-    std::ofstream ofh(logConfig["file"].GetString());
+    ofh.open(logConfig["file"].GetString());
     return std::make_unique<dl::OStreamLogger>(ofh);
 }
 }
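
The change above is a lifetime fix, not a style tweak: dl::OStreamLogger apparently holds a reference to the stream it is given, and the old function-local std::ofstream was destroyed as soon as loggerFactory returned, leaving the logger writing through a dangling reference. Hoisting ofh to file scope keeps the stream alive for the life of the daemon. A minimal sketch of the failure mode, using an invented Logger stand-in rather than the daggy classes:

#include <fstream>
#include <memory>
#include <ostream>
#include <string>

// Stand-in for dl::OStreamLogger: stores a reference, does not own it.
struct Logger {
    explicit Logger(std::ostream &os) : os_(os) {}
    void log(const std::string &msg) { os_ << msg << '\n'; }
    std::ostream &os_;
};

std::unique_ptr<Logger> makeLoggerBroken(const std::string &path) {
    std::ofstream ofh(path);              // local stream...
    return std::make_unique<Logger>(ofh); // ...destroyed here: dangling ref
}

std::ofstream g_ofh; // file scope, as in the commit: outlives the logger

std::unique_ptr<Logger> makeLoggerFixed(const std::string &path) {
    g_ofh.open(path);
    return std::make_unique<Logger>(g_ofh); // reference stays valid
}

A file-scope stream is a blunt instrument (it limits the factory to one file logger at a time); having the logger own its stream, e.g. via a std::unique_ptr<std::ofstream> member, would be the more general fix.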

View File

@@ -161,8 +161,6 @@ namespace daggy::daggyr {
                 .resourcesUsed = resourcesUsed});
         }
 
-        std::cout << "Enqueuing " << runID << " / " << taskName << std::endl;
-
         response.send(Pistache::Http::Code::Ok, "");
     }
@@ -202,8 +200,6 @@ namespace daggy::daggyr {
                 curCapacity_.cores += it->resourcesUsed.cores;
                 curCapacity_.memoryMB += it->resourcesUsed.memoryMB;
             }
-            std::cout << "Resolved " << it->runID << " / " << it->taskName
-                      << std::endl;
             it = pending_.erase(it);
         }
         else {

View File

@@ -4,6 +4,7 @@
 #include <condition_variable>
 #include <functional>
 #include <future>
+#include <iostream>
 #include <list>
 #include <memory>
 #include <queue>
@@ -13,65 +14,11 @@
 using namespace std::chrono_literals;
 
 namespace daggy {
-    /*
-      A Task Queue is a collection of async tasks to be executed by the
-      thread pool. Using individual task queues allows for a rough QoS
-      when a single thread may be submitting batches of requests --
-      one producer won't starve out another, but all tasks will be run
-      as quickly as possible.
-    */
-    class TaskQueue
-    {
-    public:
-        template <class F, class... Args>
-        decltype(auto) addTask(F &&f, Args &&...args)
-        {
-            // using return_type = std::invoke_result<F, Args...>::type;
-            using return_type = std::invoke_result_t<F, Args...>;
-            std::packaged_task<return_type()> task(
-                std::bind(std::forward<F>(f), std::forward<Args>(args)...));
-            std::future<return_type> res = task.get_future();
-            {
-                std::lock_guard<std::mutex> guard(mtx_);
-                tasks_.emplace(std::move(task));
-            }
-            return res;
-        }
-
-        std::packaged_task<void()> pop()
-        {
-            std::lock_guard<std::mutex> guard(mtx_);
-            auto task = std::move(tasks_.front());
-            tasks_.pop();
-            return task;
-        }
-
-        size_t size()
-        {
-            std::lock_guard<std::mutex> guard(mtx_);
-            return tasks_.size();
-        }
-
-        bool empty()
-        {
-            std::lock_guard<std::mutex> guard(mtx_);
-            return tasks_.empty();
-        }
-
-    private:
-        std::queue<std::packaged_task<void()>> tasks_;
-        std::mutex mtx_;
-    };
-
     class ThreadPool
     {
     public:
         explicit ThreadPool(size_t nWorkers)
-            : tqit_(taskQueues_.begin())
-            , stop_(false)
+            : stop_(false)
             , drain_(false)
         {
             resize(nWorkers);
@@ -98,7 +45,7 @@ namespace daggy {
             while (true) {
                 {
                     std::lock_guard<std::mutex> guard(mtx_);
-                    if (taskQueues_.empty())
+                    if (tasks_.empty())
                         break;
                 }
                 std::this_thread::sleep_for(250ms);
@@ -118,25 +65,18 @@ namespace daggy {
             for (size_t i = 0; i < nWorkers; ++i)
                 workers_.emplace_back([&] {
+                    std::packaged_task<void()> task;
                     while (true) {
-                        std::packaged_task<void()> task;
                         {
                             std::unique_lock<std::mutex> lock(mtx_);
-                            cv_.wait(lock, [&] { return stop_ || !taskQueues_.empty(); });
-                            if (taskQueues_.empty()) {
+                            cv_.wait(lock, [&] { return stop_ || !tasks_.empty(); });
+                            if (tasks_.empty()) {
                                 if (stop_)
                                     return;
                                 continue;
                             }
-                            if (tqit_ == taskQueues_.end())
-                                tqit_ = taskQueues_.begin();
-                            task = (*tqit_)->pop();
-                            if ((*tqit_)->empty()) {
-                                tqit_ = taskQueues_.erase(tqit_);
-                            }
-                            else {
-                                tqit_++;
-                            }
+                            task.swap(tasks_.front());
+                            tasks_.pop();
                         }
                         task();
                     }
@@ -148,25 +88,18 @@ namespace daggy {
         {
             if (drain_)
                 throw std::runtime_error("Unable to add task to draining pool");
-            auto tq = std::make_shared<TaskQueue>();
-            auto fut = tq->addTask(f, args...);
+            using return_type = std::invoke_result_t<F, Args...>;
+            std::packaged_task<return_type()> task(
+                std::bind(std::forward<F>(f), std::forward<Args>(args)...));
+            std::future<return_type> res = task.get_future();
             {
                 std::lock_guard<std::mutex> guard(mtx_);
-                taskQueues_.push_back(tq);
+                tasks_.emplace(std::move(task));
             }
             cv_.notify_one();
-            return fut;
-        }
-
-        void addTasks(std::shared_ptr<TaskQueue> &tq)
-        {
-            if (drain_)
-                throw std::runtime_error("Unable to add task to draining pool");
-            std::lock_guard<std::mutex> guard(mtx_);
-            taskQueues_.push_back(tq);
-            cv_.notify_one();
+            return res;
         }
 
         size_t size() const
@@ -174,12 +107,17 @@ namespace daggy {
         {
             return workers_.size();
         }
 
+        size_t queueSize()
+        {
+            std::lock_guard<std::mutex> lock(mtx_);
+            return tasks_.size();
+        }
+
     private:
         // need to keep track of threads, so we can join them
        std::vector<std::thread> workers_;
 
        // the task queue
-        std::list<std::shared_ptr<TaskQueue>> taskQueues_;
-        std::list<std::shared_ptr<TaskQueue>>::iterator tqit_;
+        std::queue<std::packaged_task<void()>> tasks_;
 
        // synchronization
        std::mutex mtx_;
@@ -187,5 +125,4 @@ namespace daggy {
        std::atomic<bool> stop_;
        std::atomic<bool> drain_;
    };
} // namespace daggy
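
The commit's "simplified thread pool" bullet lands here: the per-producer TaskQueue layer and its round-robin iterator are gone, replaced by a single mutex-guarded std::queue of type-erased std::packaged_task<void()>, with addTask doing the packaging that TaskQueue::addTask used to do. A minimal usage sketch, assuming a header name of daggy/threadpool.hpp (a guess) and that the pool drains and joins its workers on destruction, as the stop_/drain_ flags suggest:

#include <future>
#include <iostream>
#include <vector>

#include "daggy/threadpool.hpp" // assumed path for the header above

int main() {
    daggy::ThreadPool pool(4); // four worker threads

    // Each submission returns a std::future typed from the callable.
    std::vector<std::future<int>> results;
    for (int i = 0; i < 8; ++i)
        results.emplace_back(pool.addTask([i] { return i * i; }));

    for (auto &f : results)
        std::cout << f.get() << '\n'; // blocks until a worker ran the task
}

The queueSize() accessor added at the bottom of the header gives callers a locked snapshot of the backlog, which is useful for tests and capacity checks.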

View File

@@ -51,7 +51,7 @@ namespace daggy::executors::task {
         bool stop(DAGRunID runID, const std::string &taskName) override;
 
-        std::string description() const;
+        std::string description() const override;
 
         void addRunner(const std::string &url);
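
Adding override to description() is a small but real fix: if the signature ever drifts from the base-class virtual, the compiler now rejects the mismatch instead of silently creating a second, never-called function. A generic illustration with invented names:

#include <string>

struct Executor {                      // stand-in base class
    virtual std::string description() const = 0;
    virtual ~Executor() = default;
};

struct RunnerExecutor : Executor {     // stand-in derived class
    // Without 'override', a signature drift in the base (say, dropping
    // 'const') would silently turn this into an unrelated function;
    // with it, the compiler refuses to build.
    std::string description() const override { return "runner executor"; }
};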

View File

@@ -111,6 +111,7 @@ namespace daggy {
                 runningTasks_.emplace(taskName, std::move(fut));
             }
             catch (std::exception &e) {
+                std::cout << "Unable to execute task: " << e.what() << std::endl;
             }
             ++nRunningTasks_;

View File

@@ -154,8 +154,6 @@ std::future<AttemptRecord> DaggyRunnerTaskExecutor::execute(
     // Capacities for a runner can be negative, meaning that they're currently
     // oversubscribed.
     std::vector<std::pair<std::string, double>> impacts;
-    std::string runner;
-
     {
         std::lock_guard<std::mutex> lock(runnersGuard_);
         for (auto &[runner, caps] : runners_) {
@@ -191,28 +189,43 @@ std::future<AttemptRecord> DaggyRunnerTaskExecutor::execute(
             prom.set_value(std::move(record));
             return fut;
         }
+    }
 
     std::sort(impacts.begin(), impacts.end(),
-              [](const auto &a, const auto &b) { return a.second < b.second; });
+              [](const auto &a, const auto &b) { return a.second > b.second; });
 
-    runner = impacts.back().first;
+    std::string submitted_runner;
+    for (const auto &[runner, _] : impacts) {
         auto &caps = runners_.at(runner);
         caps.current.cores -= taskUsed.cores;
         caps.current.memoryMB -= taskUsed.memoryMB;
-    }
 
-    std::stringstream ss;
-    ss << runner << "/v1/task/" << runID << "/" << taskName;
-    auto url = ss.str();
+        std::stringstream ss;
+        ss << runner << "/v1/task/" << runID << "/" << taskName;
+        auto url = ss.str();
 
-    const auto response = HTTP_REQUEST(url, taskToJSON(task), "POST");
-    if (response.code != HTTPCode::Ok)
-        throw std::runtime_error("Unable to submit task: " + response.body);
+        const auto response = HTTP_REQUEST(url, taskToJSON(task), "POST");
+        if (response.code != HTTPCode::Ok) {
+            // throw std::runtime_error("Unable to submit task: " + response.body);
+            continue;
+        }
+        submitted_runner = runner;
+        break;
+    }
+
+    if (submitted_runner.empty()) {
+        std::promise<AttemptRecord> prom;
+        auto fut = prom.get_future();
+        AttemptRecord record{.rc = -1,
+                             .executorLog = "No runners available for execution"};
+        prom.set_value(std::move(record));
+        return fut;
+    }
 
     RunningTask rt{.prom{},
                    .runID = runID,
                    .taskName = taskName,
-                   .runnerURL = runner,
+                   .runnerURL = submitted_runner,
                    .retries = 3,
                    .resources = taskUsed};
@@ -250,6 +263,8 @@ void DaggyRunnerTaskExecutor::addRunner(const std::string &url)
 void DaggyRunnerTaskExecutor::monitor()
 {
+    std::unordered_map<std::string, RunnerCapacity> runners;
+
     while (running_) {
         std::unordered_map<std::pair<DAGRunID, std::string>,
                            std::optional<AttemptRecord>>
@@ -258,6 +273,7 @@ void DaggyRunnerTaskExecutor::monitor()
         std::unordered_map<std::pair<DAGRunID, std::string>, Capacity>
             taskResources;
 
+        // Cache what's running now
         {
             std::lock_guard<std::mutex> lock(rtGuard_);
             for (const auto &[tid, info] : runningTasks_) {
@@ -267,39 +283,40 @@ void DaggyRunnerTaskExecutor::monitor()
         {
             std::lock_guard<std::mutex> lock(runnersGuard_);
-            for (auto &[runnerURL, caps] : runners_) {
-                rj::Document doc;
-                try {
-                    auto [code, json] = JSON_HTTP_REQUEST(runnerURL + "/v1/poll");
-                    if (code != HTTPCode::Ok)
-                        continue;
-                    doc.Swap(json);
-                }
-                catch (std::exception &e) {
-                    std::cout << "Curl failed for runner " << runnerURL << ": "
-                              << e.what() << std::endl;
-                }
+            runners = runners_;
+        }
 
-                const auto tasks = doc.GetArray();
-                for (size_t idx = 0; idx < tasks.Size(); ++idx) {
-                    const auto &task = tasks[idx];
-                    if (task["state"] == "PENDING") {
-                        resolvedJobs.emplace(std::make_pair(task["runID"].GetInt64(),
-                                                            task["taskName"].GetString()),
-                                             std::nullopt);
-                    }
-                    else {
-                        auto tid = std::make_pair(task["runID"].GetInt64(),
-                                                  task["taskName"].GetString());
-                        auto it = taskResources.find(tid);
-                        if (it != taskResources.end()) {
-                            const auto &res = taskResources.at(tid);
-                            caps.current.cores += res.cores;
-                            caps.current.memoryMB += res.memoryMB;
-                        }
-                        resolvedJobs.emplace(tid, attemptRecordFromJSON(task["attempt"]));
-                    }
+        for (auto &[runnerURL, caps] : runners) {
+            rj::Document doc;
+            try {
+                auto [code, json] = JSON_HTTP_REQUEST(runnerURL + "/v1/poll");
+                if (code != HTTPCode::Ok)
+                    continue;
+                doc.Swap(json);
+            }
+            catch (std::exception &e) {
+                continue;
+            }
+
+            const auto tasks = doc.GetArray();
+            for (size_t idx = 0; idx < tasks.Size(); ++idx) {
+                const auto &task = tasks[idx];
+                auto tid = std::make_pair(task["runID"].GetInt64(),
+                                          task["taskName"].GetString());
+                if (task["state"] == "PENDING") {
+                    resolvedJobs.emplace(tid, std::nullopt);
+                }
+                else {
+                    auto it = taskResources.find(tid);
+                    if (it != taskResources.end()) {
+                        const auto &res = taskResources.at(tid);
+                        caps.current.cores += res.cores;
+                        caps.current.memoryMB += res.memoryMB;
+                    }
+                    auto attempt = attemptRecordFromJSON(task["attempt"]);
+                    resolvedJobs.emplace(tid, std::move(attempt));
                 }
             }
         }
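
This monitor rewrite is the "tightened lock scopes" bullet: previously runnersGuard_ was held across a whole round of JSON_HTTP_REQUEST polls, so one slow or dead runner could block every thread that needed the runner map long enough to trip HTTP timeouts. The new code holds the lock only to copy runners_ into a local map and polls the copy. A stripped-down sketch of the pattern, with invented names:

#include <map>
#include <mutex>
#include <string>

std::mutex guard;                  // protects `shared`
std::map<std::string, int> shared; // stand-in for runners_

void pollAll() {
    // 1. Copy the shared state; hold the lock only for the copy.
    std::map<std::string, int> snapshot;
    {
        std::lock_guard<std::mutex> lock(guard);
        snapshot = shared;
    }
    // 2. Do the slow network I/O against the snapshot with the lock
    //    released, so writers never wait behind a slow or dead peer.
    for (auto &[url, caps] : snapshot) {
        // slowHttpPoll(url); // hypothetical network call
    }
}

The visible tradeoff: capacity adjustments made while polling now land on the snapshot rather than on runners_ itself, so they only last until the next copy.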

View File

@@ -97,7 +97,7 @@ std::future<daggy::AttemptRecord> ForkingTaskExecutor::execute(
     std::lock_guard<std::mutex> lock(taskControlsGuard_);
     auto [it, ins] = taskControls_.emplace(key, true);
     auto &running = it->second;
-    return tp_.addTask([this, task, &running, key]() {
+    return tp_.addTask([this, task, taskName, &running, key]() {
         auto ret = this->runTask(task, running);
         std::lock_guard<std::mutex> lock(this->taskControlsGuard_);
         this->taskControls_.extract(key);
@@ -147,12 +147,16 @@ daggy::AttemptRecord ForkingTaskExecutor::runTask(const Task &task,
     // Create the pipe
     int stdoutPipe[2];
     int pipeRC = pipe2(stdoutPipe, O_DIRECT);
-    if (pipeRC != 0)
+    if (pipeRC != 0) {
+        std::cerr << "Unable to create pipe for stdout: " << pipeRC << std::endl;
         throw std::runtime_error("Unable to create pipe for stdout");
+    }
 
     int stderrPipe[2];
     pipeRC = pipe2(stderrPipe, O_DIRECT);
-    if (pipeRC != 0)
+    if (pipeRC != 0) {
+        std::cerr << "Unable to create pipe for stderr" << std::endl;
         throw std::runtime_error("Unable to create pipe for stderr");
+    }
 
     pid_t child = fork();
     if (child < 0) {
@@ -187,7 +191,7 @@ daggy::AttemptRecord ForkingTaskExecutor::runTask(const Task &task,
         if (childInfo.si_pid > 0) {
             break;
         }
-        std::this_thread::sleep_for(250ms);
+        std::this_thread::sleep_for(100ms);
     }
 
     if (!running) {
@@ -215,6 +219,8 @@ daggy::AttemptRecord ForkingTaskExecutor::runTask(const Task &task,
     close(stdoutPipe[0]);
     close(stderrPipe[0]);
+    close(stdoutPipe[1]);
+    close(stderrPipe[1]);
 
     return rec;
 }
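
The two added close() calls are the headline fix from the commit message: each runTask created two pipes (four descriptors) but only ever closed the read ends, so every executed task leaked both write ends until the process exhausted its FD limit and pipe2/fork started failing. A self-contained sketch of the leak, with error handling trimmed; this is the generic pattern, not the daggy code:

#include <sys/wait.h>
#include <unistd.h>

// One task execution; before the fix the parent leaked a descriptor
// per pipe, and the executor died once the process FD limit was hit.
void runOnce() {
    int out[2];
    if (pipe(out) != 0)        // out[0]: read end, out[1]: write end
        return;

    if (fork() == 0) {         // child: stdout goes into the pipe
        dup2(out[1], STDOUT_FILENO);
        close(out[0]);
        close(out[1]);
        _exit(0);
    }

    close(out[1]);             // the previously missing close: without it
                               // the parent keeps one write end per task
    // ... read child output from out[0] until EOF ...
    close(out[0]);
    wait(nullptr);
}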

View File

@@ -15,14 +15,12 @@ TEST_CASE("threadpool", "[threadpool]")
     SECTION("Adding large tasks queues with return values")
     {
-        auto tq = std::make_shared<daggy::TaskQueue>();
         std::vector<std::future<uint32_t>> res;
         for (size_t i = 0; i < 100; ++i)
-            res.emplace_back(tq->addTask([&cnt]() {
+            res.emplace_back(tp.addTask([&cnt]() {
                 cnt++;
                 return cnt.load();
             }));
-        tp.addTasks(tq);
 
         for (auto &r : res)
             r.get();
         REQUIRE(cnt == 100);