Fixing a number of scaling issues:

- A missing close() on the pipe file descriptors made ForkingTaskExecutor
  silently die after running out of FDs
- Tightened the scope of locks to prevent HTTP timeouts
- Simplified threadpool
This commit is contained in:
Ian Roddis
2022-01-10 13:02:10 -04:00
parent efd4078f70
commit 53308c063d
8 changed files with 96 additions and 140 deletions

View File

@@ -154,8 +154,6 @@ std::future<AttemptRecord> DaggyRunnerTaskExecutor::execute(
// Capacities for a runner can be negative, meaning that they're currently
// oversubscribed.
std::vector<std::pair<std::string, double>> impacts;
std::string runner;
{
std::lock_guard<std::mutex> lock(runnersGuard_);
for (auto &[runner, caps] : runners_) {
@@ -191,28 +189,43 @@ std::future<AttemptRecord> DaggyRunnerTaskExecutor::execute(
prom.set_value(std::move(record));
return fut;
}
}
std::sort(impacts.begin(), impacts.end(),
[](const auto &a, const auto &b) { return a.second < b.second; });
std::sort(impacts.begin(), impacts.end(),
[](const auto &a, const auto &b) { return a.second > b.second; });
runner = impacts.back().first;
std::string submitted_runner;
for (const auto &[runner, _] : impacts) {
auto &caps = runners_.at(runner);
caps.current.cores -= taskUsed.cores;
caps.current.memoryMB -= taskUsed.memoryMB;
std::stringstream ss;
ss << runner << "/v1/task/" << runID << "/" << taskName;
auto url = ss.str();
const auto response = HTTP_REQUEST(url, taskToJSON(task), "POST");
if (response.code != HTTPCode::Ok) {
continue;
// throw std::runtime_error("Unable to submit task: " + response.body);
}
submitted_runner = runner;
}
std::stringstream ss;
ss << runner << "/v1/task/" << runID << "/" << taskName;
auto url = ss.str();
const auto response = HTTP_REQUEST(url, taskToJSON(task), "POST");
if (response.code != HTTPCode::Ok)
throw std::runtime_error("Unable to submit task: " + response.body);
if (!submitted_runner.empty()) {
std::promise<AttemptRecord> prom;
auto fut = prom.get_future();
AttemptRecord record{.rc = -1,
.executorLog = "No runners available for execution"};
prom.set_value(std::move(record));
return fut;
}
RunningTask rt{.prom{},
.runID = runID,
.taskName = taskName,
.runnerURL = runner,
.runnerURL = submitted_runner,
.retries = 3,
.resources = taskUsed};
@@ -250,6 +263,8 @@ void DaggyRunnerTaskExecutor::addRunner(const std::string &url)
void DaggyRunnerTaskExecutor::monitor()
{
std::unordered_map<std::string, RunnerCapacity> runners;
while (running_) {
std::unordered_map<std::pair<DAGRunID, std::string>,
std::optional<AttemptRecord>>
@@ -258,6 +273,7 @@ void DaggyRunnerTaskExecutor::monitor()
std::unordered_map<std::pair<DAGRunID, std::string>, Capacity>
taskResources;
// Cache what's running now
{
std::lock_guard<std::mutex> lock(rtGuard_);
for (const auto &[tid, info] : runningTasks_) {
@@ -267,39 +283,40 @@ void DaggyRunnerTaskExecutor::monitor()
{
std::lock_guard<std::mutex> lock(runnersGuard_);
for (auto &[runnerURL, caps] : runners_) {
rj::Document doc;
try {
auto [code, json] = JSON_HTTP_REQUEST(runnerURL + "/v1/poll");
if (code != HTTPCode::Ok)
continue;
doc.Swap(json);
}
catch (std::exception &e) {
std::cout << "Curl failed for runner " << runnerURL << ": "
<< e.what() << std::endl;
}
runners = runners_;
}
const auto tasks = doc.GetArray();
for (size_t idx = 0; idx < tasks.Size(); ++idx) {
const auto &task = tasks[idx];
if (task["state"] == "PENDING") {
resolvedJobs.emplace(std::make_pair(task["runID"].GetInt64(),
task["taskName"].GetString()),
std::nullopt);
}
else {
auto tid = std::make_pair(task["runID"].GetInt64(),
task["taskName"].GetString());
auto it = taskResources.find(tid);
if (it != taskResources.end()) {
const auto &res = taskResources.at(tid);
caps.current.cores += res.cores;
caps.current.memoryMB += res.memoryMB;
}
for (auto &[runnerURL, caps] : runners) {
rj::Document doc;
try {
auto [code, json] = JSON_HTTP_REQUEST(runnerURL + "/v1/poll");
if (code != HTTPCode::Ok)
continue;
doc.Swap(json);
}
catch (std::exception &e) {
continue;
}
resolvedJobs.emplace(tid, attemptRecordFromJSON(task["attempt"]));
const auto tasks = doc.GetArray();
for (size_t idx = 0; idx < tasks.Size(); ++idx) {
const auto &task = tasks[idx];
auto tid = std::make_pair(task["runID"].GetInt64(),
task["taskName"].GetString());
if (task["state"] == "PENDING") {
resolvedJobs.emplace(tid, std::nullopt);
}
else {
auto it = taskResources.find(tid);
if (it != taskResources.end()) {
const auto &res = taskResources.at(tid);
caps.current.cores += res.cores;
caps.current.memoryMB += res.memoryMB;
}
auto attempt = attemptRecordFromJSON(task["attempt"]);
resolvedJobs.emplace(tid, attemptRecordFromJSON(task["attempt"]));
}
}
}

View File

@@ -97,7 +97,7 @@ std::future<daggy::AttemptRecord> ForkingTaskExecutor::execute(
std::lock_guard<std::mutex> lock(taskControlsGuard_);
auto [it, ins] = taskControls_.emplace(key, true);
auto &running = it->second;
return tp_.addTask([this, task, &running, key]() {
return tp_.addTask([this, task, taskName, &running, key]() {
auto ret = this->runTask(task, running);
std::lock_guard<std::mutex> lock(this->taskControlsGuard_);
this->taskControls_.extract(key);
@@ -147,12 +147,16 @@ daggy::AttemptRecord ForkingTaskExecutor::runTask(const Task &task,
// Create the pipe
int stdoutPipe[2];
int pipeRC = pipe2(stdoutPipe, O_DIRECT);
if (pipeRC != 0)
if (pipeRC != 0) {
std::cerr << "Unable to create pipe for stdout: " << pipeRC << std::endl;
throw std::runtime_error("Unable to create pipe for stdout");
}
int stderrPipe[2];
pipeRC = pipe2(stderrPipe, O_DIRECT);
if (pipeRC != 0)
if (pipeRC != 0) {
std::cerr << "Unable to create pipe for stderr" << std::endl;
throw std::runtime_error("Unable to create pipe for stderr");
}
pid_t child = fork();
if (child < 0) {
@@ -187,7 +191,7 @@ daggy::AttemptRecord ForkingTaskExecutor::runTask(const Task &task,
if (childInfo.si_pid > 0) {
break;
}
std::this_thread::sleep_for(250ms);
std::this_thread::sleep_for(100ms);
}
if (!running) {
@@ -215,6 +219,8 @@ daggy::AttemptRecord ForkingTaskExecutor::runTask(const Task &task,
close(stdoutPipe[0]);
close(stderrPipe[0]);
close(stdoutPipe[1]);
close(stderrPipe[1]);
return rec;
}