Adding fix for race condition in task resource management

This commit is contained in:
Ian Roddis
2022-01-07 16:00:04 -04:00
parent 4da78c9dd6
commit f9076be081

View File

@@ -268,34 +268,39 @@ void DaggyRunnerTaskExecutor::monitor()
{
std::lock_guard<std::mutex> lock(runnersGuard_);
for (auto &[runnerURL, caps] : runners_) {
rj::Document doc;
try {
const auto &[code, json] = JSON_HTTP_REQUEST(runnerURL + "/v1/poll");
auto [code, json] = JSON_HTTP_REQUEST(runnerURL + "/v1/poll");
if (code != HTTPCode::Ok)
continue;
doc.Swap(json);
}
catch (std::exception &e) {
std::cout << "Curl failed for runner " << runnerURL << ": "
<< e.what() << std::endl;
}
const auto tasks = json.GetArray();
for (size_t idx = 0; idx < tasks.Size(); ++idx) {
const auto &task = tasks[idx];
if (task["state"] == "PENDING") {
resolvedJobs.emplace(std::make_pair(task["runID"].GetInt64(),
task["taskName"].GetString()),
std::nullopt);
}
else {
auto tid = std::make_pair(task["runID"].GetInt64(),
task["taskName"].GetString());
const auto tasks = doc.GetArray();
for (size_t idx = 0; idx < tasks.Size(); ++idx) {
const auto &task = tasks[idx];
if (task["state"] == "PENDING") {
resolvedJobs.emplace(std::make_pair(task["runID"].GetInt64(),
task["taskName"].GetString()),
std::nullopt);
}
else {
auto tid = std::make_pair(task["runID"].GetInt64(),
task["taskName"].GetString());
auto it = taskResources.find(tid);
if (it != taskResources.end()) {
const auto &res = taskResources.at(tid);
caps.current.cores += res.cores;
caps.current.memoryMB += res.memoryMB;
resolvedJobs.emplace(tid, attemptRecordFromJSON(task["attempt"]));
}
resolvedJobs.emplace(tid, attemptRecordFromJSON(task["attempt"]));
}
}
catch (std::exception &e) {
std::cout << "Curl timeout failed for runner " << runnerURL << ": "
<< e.what() << std::endl;
}
}
}