Adding fix for race condition in task resource management
This commit is contained in:
@@ -268,34 +268,39 @@ void DaggyRunnerTaskExecutor::monitor()
|
|||||||
{
|
{
|
||||||
std::lock_guard<std::mutex> lock(runnersGuard_);
|
std::lock_guard<std::mutex> lock(runnersGuard_);
|
||||||
for (auto &[runnerURL, caps] : runners_) {
|
for (auto &[runnerURL, caps] : runners_) {
|
||||||
|
rj::Document doc;
|
||||||
try {
|
try {
|
||||||
const auto &[code, json] = JSON_HTTP_REQUEST(runnerURL + "/v1/poll");
|
auto [code, json] = JSON_HTTP_REQUEST(runnerURL + "/v1/poll");
|
||||||
if (code != HTTPCode::Ok)
|
if (code != HTTPCode::Ok)
|
||||||
continue;
|
continue;
|
||||||
|
doc.Swap(json);
|
||||||
|
}
|
||||||
|
catch (std::exception &e) {
|
||||||
|
std::cout << "Curl failed for runner " << runnerURL << ": "
|
||||||
|
<< e.what() << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
const auto tasks = json.GetArray();
|
const auto tasks = doc.GetArray();
|
||||||
for (size_t idx = 0; idx < tasks.Size(); ++idx) {
|
for (size_t idx = 0; idx < tasks.Size(); ++idx) {
|
||||||
const auto &task = tasks[idx];
|
const auto &task = tasks[idx];
|
||||||
if (task["state"] == "PENDING") {
|
if (task["state"] == "PENDING") {
|
||||||
resolvedJobs.emplace(std::make_pair(task["runID"].GetInt64(),
|
resolvedJobs.emplace(std::make_pair(task["runID"].GetInt64(),
|
||||||
task["taskName"].GetString()),
|
task["taskName"].GetString()),
|
||||||
std::nullopt);
|
std::nullopt);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
auto tid = std::make_pair(task["runID"].GetInt64(),
|
auto tid = std::make_pair(task["runID"].GetInt64(),
|
||||||
task["taskName"].GetString());
|
task["taskName"].GetString());
|
||||||
|
auto it = taskResources.find(tid);
|
||||||
|
if (it != taskResources.end()) {
|
||||||
const auto &res = taskResources.at(tid);
|
const auto &res = taskResources.at(tid);
|
||||||
caps.current.cores += res.cores;
|
caps.current.cores += res.cores;
|
||||||
caps.current.memoryMB += res.memoryMB;
|
caps.current.memoryMB += res.memoryMB;
|
||||||
|
|
||||||
resolvedJobs.emplace(tid, attemptRecordFromJSON(task["attempt"]));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
resolvedJobs.emplace(tid, attemptRecordFromJSON(task["attempt"]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (std::exception &e) {
|
|
||||||
std::cout << "Curl timeout failed for runner " << runnerURL << ": "
|
|
||||||
<< e.what() << std::endl;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user