Lots of fixes to poor daggyr implementation, added debugging messages
This commit is contained in:
@@ -55,6 +55,7 @@ namespace daggy::executors::task {
|
||||
DAGRunID runID;
|
||||
std::string taskName;
|
||||
std::string runnerURL;
|
||||
uint32_t retries;
|
||||
};
|
||||
|
||||
// Resolves jobs through polling
|
||||
|
||||
@@ -106,8 +106,12 @@ namespace daggy {
|
||||
taskAttemptCounts_[taskName] = 1;
|
||||
|
||||
logger_.updateTaskState(runID_, taskName, RunState::RUNNING);
|
||||
runningTasks_.emplace(taskName,
|
||||
executor_.execute(runID_, taskName, task));
|
||||
try {
|
||||
auto fut = executor_.execute(runID_, taskName, task);
|
||||
runningTasks_.emplace(taskName, std::move(fut));
|
||||
}
|
||||
catch (std::exception &e) {
|
||||
}
|
||||
++nRunningTasks_;
|
||||
|
||||
auto nextTask = dag_.visitNext();
|
||||
|
||||
@@ -234,6 +234,7 @@ namespace daggy {
|
||||
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curlWriter);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 2);
|
||||
|
||||
if (trace) {
|
||||
curl_easy_setopt(curl, CURLOPT_DEBUGFUNCTION, http_trace);
|
||||
|
||||
@@ -135,6 +135,7 @@ std::future<AttemptRecord> DaggyRunnerTaskExecutor::execute(
|
||||
// Capacities for a runner can be negative, meaning that they're currently
|
||||
// oversubscribed.
|
||||
std::vector<std::pair<std::string, double>> impacts;
|
||||
|
||||
for (const auto &runner : runners_) {
|
||||
try {
|
||||
const auto &[code, doc] = JSON_HTTP_REQUEST(runner + "/v1/capacity");
|
||||
@@ -158,8 +159,14 @@ std::future<AttemptRecord> DaggyRunnerTaskExecutor::execute(
|
||||
}
|
||||
}
|
||||
|
||||
if (impacts.empty())
|
||||
throw std::runtime_error("No runners available for execution");
|
||||
if (impacts.empty()) {
|
||||
std::promise<AttemptRecord> prom;
|
||||
auto fut = prom.get_future();
|
||||
AttemptRecord record{.rc = -1,
|
||||
.executorLog = "No runners available for execution"};
|
||||
prom.set_value(std::move(record));
|
||||
return fut;
|
||||
}
|
||||
|
||||
auto cit = impacts.begin();
|
||||
for (auto it = impacts.begin(); it != impacts.end(); ++it) {
|
||||
@@ -167,8 +174,19 @@ std::future<AttemptRecord> DaggyRunnerTaskExecutor::execute(
|
||||
cit = it;
|
||||
}
|
||||
|
||||
RunningTask rt{
|
||||
.prom{}, .runID = runID, .taskName = taskName, .runnerURL = cit->first};
|
||||
std::stringstream ss;
|
||||
ss << cit->first << "/v1/task/" << runID << "/" << taskName;
|
||||
auto url = ss.str();
|
||||
|
||||
const auto response = HTTP_REQUEST(url, taskToJSON(task), "POST");
|
||||
if (response.code != HTTPCode::Ok)
|
||||
throw std::runtime_error("Unable to submit task: " + response.body);
|
||||
|
||||
RunningTask rt{.prom{},
|
||||
.runID = runID,
|
||||
.taskName = taskName,
|
||||
.runnerURL = cit->first,
|
||||
.retries = 3};
|
||||
|
||||
auto fut = rt.prom.get_future();
|
||||
|
||||
@@ -194,34 +212,42 @@ void DaggyRunnerTaskExecutor::monitor()
|
||||
{
|
||||
std::vector<std::pair<DAGRunID, std::string>> resolvedJobs;
|
||||
|
||||
std::lock_guard<std::mutex> lock(rtGuard_);
|
||||
for (auto &[taskID, task] : runningTasks_) {
|
||||
try {
|
||||
const auto &[code, json] = JSON_HTTP_REQUEST(
|
||||
task.runnerURL + "/v1/task/" + std::to_string(taskID.first) +
|
||||
"/" + taskID.second);
|
||||
if (code != HTTPCode::Ok) {
|
||||
AttemptRecord record{
|
||||
.rc = -1, .executorLog = "Unable to query runner for progress"};
|
||||
task.prom.set_value(std::move(record));
|
||||
resolvedJobs.emplace_back(taskID);
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(rtGuard_);
|
||||
for (auto &[taskID, task] : runningTasks_) {
|
||||
try {
|
||||
const auto &[code, json] = JSON_HTTP_REQUEST(
|
||||
task.runnerURL + "/v1/task/" + std::to_string(taskID.first) +
|
||||
"/" + taskID.second);
|
||||
if (code != HTTPCode::Ok) {
|
||||
--task.retries;
|
||||
|
||||
if (task.retries == 0) {
|
||||
AttemptRecord record{
|
||||
.rc = -1,
|
||||
.executorLog = "Unable to query runner for progress"};
|
||||
task.prom.set_value(std::move(record));
|
||||
resolvedJobs.emplace_back(taskID);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (json["state"] == "COMPLETED") {
|
||||
auto attempt = attemptRecordFromJSON(json["attempt"]);
|
||||
task.prom.set_value(std::move(attempt));
|
||||
resolvedJobs.emplace_back(taskID);
|
||||
}
|
||||
}
|
||||
catch (std::runtime_error &e) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (json["state"] == "COMPLETED") {
|
||||
task.prom.set_value(attemptRecordFromJSON(json["attempt"]));
|
||||
resolvedJobs.emplace_back(taskID);
|
||||
}
|
||||
}
|
||||
catch (std::runtime_error &e) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (const auto &tid : resolvedJobs) {
|
||||
runningTasks_.extract(tid);
|
||||
}
|
||||
}
|
||||
std::this_thread::sleep_for(std::chrono::seconds(1));
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(250));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user