Adding agent executor
This commit is contained in:
parent
ca906e75b3
commit
779852022a
@@ -20,3 +20,4 @@ sysinfo = "0.23"
|
|||||||
redis = { version = "*", features = ["aio", "tokio-comp"] }
|
redis = { version = "*", features = ["aio", "tokio-comp"] }
|
||||||
clap = { version = "3.1", features = ["derive"] }
|
clap = { version = "3.1", features = ["derive"] }
|
||||||
env_logger = "0.9"
|
env_logger = "0.9"
|
||||||
|
log = "0.4"
|
||||||
|
|||||||
+7
-1
@@ -29,7 +29,12 @@ impl StorageConfig {
|
|||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
#[serde(rename_all = "snake_case", deny_unknown_fields, tag = "type")]
|
#[serde(rename_all = "snake_case", deny_unknown_fields, tag = "type")]
|
||||||
enum ExecutorConfig {
|
enum ExecutorConfig {
|
||||||
Local { workers: usize },
|
Local {
|
||||||
|
workers: usize,
|
||||||
|
},
|
||||||
|
Agent {
|
||||||
|
targets: Vec<agent_executor::AgentTarget>,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ExecutorConfig {
|
impl ExecutorConfig {
|
||||||
@@ -42,6 +47,7 @@ impl ExecutorConfig {
|
|||||||
let (tx, rx) = mpsc::unbounded_channel();
|
let (tx, rx) = mpsc::unbounded_channel();
|
||||||
match self {
|
match self {
|
||||||
ExecutorConfig::Local { workers } => (tx, local_executor::start(*workers, rx)),
|
ExecutorConfig::Local { workers } => (tx, local_executor::start(*workers, rx)),
|
||||||
|
ExecutorConfig::Agent { targets } => (tx, agent_executor::start(targets.clone(), rx)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,289 @@
|
|||||||
|
//! The Agent executor is essentially a wrapped version of the local executor.
|
||||||
|
//! It dispatches tasks to remote hosts
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
use futures::stream::futures_unordered::FuturesUnordered;
|
||||||
|
use log::{info, warn};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use tokio::sync::{mpsc, oneshot};
|
||||||
|
|
||||||
|
use futures::StreamExt;
|
||||||
|
|
||||||
|
/// Always returns `true`.
///
/// The signature (no arguments, returns the field type) is the shape serde
/// expects from a `#[serde(default = "...")]` provider function.
fn default_as_true() -> bool {
    true
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
||||||
|
pub struct AgentTarget {
|
||||||
|
pub base_url: String,
|
||||||
|
|
||||||
|
#[serde(default)]
|
||||||
|
pub resources: TaskResources,
|
||||||
|
|
||||||
|
#[serde(default)]
|
||||||
|
pub current_resources: TaskResources,
|
||||||
|
|
||||||
|
#[serde(default)]
|
||||||
|
pub enabled: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AgentTarget {
|
||||||
|
fn new(base_url: String, resources: TaskResources) -> Self {
|
||||||
|
AgentTarget {
|
||||||
|
base_url,
|
||||||
|
resources: resources.clone(),
|
||||||
|
current_resources: resources,
|
||||||
|
enabled: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn refresh_resources(&mut self, client: &reqwest::Client) {
|
||||||
|
let resource_url = format!("{}/resources", self.base_url);
|
||||||
|
let disabled = match client.get(resource_url).send().await {
|
||||||
|
Ok(result) => {
|
||||||
|
if result.status() == reqwest::StatusCode::OK {
|
||||||
|
self.resources = result.json().await.unwrap();
|
||||||
|
self.current_resources = self.resources.clone();
|
||||||
|
false
|
||||||
|
} else {
|
||||||
|
true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(_) => true,
|
||||||
|
};
|
||||||
|
if self.enabled && disabled {
|
||||||
|
warn!("Disabling {}: unable to refresh resources", self.base_url);
|
||||||
|
}
|
||||||
|
self.enabled = !disabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn ping(&mut self, client: &reqwest::Client) -> Result<()> {
|
||||||
|
let resource_url = format!("{}/ready", self.base_url);
|
||||||
|
let result = client.get(resource_url).send().await?;
|
||||||
|
self.enabled = result.status() == reqwest::StatusCode::OK;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Contains specifics on how to run a local task
|
||||||
|
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||||
|
struct AgentTaskDetail {
|
||||||
|
/// The command and all arguments to run
|
||||||
|
#[serde(default)]
|
||||||
|
command: Vec<String>,
|
||||||
|
|
||||||
|
/// Environment variables to set
|
||||||
|
#[serde(default)]
|
||||||
|
environment: HashMap<String, String>,
|
||||||
|
|
||||||
|
/// Timeout in seconds
|
||||||
|
#[serde(default)]
|
||||||
|
timeout: i64,
|
||||||
|
|
||||||
|
/// resources required by the task
|
||||||
|
resources: TaskResources,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parses opaque task details into an [`AgentTaskDetail`].
///
/// The input is cloned because `serde_json::from_value` consumes its
/// argument.
///
/// # Errors
/// Returns the `serde_json::Error` raised when `details` does not match the
/// shape of [`AgentTaskDetail`].
fn extract_details(details: &TaskDetails) -> Result<AgentTaskDetail, serde_json::Error> {
    let owned = details.clone();
    serde_json::from_value(owned)
}
|
||||||
|
|
||||||
|
fn validate_task(details: &TaskDetails, max_capacities: &[TaskResources]) -> Result<()> {
|
||||||
|
let parsed = extract_details(details)?;
|
||||||
|
if max_capacities.is_empty()
|
||||||
|
|| max_capacities.iter().all(|x| x.values().all(|x| *x == 0))
|
||||||
|
|| max_capacities
|
||||||
|
.iter()
|
||||||
|
.any(|x| x.can_satisfy(&parsed.resources))
|
||||||
|
{
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(anyhow!("No Agent target satisfies the required resources"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn submit_task(
|
||||||
|
base_url: String,
|
||||||
|
details: TaskDetails,
|
||||||
|
output_options: TaskOutputOptions,
|
||||||
|
client: reqwest::Client,
|
||||||
|
varmap: VarMap,
|
||||||
|
) -> TaskAttempt {
|
||||||
|
let submit_url = format!("{}/run", base_url);
|
||||||
|
let mut attempt = TaskAttempt::new();
|
||||||
|
match client.post(submit_url).json(&details).send().await {
|
||||||
|
Ok(result) => {
|
||||||
|
if result.status() == reqwest::StatusCode::OK {
|
||||||
|
attempt = result.json().await.unwrap();
|
||||||
|
attempt
|
||||||
|
.executor
|
||||||
|
.push(format!("Executed on agent at {}", base_url));
|
||||||
|
} else {
|
||||||
|
attempt.succeeded = false;
|
||||||
|
attempt.infra_failure = true;
|
||||||
|
attempt.executor.push(format!(
|
||||||
|
"Unable to dispatch to agent at {}: {:?}",
|
||||||
|
base_url,
|
||||||
|
result.text().await.unwrap()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
attempt.succeeded = false;
|
||||||
|
attempt.infra_failure = true;
|
||||||
|
attempt.executor.push(format!(
|
||||||
|
"Unable to dispatch to agent at {}: {:?}",
|
||||||
|
base_url, e
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
attempt
|
||||||
|
}
|
||||||
|
|
||||||
|
// async fn select_target() -> Option<usize> {}
|
||||||
|
|
||||||
|
struct RunningTask {
|
||||||
|
resources: TaskResources,
|
||||||
|
target_id: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The mpsc channel can be sized to fit max parallelism
|
||||||
|
async fn start_agent_executor(
|
||||||
|
mut targets: Vec<AgentTarget>,
|
||||||
|
mut exe_msgs: mpsc::UnboundedReceiver<ExecutorMessage>,
|
||||||
|
) {
|
||||||
|
let client = reqwest::Client::new();
|
||||||
|
|
||||||
|
for target in &mut targets {
|
||||||
|
target.refresh_resources(&client).await;
|
||||||
|
}
|
||||||
|
let mut max_caps: Vec<TaskResources> = targets.iter().map(|x| x.resources.clone()).collect();
|
||||||
|
|
||||||
|
// Set up the local executor
|
||||||
|
let (le_tx, le_rx) = mpsc::unbounded_channel();
|
||||||
|
local_executor::start(1, le_rx);
|
||||||
|
|
||||||
|
// Tasks waiting to release resources
|
||||||
|
let mut running = FuturesUnordered::new();
|
||||||
|
|
||||||
|
while let Some(msg) = exe_msgs.recv().await {
|
||||||
|
use ExecutorMessage::*;
|
||||||
|
match msg {
|
||||||
|
ValidateTask { details, response } => {
|
||||||
|
let ltx = le_tx.clone();
|
||||||
|
let caps = max_caps.clone();
|
||||||
|
tokio::spawn(async move {
|
||||||
|
let result = validate_task(&details, &caps);
|
||||||
|
if result.is_err() {
|
||||||
|
response.send(result).unwrap_or(());
|
||||||
|
} else {
|
||||||
|
ltx.send(ValidateTask { details, response }).unwrap_or(());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
ExecuteTask {
|
||||||
|
task_name,
|
||||||
|
interval,
|
||||||
|
details,
|
||||||
|
varmap,
|
||||||
|
output_options,
|
||||||
|
storage,
|
||||||
|
response,
|
||||||
|
kill,
|
||||||
|
} => {
|
||||||
|
let task = extract_details(&details).unwrap();
|
||||||
|
let resources = task.resources.clone();
|
||||||
|
|
||||||
|
loop {
|
||||||
|
match targets.iter_mut().enumerate().find(|(_, x)| {
|
||||||
|
x.enabled && x.current_resources.can_satisfy(&task.resources)
|
||||||
|
}) {
|
||||||
|
// There is a remote agent with capacity
|
||||||
|
Some((tid, target)) => {
|
||||||
|
target.current_resources.sub(&resources).unwrap();
|
||||||
|
let base_url = target.base_url.clone();
|
||||||
|
let submit_client = client.clone();
|
||||||
|
running.push(tokio::spawn(async move {
|
||||||
|
let attempt = submit_task(
|
||||||
|
base_url,
|
||||||
|
details,
|
||||||
|
output_options,
|
||||||
|
submit_client,
|
||||||
|
varmap,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
let rc = attempt.succeeded;
|
||||||
|
storage
|
||||||
|
.send(StorageMessage::StoreAttempt {
|
||||||
|
task_name,
|
||||||
|
interval,
|
||||||
|
attempt,
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
response.send(rc).unwrap();
|
||||||
|
(tid, resources, rc)
|
||||||
|
}));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// No agent has capacity
|
||||||
|
None => {
|
||||||
|
// Give the outstanding tasks a chance to complete or agents
|
||||||
|
// recover
|
||||||
|
tokio::time::sleep(tokio::time::Duration::from_millis(250)).await;
|
||||||
|
info!("Waiting to run message");
|
||||||
|
|
||||||
|
// Refresh any disabled targets
|
||||||
|
for (tid, target) in targets.iter_mut().enumerate() {
|
||||||
|
if target.enabled {
|
||||||
|
info!("Skipping {} as it is enabled", target.base_url);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
target.refresh_resources(&client).await;
|
||||||
|
if target.enabled {
|
||||||
|
max_caps[tid] = target.resources.clone();
|
||||||
|
info!("{} is now enabled.", target.base_url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for the next item
|
||||||
|
if !running.is_empty() {
|
||||||
|
let result: Result<
|
||||||
|
(usize, TaskResources, bool),
|
||||||
|
tokio::task::JoinError,
|
||||||
|
> = running.next().await.unwrap();
|
||||||
|
|
||||||
|
let (tid, resources, submit_ok) = result.unwrap();
|
||||||
|
if !submit_ok {
|
||||||
|
warn!(
|
||||||
|
"Disabling agent at {} due to incomplete submission.",
|
||||||
|
targets[tid].base_url
|
||||||
|
);
|
||||||
|
targets[tid].enabled = false;
|
||||||
|
}
|
||||||
|
targets[tid].current_resources.add(&resources);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
msg @ StopTask { .. } => {
|
||||||
|
le_tx.send(msg).unwrap_or(());
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
Stop {} => {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn start(
|
||||||
|
targets: Vec<AgentTarget>,
|
||||||
|
msgs: mpsc::UnboundedReceiver<ExecutorMessage>,
|
||||||
|
) -> tokio::task::JoinHandle<()> {
|
||||||
|
tokio::spawn(async move {
|
||||||
|
start_agent_executor(targets, msgs).await;
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
use super::*;
|
use super::*;
|
||||||
|
pub mod agent_executor;
|
||||||
pub mod local_executor;
|
pub mod local_executor;
|
||||||
|
|
||||||
/// Messages for interacting with an Executor
|
/// Messages for interacting with an Executor
|
||||||
|
|||||||
+59
@@ -1,4 +1,63 @@
|
|||||||
use super::*;
|
use super::*;
|
||||||
|
use std::ops::{Deref, DerefMut};
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
|
||||||
|
pub struct TaskResources(HashMap<String, i64>);
|
||||||
|
|
||||||
|
impl Deref for TaskResources {
|
||||||
|
type Target = HashMap<String, i64>;
|
||||||
|
fn deref(&self) -> &Self::Target {
|
||||||
|
&self.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DerefMut for TaskResources {
|
||||||
|
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||||
|
&mut self.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TaskResources {
|
||||||
|
#[must_use]
|
||||||
|
pub fn new() -> Self {
|
||||||
|
TaskResources(HashMap::new())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[must_use]
|
||||||
|
pub fn can_satisfy(&self, requirements: &TaskResources) -> bool {
|
||||||
|
requirements
|
||||||
|
.iter()
|
||||||
|
.all(|(k, v)| self.contains_key(k) && self[k] >= *v)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Subtracts resources from available resources.
|
||||||
|
/// # Errors
|
||||||
|
/// Returns an `Err` if the requested resources cannot be fulfilled
|
||||||
|
/// # Panics
|
||||||
|
/// It doesn't, keys are checked for ahead-of-time
|
||||||
|
pub fn sub(&mut self, resources: &TaskResources) -> Result<()> {
|
||||||
|
if self.can_satisfy(resources) {
|
||||||
|
for (k, v) in resources.iter() {
|
||||||
|
*self.get_mut(k).unwrap() -= v;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(anyhow!("Cannot satisfy requested resources"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// # Panics
|
||||||
|
/// It doesn't, keys are checked for ahead-of-time
|
||||||
|
pub fn add(&mut self, resources: &TaskResources) {
|
||||||
|
for (k, v) in resources.iter() {
|
||||||
|
if self.contains_key(k) {
|
||||||
|
*self.get_mut(k).unwrap() += *v;
|
||||||
|
} else {
|
||||||
|
self.insert(k.clone(), *v);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Defines the struct to parse for tasks
|
/// Defines the struct to parse for tasks
|
||||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Debug)]
|
#[derive(Clone, Serialize, Deserialize, PartialEq, Debug)]
|
||||||
|
|||||||
Reference in New Issue
Block a user