Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional config options #657

Merged
merged 1 commit into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions crates/arroyo-api/src/pipelines.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@ use crate::{connection_tables, to_micros};
use arroyo_rpc::config::config;
use cornucopia_async::{Database, DatabaseSource};

const DEFAULT_CHECKPOINT_INTERVAL: Duration = Duration::from_secs(10);

async fn compile_sql<'a>(
query: String,
local_udfs: &Vec<Udf>,
Expand Down Expand Up @@ -511,7 +509,7 @@ pub async fn create_pipeline(
let checkpoint_interval = pipeline_post
.checkpoint_interval_micros
.map(Duration::from_micros)
.unwrap_or(DEFAULT_CHECKPOINT_INTERVAL);
.unwrap_or(*config().default_checkpoint_interval);

let job_id = jobs::create_job(
&pipeline_post.name,
Expand Down
5 changes: 2 additions & 3 deletions crates/arroyo-controller/src/job_controller/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ pub mod job_metrics;
const CHECKPOINTS_TO_KEEP: u32 = 4;
const CHECKPOINT_ROWS_TO_KEEP: u32 = 100;
const COMPACT_EVERY: u32 = 2;
const HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(30);

#[derive(Debug, PartialEq, Eq)]
pub enum WorkerState {
Expand All @@ -58,7 +57,7 @@ pub struct WorkerStatus {

impl WorkerStatus {
fn heartbeat_timeout(&self) -> bool {
self.last_heartbeat.elapsed() > HEARTBEAT_TIMEOUT
self.last_heartbeat.elapsed() > *config().pipeline.worker_heartbeat_timeout
}
}

Expand Down Expand Up @@ -369,7 +368,7 @@ impl RunningJobModel {
}

async fn compact_state(&mut self) -> anyhow::Result<()> {
if !config().controller.compaction.enabled {
if !config().pipeline.compaction.enabled {
info!("Compaction is disabled, skipping compaction");
return Ok(());
}
Expand Down
12 changes: 4 additions & 8 deletions crates/arroyo-controller/src/states/running.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,6 @@ use serde_json::json;

use super::{JobContext, State, Transition};

// after this amount of time, we consider the job to be healthy and reset the restarts counter
const HEALTHY_DURATION: Duration = Duration::from_secs(2 * 60);

// how many times we allow the job to restart before moving it to failed
const RESTARTS_ALLOWED: usize = 10;

#[derive(Debug)]
pub struct Running {}

Expand All @@ -36,6 +30,8 @@ impl State for Running {
async fn next(mut self: Box<Self>, ctx: &mut JobContext) -> Result<Transition, StateError> {
stop_if_desired_running!(self, ctx.config);

let pipeline_config = &config().clone().pipeline;

let running_start = Instant::now();

let mut log_interval = tokio::time::interval(Duration::from_secs(60));
Expand Down Expand Up @@ -92,7 +88,7 @@ impl State for Running {
}
}
_ = tokio::time::sleep(Duration::from_millis(200)) => {
if ctx.status.restarts > 0 && running_start.elapsed() > HEALTHY_DURATION {
if ctx.status.restarts > 0 && running_start.elapsed() > *pipeline_config.healthy_duration {
let restarts = ctx.status.restarts;
ctx.status.restarts = 0;
if let Err(e) = ctx.status.update_db(&ctx.db).await {
Expand Down Expand Up @@ -120,7 +116,7 @@ impl State for Running {
"job_id": ctx.config.id,
"error": format!("{:?}", err),
}));
if ctx.status.restarts >= RESTARTS_ALLOWED as i32 {
if pipeline_config.allowed_restarts != -1 && ctx.status.restarts >= pipeline_config.allowed_restarts {
return Err(fatal(
"Job has restarted too many times",
err
Expand Down
21 changes: 11 additions & 10 deletions crates/arroyo-controller/src/states/scheduling.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,6 @@ use crate::{

use super::{running::Running, JobContext, State, Transition};

const WORKER_STARTUP_TIME: Duration = Duration::from_secs(10 * 60);
const TASK_STARTUP_TIME: Duration = Duration::from_secs(2 * 60);

#[derive(Debug, Clone)]
struct WorkerStatus {
id: WorkerId,
Expand Down Expand Up @@ -189,7 +186,7 @@ impl Scheduling {
slots_for_job = slots_needed,
slots_needed = s
);
if start.elapsed() > WORKER_STARTUP_TIME {
if start.elapsed() > *config().pipeline.worker_startup_time {
return Err(fatal(
"Not enough slots to schedule job",
anyhow!("scheduler error -- needed {} slots", slots_needed),
Expand Down Expand Up @@ -244,10 +241,13 @@ impl State for Scheduling {
let worker_connects = Arc::new(Mutex::new(HashMap::new()));
let mut handles = vec![];

let config = &config().pipeline;

let start = Instant::now();
loop {
let timeout = WORKER_STARTUP_TIME
.min(ctx.config.ttl.unwrap_or(WORKER_STARTUP_TIME))
let timeout = config
.worker_startup_time
.min(ctx.config.ttl.unwrap_or(*config.worker_startup_time))
.checked_sub(start.elapsed())
.unwrap_or(Duration::ZERO);

Expand All @@ -268,7 +268,7 @@ impl State for Scheduling {
_ = tokio::time::sleep(timeout) => {
return Err(ctx.retryable(self,
"timed out while waiting for workers to start",
anyhow!("timed out after {:?} while waiting for worker startup", WORKER_STARTUP_TIME), 3));
anyhow!("timed out after {:?} while waiting for worker startup", *config.worker_startup_time), 3));
}
}

Expand Down Expand Up @@ -499,8 +499,9 @@ impl State for Scheduling {
let start = Instant::now();
let mut started_tasks = HashSet::new();
while started_tasks.len() < ctx.program.task_count() {
let timeout = TASK_STARTUP_TIME
.min(ctx.config.ttl.unwrap_or(TASK_STARTUP_TIME))
let timeout = config
.task_startup_time
.min(ctx.config.ttl.unwrap_or(*config.task_startup_time))
.checked_sub(start.elapsed())
.unwrap_or(Duration::ZERO);

Expand Down Expand Up @@ -528,7 +529,7 @@ impl State for Scheduling {
_ = tokio::time::sleep(timeout) => {
return Err(ctx.retryable(self,
"timed out while waiting for tasks to start",
anyhow!("timed out after {:?} while waiting for worker startup", TASK_STARTUP_TIME), 3));
anyhow!("timed out after {:?} while waiting for worker startup", *config.task_startup_time), 3));
}
}
}
Expand Down
26 changes: 17 additions & 9 deletions crates/arroyo-rpc/default.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,19 @@
checkpoint-url = "/tmp/arroyo/checkpoints"
default-checkpoint-interval = "10s"

[pipeline]
source-batch-size = 512
source-batch-linger = "100ms"
update-aggregate-flush-interval = "1s"
allowed-restarts = 20
worker-heartbeat-timeout = "30s"
healthy-duration = "2m"
worker-startup-time = "10m"
task-startup-time = "2m"

[pipeline.compaction]
enabled = false
checkpoints-to-compact = 4

# Services

Expand All @@ -11,11 +26,6 @@ bind-address = "0.0.0.0"
rpc-port = 9190
scheduler = "process"

[controller.compaction]
enabled = false
checkpoints-to-compact = 4


[compiler]
bind-address = "0.0.0.0"
rpc-port = 9191
Expand All @@ -40,10 +50,6 @@ task-slots = 16
bind-address = "0.0.0.0"
http-port = 8001

[pipeline]
source-batch-size = 512
source-batch-linger = "100ms"
update-aggregate-flush-interval = "1s"

# Schedulers

Expand All @@ -63,6 +69,8 @@ resources = { requests = { cpu = "900m", memory = "500Mi" } }
task-slots = 16
command = "/app/arroyo-bin worker"

# other

[database]
type = "postgres"

Expand Down
24 changes: 21 additions & 3 deletions crates/arroyo-rpc/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,9 @@ pub struct Config {
/// URL of an object store or filesystem for storing checkpoints
pub checkpoint_url: String,

/// Default interval for checkpointing
pub default_checkpoint_interval: HumanReadableDuration,

/// The endpoint of the controller, used by other services to connect to it. This must be set
/// if running the controller on a separate machine from the other services or on a separate
/// process with a non-standard port.
Expand Down Expand Up @@ -273,8 +276,6 @@ pub struct ControllerConfig {

/// The scheduler to use
pub scheduler: Scheduler,

pub compaction: CompactionConfig,
}

#[derive(Debug, Deserialize, Serialize)]
Expand Down Expand Up @@ -381,8 +382,25 @@ pub struct PipelineConfig {
/// Batch linger time (how long to wait before flushing)
pub source_batch_linger: HumanReadableDuration,

// How often to flush aggregates
/// How often to flush aggregates
pub update_aggregate_flush_interval: HumanReadableDuration,

/// How many restarts to allow before moving to failed (-1 for infinite)
pub allowed_restarts: i32,

/// After this amount of time, we consider the job to be healthy and reset the restarts counter
pub healthy_duration: HumanReadableDuration,

/// Amount of time to wait for a worker heartbeat before considering the worker dead
pub worker_heartbeat_timeout: HumanReadableDuration,

/// Amount of time to wait for workers to start up before considering them failed
pub worker_startup_time: HumanReadableDuration,

/// Amount of time to wait for tasks to start up before considering them failed
pub task_startup_time: HumanReadableDuration,

pub compaction: CompactionConfig,
}

#[derive(Debug, Deserialize, Serialize, Eq, PartialEq)]
Expand Down
2 changes: 1 addition & 1 deletion crates/arroyo-state/src/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ impl ParquetBackend {
operator_id: String,
epoch: u32,
) -> Result<HashMap<String, TableCheckpointMetadata>> {
let min_files_to_compact = config().controller.compaction.checkpoints_to_compact as usize;
let min_files_to_compact = config().pipeline.compaction.checkpoints_to_compact as usize;

let operator_checkpoint_metadata =
Self::load_operator_metadata(&job_id, &operator_id, epoch)
Expand Down
2 changes: 1 addition & 1 deletion k8s/arroyo/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ maintainers:
email: support@arroyo.systems
url: https://arroyo.dev

icon: https://raw.githubusercontent.com/ArroyoSystems/arroyo/master/images/arroyo_logo.png
icon: https://raw.githubusercontent.com/ArroyoSystems/arroyo/afc004609351f6e4cd2c1f467bb7bf33ad044ccd/images/arroyo_logo.png
4 changes: 0 additions & 4 deletions k8s/arroyo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,7 @@ Arroyo control plane. This is the easiest way to get a production quality Arroyo
See the [docs](https://doc.arroyo.dev/deployment/kubernetes) for full information on how to use this helm chart.

Each version of the helm chart is associated by default with a particular release of Arroyo. The latest release
is [0.10.3](https://www.arroyo.dev/blog/arroyo-0-10-0).

## Quickstart

Expand Down
Loading