feat: add deep health check for scheduler (#3304)

Co-authored-by: hyperswitch-bot[bot] <148525504+hyperswitch-bot[bot]@users.noreply.github.com>
Co-authored-by: dracarys18 <karthikey.hegde@juspay.in>
Author: Chethan Rao
Date: 2024-02-01 16:44:13 +05:30
Committed by: GitHub
Parent: 7cf6c8c0b9
Commit: 170e10cb8e
13 changed files with 193 additions and 3 deletions

Cargo.lock (generated)

@@ -5526,6 +5526,7 @@ dependencies = [
"external_services",
"futures 0.3.28",
"masking",
"num_cpus",
"once_cell",
"rand 0.8.5",
"redis_interface",


@@ -298,6 +298,12 @@ lower_fetch_limit = 1800 # Lower limit for fetching entries from redis
lock_key = "PRODUCER_LOCKING_KEY" # The following keys defines the producer lock that is created in redis with
lock_ttl = 160 # the ttl being the expiry (in seconds)
# Scheduler server configuration
[scheduler.server]
port = 3000 # Port on which the server will listen for incoming requests
host = "127.0.0.1" # Host IP address to bind the server to
workers = 1 # Number of actix workers to handle incoming requests concurrently
batch_size = 200 # Specifies the batch size the producer will push under a single entry in the redis queue
# Drainer configuration, which handles draining raw SQL queries from Redis streams to the SQL database
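For reference, a minimal standalone sketch of how this `[scheduler.server]` table maps onto a settings struct, assuming the `serde` and `toml` crates; the actual crate loads it through its own settings machinery (see the `Server` struct further down in this diff).

```rust
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct Root {
    scheduler: Scheduler,
}

#[derive(Debug, Deserialize)]
struct Scheduler {
    server: ServerConfig,
}

// Illustrative mirror of the scheduler's `Server` settings.
#[derive(Debug, Deserialize)]
struct ServerConfig {
    port: u16,
    host: String,
    workers: usize,
}

fn main() {
    let raw = r#"
        [scheduler.server]
        port = 3000
        host = "127.0.0.1"
        workers = 1
    "#;
    // Fails if the table or any key is missing or has the wrong type.
    let parsed: Root = toml::from_str(raw).expect("invalid [scheduler.server] config");
    println!("{:?}", parsed.scheduler.server);
}
```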


@@ -9,3 +9,9 @@ stream = "scheduler_stream"
[scheduler.consumer]
consumer_group = "scheduler_group"
disabled = false # This flag decides if the consumer should actively consume task
# Scheduler server configuration
[scheduler.server]
port = 3000 # Port on which the server will listen for incoming requests
host = "127.0.0.1" # Host IP address to bind the server to
workers = 1 # Number of actix workers to handle incoming requests concurrently


@@ -12,3 +12,9 @@ lock_key = "producer_locking_key" # The following keys defines the producer lock
lock_ttl = 160 # the ttl being the expiry (in seconds)
lower_fetch_limit = 900 # Lower limit for fetching entries from redis queue (in seconds)
upper_fetch_limit = 0 # Upper limit for fetching entries from the redis queue (in seconds)0
# Scheduler server configuration
[scheduler.server]
port = 3000 # Port on which the server will listen for incoming requests
host = "127.0.0.1" # Host IP address to bind the server to
workers = 1 # Number of actix workers to handle incoming requests concurrently


@@ -228,6 +228,11 @@ stream = "SCHEDULER_STREAM"
disabled = false
consumer_group = "SCHEDULER_GROUP"
[scheduler.server]
port = 3000
host = "127.0.0.1"
workers = 1
[email]
sender_email = "example@example.com"
aws_region = ""


@@ -227,6 +227,11 @@ stream = "SCHEDULER_STREAM"
disabled = false
consumer_group = "SCHEDULER_GROUP"
[scheduler.server]
port = 3000
host = "127.0.0.1"
workers = 1
#tokenization configuration which describe token lifetime and payment method for specific connector
[tokenization]
stripe = { long_lived_token = false, payment_method = "wallet", payment_method_type = { type = "disable_only", list = "google_pay" } }


@@ -8,3 +8,10 @@ pub struct RouterHealthCheckResponse {
}
impl common_utils::events::ApiEventMetric for RouterHealthCheckResponse {}
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct SchedulerHealthCheckResponse {
pub database: bool,
pub redis: bool,
}
impl common_utils::events::ApiEventMetric for SchedulerHealthCheckResponse {}
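For clarity, this is the JSON shape the new readiness endpoint serializes; a self-contained sketch using a local mirror of `SchedulerHealthCheckResponse` (the real type lives in `api_models::health_check`).

```rust
use serde::Serialize;

// Local mirror of api_models::health_check::SchedulerHealthCheckResponse.
#[derive(Serialize)]
struct SchedulerHealthCheckResponse {
    database: bool,
    redis: bool,
}

fn main() {
    let report = SchedulerHealthCheckResponse { database: true, redis: true };
    // Prints: {"database":true,"redis":true}
    println!("{}", serde_json::to_string(&report).unwrap());
}
```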


@@ -1,21 +1,29 @@
#![recursion_limit = "256"]
use std::{str::FromStr, sync::Arc};
use actix_web::{dev::Server, web, Scope};
use api_models::health_check::SchedulerHealthCheckResponse;
use common_utils::ext_traits::{OptionExt, StringExt};
use diesel_models::process_tracker as storage;
use error_stack::ResultExt;
use router::{
configs::settings::{CmdLineConf, Settings},
core::errors::{self, CustomResult},
logger, routes, services,
core::{
errors::{self, CustomResult},
health_check::HealthCheckInterface,
},
logger, routes,
services::{self, api},
types::storage::ProcessTrackerExt,
workflows,
};
use router_env::{instrument, tracing};
use scheduler::{
consumer::workflows::ProcessTrackerWorkflow, errors::ProcessTrackerError,
workflows::ProcessTrackerWorkflows, SchedulerAppState,
};
use serde::{Deserialize, Serialize};
use storage_impl::errors::ApplicationError;
use strum::EnumString;
use tokio::sync::{mpsc, oneshot};
@@ -68,6 +76,19 @@ async fn main() -> CustomResult<(), ProcessTrackerError> {
[router_env::service_name!()],
);
#[allow(clippy::expect_used)]
let web_server = Box::pin(start_web_server(
state.clone(),
scheduler_flow_str.to_string(),
))
.await
.expect("Failed to create the server");
tokio::spawn(async move {
let _ = web_server.await;
logger::error!("The health check probe stopped working!");
});
logger::debug!(startup_config=?state.conf);
start_scheduler(&state, scheduler_flow, (tx, rx)).await?;
@@ -76,6 +97,106 @@ async fn main() -> CustomResult<(), ProcessTrackerError> {
Ok(())
}
pub async fn start_web_server(
state: routes::AppState,
service: String,
) -> errors::ApplicationResult<Server> {
let server = state
.conf
.scheduler
.as_ref()
.ok_or(ApplicationError::InvalidConfigurationValueError(
"Scheduler server is invalidly configured".into(),
))?
.server
.clone();
let web_server = actix_web::HttpServer::new(move || {
actix_web::App::new().service(Health::server(state.clone(), service.clone()))
})
.bind((server.host.as_str(), server.port))?
.workers(server.workers)
.run();
let _ = web_server.handle();
Ok(web_server)
}
pub struct Health;
impl Health {
pub fn server(state: routes::AppState, service: String) -> Scope {
web::scope("health")
.app_data(web::Data::new(state))
.app_data(web::Data::new(service))
.service(web::resource("").route(web::get().to(health)))
.service(web::resource("/ready").route(web::get().to(deep_health_check)))
}
}
#[instrument(skip_all)]
pub async fn health() -> impl actix_web::Responder {
logger::info!("Scheduler health was called");
actix_web::HttpResponse::Ok().body("Scheduler health is good")
}
#[instrument(skip_all)]
pub async fn deep_health_check(
state: web::Data<routes::AppState>,
service: web::Data<String>,
) -> impl actix_web::Responder {
let report = deep_health_check_func(state, service).await;
match report {
Ok(response) => services::http_response_json(
serde_json::to_string(&response)
.map_err(|err| {
logger::error!(serialization_error=?err);
})
.unwrap_or_default(),
),
Err(err) => api::log_and_return_error_response(err),
}
}
#[instrument(skip_all)]
pub async fn deep_health_check_func(
state: web::Data<routes::AppState>,
service: web::Data<String>,
) -> errors::RouterResult<SchedulerHealthCheckResponse> {
logger::info!("{} deep health check was called", service.into_inner());
logger::debug!("Database health check begin");
let db_status = state.health_check_db().await.map(|_| true).map_err(|err| {
error_stack::report!(errors::ApiErrorResponse::HealthCheckError {
component: "Database",
message: err.to_string()
})
})?;
logger::debug!("Database health check end");
logger::debug!("Redis health check begin");
let redis_status = state
.health_check_redis()
.await
.map(|_| true)
.map_err(|err| {
error_stack::report!(errors::ApiErrorResponse::HealthCheckError {
component: "Redis",
message: err.to_string()
})
})?;
logger::debug!("Redis health check end");
let response = SchedulerHealthCheckResponse {
database: db_status,
redis: redis_status,
};
Ok(response)
}
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, EnumString)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[strum(serialize_all = "SCREAMING_SNAKE_CASE")]
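A hypothetical client-side check of the two routes registered above, assuming the scheduler web server is bound to the default `127.0.0.1:3000` and that `reqwest` and `tokio` are available in the probing crate; it is not part of this change. Note that `deep_health_check_func` runs the database and Redis checks sequentially and returns early on the first failure, so the readiness body is only produced when both dependencies are reachable.

```rust
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // Liveness probe: plain-text body "Scheduler health is good".
    let live = reqwest::get("http://127.0.0.1:3000/health").await?.text().await?;
    println!("liveness: {live}");

    // Readiness probe: JSON from deep_health_check_func, e.g. {"database":true,"redis":true}.
    let ready = reqwest::get("http://127.0.0.1:3000/health/ready")
        .await?
        .text()
        .await?;
    println!("readiness: {ready}");
    Ok(())
}
```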


@@ -238,7 +238,7 @@ pub enum ApiErrorResponse {
WebhookInvalidMerchantSecret,
#[error(error_type = ErrorType::InvalidRequestError, code = "IR_19", message = "{message}")]
CurrencyNotSupported { message: String },
#[error(error_type = ErrorType::ServerNotAvailable, code= "HE_00", message = "{component} health check is failiing with error: {message}")]
#[error(error_type = ErrorType::ServerNotAvailable, code= "HE_00", message = "{component} health check is failing with error: {message}")]
HealthCheckError {
component: &'static str,
message: String,


@@ -13,6 +13,7 @@ kv_store = []
async-trait = "0.1.68"
error-stack = "0.3.1"
futures = "0.3.28"
num_cpus = "1.15.0"
once_cell = "1.18.0"
rand = "0.8.5"
serde = "1.0.193"


@@ -6,6 +6,7 @@ impl Default for super::settings::SchedulerSettings {
consumer: super::settings::ConsumerSettings::default(),
graceful_shutdown_interval: 60000,
loop_interval: 5000,
server: super::settings::Server::default(),
}
}
}
@@ -30,3 +31,13 @@ impl Default for super::settings::ConsumerSettings {
}
}
}
impl Default for super::settings::Server {
fn default() -> Self {
Self {
port: 8080,
workers: num_cpus::get_physical(),
host: "localhost".into(),
}
}
}


@@ -15,6 +15,15 @@ pub struct SchedulerSettings {
pub consumer: ConsumerSettings,
pub loop_interval: u64,
pub graceful_shutdown_interval: u64,
pub server: Server,
}
#[derive(Debug, Deserialize, Clone)]
#[serde(default)]
pub struct Server {
pub port: u16,
pub workers: usize,
pub host: String,
}
#[derive(Debug, Clone, Deserialize)]
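Because the struct carries container-level `#[serde(default)]`, any key omitted from `[scheduler.server]` is filled in from the `Default` impl shown earlier in this diff. A standalone sketch of that behaviour, assuming the `toml` crate and with the physical-CPU default replaced by a constant so the snippet has no extra dependencies:

```rust
use serde::Deserialize;

#[derive(Debug, Deserialize)]
#[serde(default)]
struct Server {
    port: u16,
    workers: usize,
    host: String,
}

impl Default for Server {
    fn default() -> Self {
        Self {
            port: 8080,
            // Stand-in for num_cpus::get_physical() to keep the sketch dependency-free.
            workers: 4,
            host: "localhost".into(),
        }
    }
}

fn main() {
    // Only `port` is supplied; `workers` and `host` fall back to Default.
    let partial: Server = toml::from_str("port = 3000").expect("invalid config");
    println!("{partial:?}"); // Server { port: 3000, workers: 4, host: "localhost" }
}
```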


@@ -19,6 +19,8 @@ impl super::settings::SchedulerSettings {
self.producer.validate()?;
self.server.validate()?;
Ok(())
}
}
@@ -32,3 +34,13 @@ impl super::settings::ProducerSettings {
})
}
}
impl super::settings::Server {
pub fn validate(&self) -> Result<(), ApplicationError> {
common_utils::fp_utils::when(self.host.is_default_or_empty(), || {
Err(ApplicationError::InvalidConfigurationValueError(
"server host must not be empty".into(),
))
})
}
}
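As a standalone illustration of the guard above: an empty host should be rejected before the web server is started. This sketch reimplements the check with plain `std` instead of `common_utils::fp_utils::when` and `is_default_or_empty`, so it is an analogue rather than the crate's actual code.

```rust
#[derive(Debug)]
struct InvalidConfigurationValueError(String);

#[allow(dead_code)]
struct Server {
    port: u16,
    workers: usize,
    host: String,
}

impl Server {
    fn validate(&self) -> Result<(), InvalidConfigurationValueError> {
        // Mirrors the intent of `when(self.host.is_default_or_empty(), ...)`.
        if self.host.trim().is_empty() {
            return Err(InvalidConfigurationValueError(
                "server host must not be empty".into(),
            ));
        }
        Ok(())
    }
}

fn main() {
    let bad = Server { port: 3000, workers: 1, host: String::new() };
    assert!(bad.validate().is_err());

    let good = Server { port: 3000, workers: 1, host: "127.0.0.1".into() };
    assert!(good.validate().is_ok());
    println!("host validation behaves as expected");
}
```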