From 42cd769407f4a30e50d5b9826677a4dd310d97f4 Mon Sep 17 00:00:00 2001 From: Chethan Rao <70657455+Chethan-rao@users.noreply.github.com> Date: Tue, 11 Jun 2024 20:22:20 +0530 Subject: [PATCH] feat(metrics): add support for gauge metrics and include IMC metrics (#4939) Co-authored-by: hyperswitch-bot[bot] <148525504+hyperswitch-bot[bot]@users.noreply.github.com> --- config/config.example.toml | 1 + config/deployments/env_specific.toml | 1 + config/development.toml | 1 + config/docker_compose.toml | 3 +- crates/router/src/bin/router.rs | 6 +++ crates/router/src/routes/metrics.rs | 10 ++-- .../routes/metrics/bg_metrics_collector.rs | 46 +++++++++++++++++++ crates/router_env/src/logger/config.rs | 2 + crates/router_env/src/metrics.rs | 19 ++++++++ crates/storage_impl/src/redis/cache.rs | 10 ++++ 10 files changed, 95 insertions(+), 4 deletions(-) create mode 100644 crates/router/src/routes/metrics/bg_metrics_collector.rs diff --git a/config/config.example.toml b/config/config.example.toml index f4a6a66cd6..81516ee8b6 100644 --- a/config/config.example.toml +++ b/config/config.example.toml @@ -112,6 +112,7 @@ otel_exporter_otlp_endpoint = "http://localhost:4317" # endpoint to send metrics otel_exporter_otlp_timeout = 5000 # timeout (in milliseconds) for sending metrics and traces use_xray_generator = false # Set this to true for AWS X-ray compatible traces route_to_trace = ["*/confirm"] +bg_metrics_collection_interval_in_secs = 15 # Interval for collecting the metrics in background thread # This section provides some secret values. [secrets] diff --git a/config/deployments/env_specific.toml b/config/deployments/env_specific.toml index a097e1b66d..9ab790b8ee 100644 --- a/config/deployments/env_specific.toml +++ b/config/deployments/env_specific.toml @@ -139,6 +139,7 @@ otel_exporter_otlp_endpoint = "http://localhost:4317" # endpoint to send metrics otel_exporter_otlp_timeout = 5000 # timeout (in milliseconds) for sending metrics and traces use_xray_generator = false # Set this to true for AWS X-ray compatible traces route_to_trace = ["*/confirm"] +bg_metrics_collection_interval_in_secs = 15 # Interval for collecting the metrics in background thread [lock_settings] delay_between_retries_in_milliseconds = 500 # Delay between retries in milliseconds diff --git a/config/development.toml b/config/development.toml index 2a1ea74660..2fcac14fb1 100644 --- a/config/development.toml +++ b/config/development.toml @@ -10,6 +10,7 @@ log_format = "default" traces_enabled = false metrics_enabled = false use_xray_generator = false +bg_metrics_collection_interval_in_secs = 15 # TODO: Update database credentials before running application [master_database] diff --git a/config/docker_compose.toml b/config/docker_compose.toml index 2a4fe36928..2cd93eade0 100644 --- a/config/docker_compose.toml +++ b/config/docker_compose.toml @@ -18,7 +18,8 @@ traces_enabled = false # Whether traces are metrics_enabled = true # Whether metrics are enabled. ignore_errors = false # Whether to ignore errors during traces or metrics pipeline setup. otel_exporter_otlp_endpoint = "https://otel-collector:4317" # Endpoint to send metrics and traces to. -use_xray_generator = false +use_xray_generator = false # Set this to true for AWS X-ray compatible traces +bg_metrics_collection_interval_in_secs = 15 # Interval for collecting the metrics in background thread [master_database] username = "db_user" diff --git a/crates/router/src/bin/router.rs b/crates/router/src/bin/router.rs index 094f799cb2..8271e458b2 100644 --- a/crates/router/src/bin/router.rs +++ b/crates/router/src/bin/router.rs @@ -2,6 +2,7 @@ use router::{ configs::settings::{CmdLineConf, Settings}, core::errors::{ApplicationError, ApplicationResult}, logger, + routes::metrics, }; #[tokio::main] @@ -27,6 +28,11 @@ async fn main() -> ApplicationResult<()> { logger::info!("Application started [{:?}] [{:?}]", conf.server, conf.log); + // Spawn a thread for collecting metrics at fixed intervals + metrics::bg_metrics_collector::spawn_metrics_collector( + &conf.log.telemetry.bg_metrics_collection_interval_in_secs, + ); + #[allow(clippy::expect_used)] let server = Box::pin(router::start_server(conf)) .await diff --git a/crates/router/src/routes/metrics.rs b/crates/router/src/routes/metrics.rs index 1123be1a87..18014c6e19 100644 --- a/crates/router/src/routes/metrics.rs +++ b/crates/router/src/routes/metrics.rs @@ -1,4 +1,8 @@ -use router_env::{counter_metric, global_meter, histogram_metric, metrics_context}; +pub mod bg_metrics_collector; +pub mod request; +pub mod utils; + +use router_env::{counter_metric, gauge_metric, global_meter, histogram_metric, metrics_context}; metrics_context!(CONTEXT); global_meter!(GLOBAL_METER, "ROUTER_API"); @@ -133,5 +137,5 @@ counter_metric!(ACCESS_TOKEN_CACHE_HIT, GLOBAL_METER); // A counter to indicate the access token cache miss counter_metric!(ACCESS_TOKEN_CACHE_MISS, GLOBAL_METER); -pub mod request; -pub mod utils; +// Metrics for In-memory cache +gauge_metric!(CACHE_ENTRY_COUNT, GLOBAL_METER); diff --git a/crates/router/src/routes/metrics/bg_metrics_collector.rs b/crates/router/src/routes/metrics/bg_metrics_collector.rs new file mode 100644 index 0000000000..65cb7a13e5 --- /dev/null +++ b/crates/router/src/routes/metrics/bg_metrics_collector.rs @@ -0,0 +1,46 @@ +use storage_impl::redis::cache; + +const DEFAULT_BG_METRICS_COLLECTION_INTERVAL_IN_SECS: u16 = 15; + +macro_rules! gauge_metrics_for_imc { + ($($cache:ident),*) => { + $( + { + cache::$cache.run_pending_tasks().await; + + super::CACHE_ENTRY_COUNT.observe( + &super::CONTEXT, + cache::$cache.get_entry_count(), + &[super::request::add_attributes( + "cache_type", + stringify!($cache), + )], + ); + } + )* + }; +} + +pub fn spawn_metrics_collector(metrics_collection_interval_in_secs: &Option) { + let metrics_collection_interval = metrics_collection_interval_in_secs + .unwrap_or(DEFAULT_BG_METRICS_COLLECTION_INTERVAL_IN_SECS); + + tokio::spawn(async move { + loop { + gauge_metrics_for_imc!( + CONFIG_CACHE, + ACCOUNTS_CACHE, + ROUTING_CACHE, + CGRAPH_CACHE, + PM_FILTERS_CGRAPH_CACHE, + DECISION_MANAGER_CACHE, + SURCHARGE_CACHE + ); + + tokio::time::sleep(std::time::Duration::from_secs( + metrics_collection_interval.into(), + )) + .await + } + }); +} diff --git a/crates/router_env/src/logger/config.rs b/crates/router_env/src/logger/config.rs index 09d285a862..135451f266 100644 --- a/crates/router_env/src/logger/config.rs +++ b/crates/router_env/src/logger/config.rs @@ -103,6 +103,8 @@ pub struct LogTelemetry { pub use_xray_generator: bool, /// Route Based Tracing pub route_to_trace: Option>, + /// Interval for collecting the metrics (such as gauge) in background thread + pub bg_metrics_collection_interval_in_secs: Option, } /// Telemetry / tracing. diff --git a/crates/router_env/src/metrics.rs b/crates/router_env/src/metrics.rs index e145caace0..e75cacaa3c 100644 --- a/crates/router_env/src/metrics.rs +++ b/crates/router_env/src/metrics.rs @@ -101,3 +101,22 @@ macro_rules! histogram_metric_i64 { > = once_cell::sync::Lazy::new(|| $meter.i64_histogram($description).init()); }; } + +/// Create a [`ObservableGauge`][ObservableGauge] metric with the specified name and an optional description, +/// associated with the specified meter. Note that the meter must be to a valid [`Meter`][Meter]. +/// +/// [ObservableGauge]: opentelemetry::metrics::ObservableGauge +/// [Meter]: opentelemetry::metrics::Meter +#[macro_export] +macro_rules! gauge_metric { + ($name:ident, $meter:ident) => { + pub(crate) static $name: once_cell::sync::Lazy< + $crate::opentelemetry::metrics::ObservableGauge, + > = once_cell::sync::Lazy::new(|| $meter.u64_observable_gauge(stringify!($name)).init()); + }; + ($name:ident, $meter:ident, description:literal) => { + pub(crate) static $name: once_cell::sync::Lazy< + $crate::opentelemetry::metrics::ObservableGauge, + > = once_cell::sync::Lazy::new(|| $meter.u64_observable_gauge($description).init()); + }; +} diff --git a/crates/storage_impl/src/redis/cache.rs b/crates/storage_impl/src/redis/cache.rs index e4efab0da3..6ad96be279 100644 --- a/crates/storage_impl/src/redis/cache.rs +++ b/crates/storage_impl/src/redis/cache.rs @@ -207,6 +207,16 @@ impl Cache { pub async fn remove(&self, key: CacheKey) { self.inner.invalidate::(&key.into()).await; } + + /// Performs any pending maintenance operations needed by the cache. + pub async fn run_pending_tasks(&self) { + self.inner.run_pending_tasks().await; + } + + /// Returns an approximate number of entries in this cache. + pub fn get_entry_count(&self) -> u64 { + self.inner.entry_count() + } } #[instrument(skip_all)]