diff --git a/crates/router/src/routes/metrics.rs b/crates/router/src/routes/metrics.rs index c4d0420a9b..770ae9aafc 100644 --- a/crates/router/src/routes/metrics.rs +++ b/crates/router/src/routes/metrics.rs @@ -106,6 +106,7 @@ counter_metric!(APPLE_PAY_MANUAL_FLOW_FAILED_PAYMENT, GLOBAL_METER); counter_metric!(APPLE_PAY_SIMPLIFIED_FLOW_FAILED_PAYMENT, GLOBAL_METER); // Metrics for Auto Retries +counter_metric!(AUTO_RETRY_CONNECTION_CLOSED, GLOBAL_METER); counter_metric!(AUTO_RETRY_ELIGIBLE_REQUEST_COUNT, GLOBAL_METER); counter_metric!(AUTO_RETRY_GSM_MISS_COUNT, GLOBAL_METER); counter_metric!(AUTO_RETRY_GSM_FETCH_FAILURE_COUNT, GLOBAL_METER); diff --git a/crates/router/src/services/api.rs b/crates/router/src/services/api.rs index 13f57cafc6..5a6ff7e9b6 100644 --- a/crates/router/src/services/api.rs +++ b/crates/router/src/services/api.rs @@ -561,8 +561,7 @@ pub async fn send_request( key: consts::METRICS_HOST_TAG_NAME.into(), value: url.host_str().unwrap_or_default().to_string().into(), }; - - let send_request = async { + let request = { match request.method { Method::Get => client.get(url), Method::Post => { @@ -616,32 +615,92 @@ pub async fn send_request( .timeout(Duration::from_secs( option_timeout_secs.unwrap_or(crate::consts::REQUEST_TIME_OUT), )) - .send() - .await - .map_err(|error| match error { - error if error.is_timeout() => { - metrics::REQUEST_BUILD_FAILURE.add(&metrics::CONTEXT, 1, &[]); - errors::ApiClientError::RequestTimeoutReceived - } - error if is_connection_closed(&error) => { - metrics::REQUEST_BUILD_FAILURE.add(&metrics::CONTEXT, 1, &[]); - errors::ApiClientError::ConnectionClosed - } - _ => errors::ApiClientError::RequestNotSent(error.to_string()), - }) - .into_report() - .attach_printable("Unable to send request to connector") }; - metrics_request::record_operation_time( + // We cannot clone the request type, because it has Form trait which is not clonable. So we are cloning the request builder here. + let cloned_send_request = request.try_clone().map(|cloned_request| async { + cloned_request + .send() + .await + .map_err(|error| match error { + error if error.is_timeout() => { + metrics::REQUEST_BUILD_FAILURE.add(&metrics::CONTEXT, 1, &[]); + errors::ApiClientError::RequestTimeoutReceived + } + error if is_connection_closed_before_message_could_complete(&error) => { + metrics::REQUEST_BUILD_FAILURE.add(&metrics::CONTEXT, 1, &[]); + errors::ApiClientError::ConnectionClosedIncompleteMessage + } + _ => errors::ApiClientError::RequestNotSent(error.to_string()), + }) + .into_report() + .attach_printable("Unable to send request to connector") + }); + + let send_request = async { + request + .send() + .await + .map_err(|error| match error { + error if error.is_timeout() => { + metrics::REQUEST_BUILD_FAILURE.add(&metrics::CONTEXT, 1, &[]); + errors::ApiClientError::RequestTimeoutReceived + } + error if is_connection_closed_before_message_could_complete(&error) => { + metrics::REQUEST_BUILD_FAILURE.add(&metrics::CONTEXT, 1, &[]); + errors::ApiClientError::ConnectionClosedIncompleteMessage + } + _ => errors::ApiClientError::RequestNotSent(error.to_string()), + }) + .into_report() + .attach_printable("Unable to send request to connector") + }; + + let response = metrics_request::record_operation_time( send_request, &metrics::EXTERNAL_REQUEST_TIME, - &[metrics_tag], + &[metrics_tag.clone()], ) - .await + .await; + // Retry once if the response is connection closed. + // + // This is just due to the racy nature of networking. + // hyper has a connection pool of idle connections, and it selected one to send your request. + // Most of the time, hyper will receive the server’s FIN and drop the dead connection from its pool. + // But occasionally, a connection will be selected from the pool + // and written to at the same time the server is deciding to close the connection. + // Since hyper already wrote some of the request, + // it can’t really retry it automatically on a new connection, since the server may have acted already + match response { + Ok(response) => Ok(response), + Err(error) + if error.current_context() + == &errors::ApiClientError::ConnectionClosedIncompleteMessage => + { + metrics::AUTO_RETRY_CONNECTION_CLOSED.add(&metrics::CONTEXT, 1, &[]); + match cloned_send_request { + Some(cloned_request) => { + logger::info!( + "Retrying request due to connection closed before message could complete" + ); + metrics_request::record_operation_time( + cloned_request, + &metrics::EXTERNAL_REQUEST_TIME, + &[metrics_tag], + ) + .await + } + None => { + logger::info!("Retrying request due to connection closed before message could complete failed as request is not clonable"); + Err(error) + } + } + } + err @ Err(_) => err, + } } -fn is_connection_closed(error: &reqwest::Error) -> bool { +fn is_connection_closed_before_message_could_complete(error: &reqwest::Error) -> bool { let mut source = error.source(); while let Some(err) = source { if let Some(hyper_err) = err.downcast_ref::() { diff --git a/crates/storage_impl/src/errors.rs b/crates/storage_impl/src/errors.rs index 2adcdcf8d2..14d1eb2db6 100644 --- a/crates/storage_impl/src/errors.rs +++ b/crates/storage_impl/src/errors.rs @@ -267,7 +267,7 @@ pub enum ApiClientError { RequestTimeoutReceived, #[error("connection closed before a message could complete")] - ConnectionClosed, + ConnectionClosedIncompleteMessage, #[error("Server responded with Internal Server Error")] InternalServerErrorReceived, @@ -285,8 +285,8 @@ impl ApiClientError { pub fn is_upstream_timeout(&self) -> bool { self == &Self::RequestTimeoutReceived } - pub fn is_connection_closed(&self) -> bool { - self == &Self::ConnectionClosed + pub fn is_connection_closed_before_message_could_complete(&self) -> bool { + self == &Self::ConnectionClosedIncompleteMessage } }