From 8d28cb181ee6835901f239d88bde365168dbf95e Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 30 Jun 2026 18:55:59 +0200 Subject: [PATCH 1/7] feat(common): fold rate-limit suppressed count into the emitted log line The rate-limited tracing macros emitted a separate "suppressed N similar log messages" line before the next allowed line. Attach the count to that line instead, as a `suppressed_in_last_min` field. This makes the suppression rate visible inline on the message it belongs to and removes a distinct log pattern, slightly lowering volume. --- quickwit/quickwit-common/src/rate_limited_tracing.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/quickwit/quickwit-common/src/rate_limited_tracing.rs b/quickwit/quickwit-common/src/rate_limited_tracing.rs index 03e76fbb331..5e8b6cdc160 100644 --- a/quickwit/quickwit-common/src/rate_limited_tracing.rs +++ b/quickwit/quickwit-common/src/rate_limited_tracing.rs @@ -23,7 +23,8 @@ use coarsetime::{Duration, Instant}; pub enum ShouldLog { /// Emit the log normally, within the rate limit. Yes, - /// Emit the log; `N` similar messages were suppressed since the last emission. + /// Emit the log, annotated with a `suppressed_in_last_min = N` field recording how many + /// similar messages were suppressed since the last emission. YesAfterSuppression(u32), /// Suppressed — do not emit. No, @@ -165,8 +166,9 @@ macro_rules! rate_limited_tracing { ::tracing::$log_fn!($($args)*); } $crate::rate_limited_tracing::ShouldLog::YesAfterSuppression(skipped) => { - ::tracing::$log_fn!("suppressed {skipped} similar log messages in the last minute"); - ::tracing::$log_fn!($($args)*); + // Attach the count of messages suppressed in the last minute as a field on the + // emitted line, rather than as a separate preceding log line. + ::tracing::$log_fn!(suppressed_in_last_min = skipped, $($args)*); } } }}; From 57e56a061d2b0c797f558a06c997bd6f94d7a10b Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 30 Jun 2026 18:56:09 +0200 Subject: [PATCH 2/7] Reduce default log volume by reclassifying verbose INFO logs (CLOUDPREM-759) A fleet-wide extract showed ~1.07B log lines, dominated by a single per-gossip-tick INFO (~907M, ~85%). Reclassify high-frequency operational logs to cut default-level volume by ~20x while preserving actionable signal: - INFO -> DEBUG for routine per-operation chatter with no liveness value: node pool-add (x4), send-to-index-serializer, spawning pipeline, merge schedule/download, leaf split-finished/offsets, offload-to-lambda, resetting pipeline, adding shards assignment. - INFO -> rate-limited INFO (1/min) for logs worth a heartbeat: new-split, actor-exit (success), assigning shards, env-var defaults, truncated-shard. - ERROR -> rate-limited ERROR (1/min) for the recurring lambda invocation failure, keeping it visible without flooding. Error/failure branches are untouched (e.g. actor-exit failure stays at ERROR). Stage/publish, merge-completion, and cluster lifecycle stay at INFO. --- quickwit/quickwit-actors/src/spawn_builder.rs | 4 ++-- quickwit/quickwit-common/src/lib.rs | 4 ++-- quickwit/quickwit-indexing/src/actors/indexer.rs | 7 ++++--- .../quickwit-indexing/src/actors/indexing_pipeline.rs | 5 +++-- quickwit/quickwit-indexing/src/actors/merge_planner.rs | 4 ++-- .../src/actors/merge_split_downloader.rs | 4 ++-- quickwit/quickwit-indexing/src/source/ingest/mod.rs | 4 ++-- quickwit/quickwit-ingest/src/ingest_v2/state.rs | 3 ++- quickwit/quickwit-search/src/leaf.rs | 9 +++++---- quickwit/quickwit-serve/src/lib.rs | 8 ++++---- 10 files changed, 28 insertions(+), 24 deletions(-) diff --git a/quickwit/quickwit-actors/src/spawn_builder.rs b/quickwit/quickwit-actors/src/spawn_builder.rs index 922cfc4d71d..9a94da7c633 100644 --- a/quickwit/quickwit-actors/src/spawn_builder.rs +++ b/quickwit/quickwit-actors/src/spawn_builder.rs @@ -19,7 +19,7 @@ use anyhow::Context; use quickwit_metrics::Counter; use sync_wrapper::SyncWrapper; use tokio::sync::watch; -use tracing::{debug, error, info}; +use tracing::{debug, error}; use crate::envelope::Envelope; use crate::mailbox::{Inbox, create_mailbox}; @@ -406,7 +406,7 @@ async fn actor_loop( | ActorExitStatus::Quit | ActorExitStatus::DownstreamClosed | ActorExitStatus::Killed => { - info!(actor_id, phase = ?exit_phase, exit_status = ?after_process_exit_status, "actor-exit"); + quickwit_common::rate_limited_info!(limit_per_min = 1, actor_id, phase = ?exit_phase, exit_status = ?after_process_exit_status, "actor-exit"); } ActorExitStatus::Failure(_) | ActorExitStatus::Panicked => { error!(actor_id, phase = ?exit_phase, exit_status = ?after_process_exit_status, "actor-exit"); diff --git a/quickwit/quickwit-common/src/lib.rs b/quickwit/quickwit-common/src/lib.rs index 50dcdeec846..503c4e46aa2 100644 --- a/quickwit/quickwit-common/src/lib.rs +++ b/quickwit/quickwit-common/src/lib.rs @@ -134,7 +134,7 @@ pub fn get_from_env(key: &str, default_value: T, sensitive: if let Some(value) = get_from_env_opt(key, sensitive) { value } else { - info!(default_value=?default_value, "using environment variable `{key}` default value"); + crate::rate_limited_info!(limit_per_min = 1, default_value=?default_value, "using environment variable `{key}` default value"); default_value } } @@ -151,7 +151,7 @@ pub fn get_bool_from_env(key: &str, default_value: bool) -> bool { if let Some(flag_value) = get_bool_from_env_opt(key) { flag_value } else { - info!(default_value=%default_value, "using environment variable `{key}` default value"); + crate::rate_limited_info!(limit_per_min = 1, default_value=%default_value, "using environment variable `{key}` default value"); default_value } } diff --git a/quickwit/quickwit-indexing/src/actors/indexer.rs b/quickwit/quickwit-indexing/src/actors/indexer.rs index da36d6b7994..5c1b70f586e 100644 --- a/quickwit/quickwit-indexing/src/actors/indexer.rs +++ b/quickwit/quickwit-indexing/src/actors/indexer.rs @@ -47,7 +47,7 @@ use tantivy::tokenizer::TokenizerManager; use tantivy::{DateTime, IndexBuilder, IndexSettings}; use tokio::runtime::Handle; use tokio::sync::Semaphore; -use tracing::{Span, info, info_span, warn}; +use tracing::{Span, debug, info_span, warn}; use ulid::Ulid; use super::IndexSerializer; @@ -133,7 +133,8 @@ impl IndexerState { index_builder, io_controls, )?; - info!( + quickwit_common::rate_limited_info!( + limit_per_min = 1, split_id=%indexed_split.split_id(), partition_id=%partition_id, "new-split" @@ -666,7 +667,7 @@ impl Indexer { } let num_splits = splits.len() as u64; let split_ids = splits.iter().map(|split| split.split_id()).join(","); - info!( + debug!( index=%self.indexer_state.pipeline_id.index_uid, source=self.indexer_state.pipeline_id.source_id.as_str(), pipeline_uid=%self.indexer_state.pipeline_id.pipeline_uid, diff --git a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs index 9d6287245ac..5ba3984fee8 100644 --- a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs @@ -288,7 +288,7 @@ impl IndexingPipeline { let index_id = &self.params.pipeline_id.index_uid.index_id; let source_id = &self.params.pipeline_id.source_id; - info!( + debug!( index_id, source_id, pipeline_uid=%self.params.pipeline_id.pipeline_uid, @@ -499,7 +499,8 @@ impl Handler for IndexingPipeline { // If the pipeline is running, we forward the message to its source. // If it is not, it will be respawned soon, and the shards will be assigned afterward. if let Some(handles) = &self.handles_opt { - info!( + quickwit_common::rate_limited_info!( + limit_per_min = 1, shard_ids=?assign_shards_message.0.shard_ids, "assigning shards to indexing pipeline" ); diff --git a/quickwit/quickwit-indexing/src/actors/merge_planner.rs b/quickwit/quickwit-indexing/src/actors/merge_planner.rs index 651e38fc8b3..6851b1a6c87 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_planner.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_planner.rs @@ -23,7 +23,7 @@ use quickwit_proto::indexing::MergePipelineId; use quickwit_proto::types::DocMappingUid; use tantivy::Inventory; use time::OffsetDateTime; -use tracing::{info, warn}; +use tracing::{debug, warn}; use super::merge_scheduler_service::schedule_merge; use super::{MergeSchedulerService, MergeSplitDownloader}; @@ -331,7 +331,7 @@ impl MergePlanner { // index as well. let merge_ops = self.compute_merge_ops(is_finalize, ctx).await?; for merge_operation in merge_ops { - info!(merge_operation=?merge_operation, "schedule merge operation"); + debug!(merge_operation=?merge_operation, "schedule merge operation"); let tracked_merge_operation = self .ongoing_merge_operations_inventory .track(merge_operation); diff --git a/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs b/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs index 5d68bb59285..71d23da2100 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs @@ -20,7 +20,7 @@ use quickwit_common::io::IoControls; use quickwit_common::temp_dir::{self, TempDirectory}; use quickwit_metastore::SplitMetadata; use tantivy::Directory; -use tracing::{debug, info, instrument}; +use tracing::{debug, instrument}; use super::MergeExecutor; use crate::merge_policy::MergeTask; @@ -66,7 +66,7 @@ impl Handler for MergeSplitDownloader { .join("merge") .tempdir_in(self.scratch_directory.path()) .map_err(|error| anyhow::anyhow!(error))?; - info!(dir=%merge_scratch_directory.path().display(), "download-merge-splits"); + debug!(dir=%merge_scratch_directory.path().display(), "download-merge-splits"); let downloaded_splits_directory = temp_dir::Builder::default() .join("downloaded-splits") .tempdir_in(merge_scratch_directory.path()) diff --git a/quickwit/quickwit-indexing/src/source/ingest/mod.rs b/quickwit/quickwit-indexing/src/source/ingest/mod.rs index cbf3d70d976..a8f1aed8d24 100644 --- a/quickwit/quickwit-indexing/src/source/ingest/mod.rs +++ b/quickwit/quickwit-indexing/src/source/ingest/mod.rs @@ -430,7 +430,7 @@ impl IngestSource { }); return Ok(()); } - info!( + debug!( index_uid=%self.client_id.source_uid.index_uid, pipeline_uid=%self.client_id.pipeline_uid, "resetting indexing pipeline" @@ -531,7 +531,7 @@ impl Source for IngestSource { .collect(); assert!(!added_shard_ids.is_empty()); - info!(added_shards=?added_shard_ids, "adding shards assignment"); + debug!(added_shards=?added_shard_ids, "adding shards assignment"); let acquire_shards_request = AcquireShardsRequest { index_uid: Some(self.client_id.source_uid.index_uid.clone()), diff --git a/quickwit/quickwit-ingest/src/ingest_v2/state.rs b/quickwit/quickwit-ingest/src/ingest_v2/state.rs index 4e92d794c4a..07ba8bae016 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/state.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/state.rs @@ -598,7 +598,8 @@ impl FullyLockedIngesterState<'_> { } } } - info!( + quickwit_common::rate_limited_info!( + limit_per_min = 1, "truncated shard `{queue_id}` at {truncate_up_to_position_inclusive} initiated via \ `{initiator}`" ); diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 16e435fb61f..deddb49457d 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -1663,7 +1663,7 @@ async fn run_offloaded_search_tasks( return Ok(()); } - info!( + debug!( num_offloaded_splits = splits_with_requests.len(), "offloading to lambda" ); @@ -1773,7 +1773,8 @@ async fn run_offloaded_search_tasks( Err(err) => { // Transport-level failure: the Lambda invocation itself failed. // Mark all splits in this batch as failed. - error!( + quickwit_common::rate_limited_error!( + limit_per_min = 1, error = %err, num_splits = batch_split_ids.len(), "lambda invocation failed for batch" @@ -1889,7 +1890,7 @@ pub async fn single_doc_mapping_leaf_search( ) -> Result { let num_docs: u64 = splits.iter().map(|split| split.num_docs).sum(); let num_splits = splits.len(); - info!(num_docs, num_splits, split_offsets = ?PrettySample::new(&splits, 5)); + debug!(num_docs, num_splits, split_offsets = ?PrettySample::new(&splits, 5)); // We simplify the request as much as possible. let split_filter: CanSplitDoBetter = @@ -2076,7 +2077,7 @@ async fn run_local_search_tasks( }); } - info!(split_outcome_counters=%leaf_search_context.split_outcome_counters, "leaf split search finished"); + debug!(split_outcome_counters=%leaf_search_context.split_outcome_counters, "leaf split search finished"); } /// We identify the splits that are in the cache and append them to the incremental merge collector. diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs index a5c34ebb961..58794b8775c 100644 --- a/quickwit/quickwit-serve/src/lib.rs +++ b/quickwit/quickwit-serve/src/lib.rs @@ -239,7 +239,7 @@ async fn balance_channel_for_service( match cluster_change { ClusterChange::Add(node) if node.is_service_enabled(service) => { let chitchat_id = node.chitchat_id(); - info!( + debug!( node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "adding node `{}` to {} pool", @@ -1120,7 +1120,7 @@ fn build_ingester_insert_change( grpc_compression_encoding_opt: Option, ) -> Change { let chitchat_id = node.chitchat_id(); - info!( + debug!( node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "adding/updating node `{}` with ingester status `{}` to ingester pool", @@ -1208,7 +1208,7 @@ async fn setup_searcher( match cluster_change { ClusterChange::Add(node) if node.is_searcher() => { let chitchat_id = node.chitchat_id(); - info!( + debug!( node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "adding node `{}` to searcher pool", @@ -1354,7 +1354,7 @@ fn build_indexer_insert_change( grpc_max_message_size: ByteSize, ) -> Change { let chitchat_id = node.chitchat_id(); - info!( + debug!( node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "adding node `{}` with ingester status `{}` to indexer pool", From f3617339787d588d3c9ada3651fe03e402633299 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 30 Jun 2026 19:19:11 +0200 Subject: [PATCH 3/7] Log split count, doc count, and size on publish The publish-new-splits log carried no fields. Add num_splits, num_docs, and total on-disk split size to give operators visibility into publish throughput and split sizing without adding a new log line. num_splits is >1 only for partitioned sources (a single commit produces one split per partition); merges publish a single output split, with the merged inputs recorded in replaced_split_ids. --- .../src/actors/log_publisher_impl.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/quickwit/quickwit-indexing/src/actors/log_publisher_impl.rs b/quickwit/quickwit-indexing/src/actors/log_publisher_impl.rs index 02a4bca1661..f4bdb86b541 100644 --- a/quickwit/quickwit-indexing/src/actors/log_publisher_impl.rs +++ b/quickwit/quickwit-indexing/src/actors/log_publisher_impl.rs @@ -98,7 +98,18 @@ impl Handler for Publisher { ); return Ok(()); } - info!("publish-new-splits"); + let num_docs: usize = new_splits.iter().map(|split| split.num_docs).sum(); + // `footer_offsets.end` is the on-disk size of the split file in bytes. + let split_size_bytes: u64 = new_splits + .iter() + .map(|split| split.footer_offsets.end) + .sum(); + info!( + num_splits = new_splits.len(), + num_docs, + split_size = %bytesize::ByteSize(split_size_bytes), + "publish-new-splits" + ); suggest_truncate(ctx, &self.source_mailbox_opt, checkpoint_delta_opt).await; if !new_splits.is_empty() { From da0544fe485aac927d3a035168305e434621f665 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 30 Jun 2026 20:24:38 +0200 Subject: [PATCH 4/7] Keep `offloading to lambda` at INFO This is a low-volume (~80K), operationally meaningful event: the searcher spilling search work to Lambda. It carries capacity/cost signal and pairs with the lambda invocation error we deliberately keep visible. Demoting it to DEBUG bought no real volume reduction, so revert it to INFO. --- quickwit/quickwit-search/src/leaf.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index deddb49457d..2129b481c11 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -1663,7 +1663,7 @@ async fn run_offloaded_search_tasks( return Ok(()); } - debug!( + info!( num_offloaded_splits = splits_with_requests.len(), "offloading to lambda" ); From 546bfb72b1227b43833a26246bdb4171f1469108 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 30 Jun 2026 20:26:56 +0200 Subject: [PATCH 5/7] Rate-limit ingester/searcher/generic pool-add logs at INFO Keep visibility into ingester, searcher, and generic-service pool membership at INFO but cap each to 1/min. These are far lower volume than the indexer pool-add (~907M), which stays at DEBUG. --- quickwit/quickwit-serve/src/lib.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs index 58794b8775c..1b63093bd21 100644 --- a/quickwit/quickwit-serve/src/lib.rs +++ b/quickwit/quickwit-serve/src/lib.rs @@ -239,7 +239,8 @@ async fn balance_channel_for_service( match cluster_change { ClusterChange::Add(node) if node.is_service_enabled(service) => { let chitchat_id = node.chitchat_id(); - debug!( + quickwit_common::rate_limited_info!( + limit_per_min = 1, node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "adding node `{}` to {} pool", @@ -1120,7 +1121,8 @@ fn build_ingester_insert_change( grpc_compression_encoding_opt: Option, ) -> Change { let chitchat_id = node.chitchat_id(); - debug!( + quickwit_common::rate_limited_info!( + limit_per_min = 1, node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "adding/updating node `{}` with ingester status `{}` to ingester pool", @@ -1208,7 +1210,8 @@ async fn setup_searcher( match cluster_change { ClusterChange::Add(node) if node.is_searcher() => { let chitchat_id = node.chitchat_id(); - debug!( + quickwit_common::rate_limited_info!( + limit_per_min = 1, node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "adding node `{}` to searcher pool", From d2a9282ac3dda300171970a3560c22c49925447a Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 30 Jun 2026 20:29:55 +0200 Subject: [PATCH 6/7] Rate-limit indexer pool-add at INFO; rename suppressed field to num_suppressed Rate-limit the indexer pool-add log (~907M, the dominant pattern) at INFO 1/min instead of demoting to DEBUG. At 1/min this collapses to ~1.4K/day -- the same volume win as DEBUG -- while keeping pool membership visible at INFO, consistent with the other three pool-add logs. Rename the rate-limit suppressed-count field from `suppressed_in_last_min` to `num_suppressed`: the count is messages suppressed since the call site last emitted, and since the window only resets on the next call it can span more than a minute, so the old name was misleading. --- quickwit/quickwit-common/src/rate_limited_tracing.rs | 10 +++++----- quickwit/quickwit-serve/src/lib.rs | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/quickwit/quickwit-common/src/rate_limited_tracing.rs b/quickwit/quickwit-common/src/rate_limited_tracing.rs index 5e8b6cdc160..97c2b5ca39b 100644 --- a/quickwit/quickwit-common/src/rate_limited_tracing.rs +++ b/quickwit/quickwit-common/src/rate_limited_tracing.rs @@ -23,8 +23,8 @@ use coarsetime::{Duration, Instant}; pub enum ShouldLog { /// Emit the log normally, within the rate limit. Yes, - /// Emit the log, annotated with a `suppressed_in_last_min = N` field recording how many - /// similar messages were suppressed since the last emission. + /// Emit the log, annotated with a `num_suppressed = N` field recording how many similar + /// messages were suppressed since the last emission of this call site. YesAfterSuppression(u32), /// Suppressed — do not emit. No, @@ -166,9 +166,9 @@ macro_rules! rate_limited_tracing { ::tracing::$log_fn!($($args)*); } $crate::rate_limited_tracing::ShouldLog::YesAfterSuppression(skipped) => { - // Attach the count of messages suppressed in the last minute as a field on the - // emitted line, rather than as a separate preceding log line. - ::tracing::$log_fn!(suppressed_in_last_min = skipped, $($args)*); + // Attach the count of messages suppressed since this call site last emitted as a + // field on the emitted line, rather than as a separate preceding log line. + ::tracing::$log_fn!(num_suppressed = skipped, $($args)*); } } }}; diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs index 1b63093bd21..76f0c7acd87 100644 --- a/quickwit/quickwit-serve/src/lib.rs +++ b/quickwit/quickwit-serve/src/lib.rs @@ -1357,7 +1357,8 @@ fn build_indexer_insert_change( grpc_max_message_size: ByteSize, ) -> Change { let chitchat_id = node.chitchat_id(); - debug!( + quickwit_common::rate_limited_info!( + limit_per_min = 1, node_id = %chitchat_id.node_id, generation_id = chitchat_id.generation_id, "adding node `{}` with ingester status `{}` to indexer pool", From 054d3a0a32864ecd5dab8ffaca90c51e331b5859 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 1 Jul 2026 15:06:36 +0200 Subject: [PATCH 7/7] Cache env-var reads once per process instead of rate-limiting the log The "using environment variable ... default value" INFO lines flooded because callers like PostgresqlMetastore::new read the same variables on every (re)construction, and the metastore is rebuilt frequently. Rate- limiting the shared log statement (previous approach) funneled every key through one call-site counter, so within a burst only the first key was ever logged and the rest were deterministically dropped. Instead, add per-call-site caching macros in quickwit-common: - get_from_env_cached!(ty, key, default, sensitive) - get_bool_from_env_cached!(key, default) Each expands to a block-local `static LazyLock`, so the read + log fire exactly once per call site per process, with no cross-key contention (distinct call sites get distinct statics). Revert lib.rs env logging back to plain info!. Convert the non-lazy hot readers (metastore QW_POSTGRES_*, node_config OTLP/JAEGER, CORS debug) and fold the existing hand-rolled LazyLock env caches (ingest v2 enable/disable, batch bytes, per-index metrics, doc validation, load estimation, field list limit, default load per shard) onto the same macro for consistency. Also drop the rate-limit on the actor-exit success log (back to info!). --- quickwit/quickwit-actors/src/spawn_builder.rs | 4 +- quickwit/quickwit-common/src/lib.rs | 47 ++++++++++++++++++- quickwit/quickwit-common/src/metrics.rs | 8 ++-- quickwit/quickwit-config/src/lib.rs | 9 +--- .../quickwit-config/src/node_config/mod.rs | 4 +- .../src/indexing_scheduler/mod.rs | 16 +++---- .../src/ingest_v2/doc_mapper.rs | 7 +-- .../quickwit-ingest/src/ingest_v2/ingester.rs | 12 +++-- .../src/metastore/postgres/metastore.rs | 12 +++-- .../quickwit-search/src/list_fields/mod.rs | 10 ++-- .../quickwit-search/src/search_job_placer.rs | 10 ++-- quickwit/quickwit-serve/src/rest.rs | 2 +- 12 files changed, 89 insertions(+), 52 deletions(-) diff --git a/quickwit/quickwit-actors/src/spawn_builder.rs b/quickwit/quickwit-actors/src/spawn_builder.rs index 9a94da7c633..922cfc4d71d 100644 --- a/quickwit/quickwit-actors/src/spawn_builder.rs +++ b/quickwit/quickwit-actors/src/spawn_builder.rs @@ -19,7 +19,7 @@ use anyhow::Context; use quickwit_metrics::Counter; use sync_wrapper::SyncWrapper; use tokio::sync::watch; -use tracing::{debug, error}; +use tracing::{debug, error, info}; use crate::envelope::Envelope; use crate::mailbox::{Inbox, create_mailbox}; @@ -406,7 +406,7 @@ async fn actor_loop( | ActorExitStatus::Quit | ActorExitStatus::DownstreamClosed | ActorExitStatus::Killed => { - quickwit_common::rate_limited_info!(limit_per_min = 1, actor_id, phase = ?exit_phase, exit_status = ?after_process_exit_status, "actor-exit"); + info!(actor_id, phase = ?exit_phase, exit_status = ?after_process_exit_status, "actor-exit"); } ActorExitStatus::Failure(_) | ActorExitStatus::Panicked => { error!(actor_id, phase = ?exit_phase, exit_status = ?after_process_exit_status, "actor-exit"); diff --git a/quickwit/quickwit-common/src/lib.rs b/quickwit/quickwit-common/src/lib.rs index 503c4e46aa2..aeb5f92051b 100644 --- a/quickwit/quickwit-common/src/lib.rs +++ b/quickwit/quickwit-common/src/lib.rs @@ -134,7 +134,7 @@ pub fn get_from_env(key: &str, default_value: T, sensitive: if let Some(value) = get_from_env_opt(key, sensitive) { value } else { - crate::rate_limited_info!(limit_per_min = 1, default_value=?default_value, "using environment variable `{key}` default value"); + info!(default_value=?default_value, "using environment variable `{key}` default value"); default_value } } @@ -151,11 +151,54 @@ pub fn get_bool_from_env(key: &str, default_value: bool) -> bool { if let Some(flag_value) = get_bool_from_env_opt(key) { flag_value } else { - crate::rate_limited_info!(limit_per_min = 1, default_value=%default_value, "using environment variable `{key}` default value"); + info!(default_value=%default_value, "using environment variable `{key}` default value"); default_value } } +/// Reads and parses an environment variable exactly once, caching the result for the lifetime of +/// the process via a per-call-site [`std::sync::LazyLock`]. +/// +/// Prefer this over [`get_from_env`] on paths that may run repeatedly (e.g. constructors that are +/// re-invoked): it avoids re-reading and, more importantly, re-logging the same variable on every +/// call. +/// +/// The type is required because the backing `static` needs a concrete type. The key and default +/// must be `const` expressions or literals — a `static` initializer cannot capture locals. +/// +/// ```no_run +/// # use quickwit_common::get_from_env_cached; +/// let max_concurrency: usize = get_from_env_cached!(usize, "QW_S3_MAX_CONCURRENCY", 10_000, false); +/// ``` +#[macro_export] +macro_rules! get_from_env_cached { + ($ty:ty, $key:expr, $default:expr, $sensitive:expr $(,)?) => {{ + static CACHED: ::std::sync::LazyLock<$ty> = + ::std::sync::LazyLock::new(|| $crate::get_from_env::<$ty>($key, $default, $sensitive)); + // `LazyLock` derefs to `T`; clone so callers receive an owned value, matching + // `get_from_env`'s return type (a no-op copy for the common `Copy` cases). + #[allow(clippy::clone_on_copy)] + let value = (*CACHED).clone(); + value + }}; +} + +/// Boolean counterpart of [`get_from_env_cached`], using the same lenient parsing as +/// [`get_bool_from_env`]. See that macro for the caching semantics and constraints. +/// +/// ```no_run +/// # use quickwit_common::get_bool_from_env_cached; +/// let cors_debug: bool = get_bool_from_env_cached!("QW_ENABLE_CORS_DEBUG", false); +/// ``` +#[macro_export] +macro_rules! get_bool_from_env_cached { + ($key:expr, $default:expr $(,)?) => {{ + static CACHED: ::std::sync::LazyLock = + ::std::sync::LazyLock::new(|| $crate::get_bool_from_env($key, $default)); + *CACHED + }}; +} + pub fn truncate_str(text: &str, max_len: usize) -> &str { if max_len > text.len() { return text; diff --git a/quickwit/quickwit-common/src/metrics.rs b/quickwit/quickwit-common/src/metrics.rs index db5eccfc6da..12ea89cff96 100644 --- a/quickwit/quickwit-common/src/metrics.rs +++ b/quickwit/quickwit-common/src/metrics.rs @@ -12,16 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::LazyLock; - pub use prometheus::{exponential_buckets, linear_buckets}; use quickwit_metrics::{Gauge, LazyCounter, LazyGauge, gauge, lazy_counter, lazy_gauge}; pub fn index_label(index_id: &str) -> &str { - static PER_INDEX_METRICS_ENABLED: LazyLock = - LazyLock::new(|| !crate::get_bool_from_env("QW_DISABLE_PER_INDEX_METRICS", false)); + let per_index_metrics_enabled = + !crate::get_bool_from_env_cached!("QW_DISABLE_PER_INDEX_METRICS", false); - if *PER_INDEX_METRICS_ENABLED { + if per_index_metrics_enabled { index_id } else { "__any__" diff --git a/quickwit/quickwit-config/src/lib.rs b/quickwit/quickwit-config/src/lib.rs index 8f2bfc46a4a..43693ba1f66 100644 --- a/quickwit/quickwit-config/src/lib.rs +++ b/quickwit/quickwit-config/src/lib.rs @@ -20,7 +20,6 @@ use std::sync::LazyLock; use anyhow::{Context, bail, ensure}; use json_comments::StripComments; -use quickwit_common::get_bool_from_env; use quickwit_common::net::is_valid_hostname; use quickwit_common::uri::Uri; use quickwit_proto::types::NodeIdRef; @@ -88,16 +87,12 @@ pub use crate::storage_config::{ /// Returns true if the ingest API v2 is enabled. pub fn enable_ingest_v2() -> bool { - static ENABLE_INGEST_V2: LazyLock = - LazyLock::new(|| get_bool_from_env("QW_ENABLE_INGEST_V2", true)); - *ENABLE_INGEST_V2 + quickwit_common::get_bool_from_env_cached!("QW_ENABLE_INGEST_V2", true) } /// Returns true if the ingest API v1 is disabled. pub fn disable_ingest_v1() -> bool { - static DISABLE_INGEST_V1: LazyLock = - LazyLock::new(|| get_bool_from_env("QW_DISABLE_INGEST_V1", false)); - *DISABLE_INGEST_V1 + quickwit_common::get_bool_from_env_cached!("QW_DISABLE_INGEST_V1", false) } #[derive(utoipa::OpenApi)] diff --git a/quickwit/quickwit-config/src/node_config/mod.rs b/quickwit/quickwit-config/src/node_config/mod.rs index 1c60891cf58..c368d5240ad 100644 --- a/quickwit/quickwit-config/src/node_config/mod.rs +++ b/quickwit/quickwit-config/src/node_config/mod.rs @@ -214,7 +214,7 @@ impl IndexerConfig { } #[cfg(not(any(test, feature = "testsuite")))] { - quickwit_common::get_bool_from_env("QW_ENABLE_OTLP_ENDPOINT", true) + quickwit_common::get_bool_from_env_cached!("QW_ENABLE_OTLP_ENDPOINT", true) } } @@ -770,7 +770,7 @@ impl JaegerConfig { } #[cfg(not(any(test, feature = "testsuite")))] { - quickwit_common::get_bool_from_env("QW_ENABLE_JAEGER_ENDPOINT", true) + quickwit_common::get_bool_from_env_cached!("QW_ENABLE_JAEGER_ENDPOINT", true) } } diff --git a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs index f67a5462264..097ec4469ca 100644 --- a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs +++ b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs @@ -189,15 +189,13 @@ fn compute_load_per_shard(shard_entries: &[&ShardEntry]) -> NonZeroU32 { } fn get_default_load_per_shard() -> NonZeroU32 { - static DEFAULT_LOAD_PER_SHARD: LazyLock = LazyLock::new(|| { - let default_load_per_shard = quickwit_common::get_from_env( - "QW_DEFAULT_LOAD_PER_SHARD", - PIPELINE_FULL_CAPACITY.cpu_millis() / 4, - false, - ); - NonZeroU32::new(default_load_per_shard).unwrap() - }); - *DEFAULT_LOAD_PER_SHARD + let default_load_per_shard = quickwit_common::get_from_env_cached!( + u32, + "QW_DEFAULT_LOAD_PER_SHARD", + PIPELINE_FULL_CAPACITY.cpu_millis() / 4, + false + ); + NonZeroU32::new(default_load_per_shard).unwrap() } fn get_sources_to_schedule(model: &ControlPlaneModel) -> Vec { diff --git a/quickwit/quickwit-ingest/src/ingest_v2/doc_mapper.rs b/quickwit/quickwit-ingest/src/ingest_v2/doc_mapper.rs index e221602bce6..069dfd22ced 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/doc_mapper.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/doc_mapper.rs @@ -14,7 +14,7 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; -use std::sync::{Arc, LazyLock, Weak}; +use std::sync::{Arc, Weak}; use quickwit_common::rate_limited_error; use quickwit_common::thread_pool::run_cpu_intensive; @@ -90,10 +90,7 @@ pub(super) async fn validate_doc_batch( } fn is_document_validation_enabled() -> bool { - static IS_DOCUMENT_VALIDATION_ENABLED: LazyLock = LazyLock::new(|| { - !quickwit_common::get_bool_from_env("QW_DISABLE_DOCUMENT_VALIDATION", false) - }); - *IS_DOCUMENT_VALIDATION_ENABLED + !quickwit_common::get_bool_from_env_cached!("QW_DISABLE_DOCUMENT_VALIDATION", false) } #[instrument(name = "ingester.validate_doc_batch", skip_all, fields(num_docs = doc_batch.num_docs(), num_bytes = doc_batch.num_bytes()))] diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs index b1de41d3163..203d1c8f8d3 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs @@ -16,7 +16,7 @@ use std::collections::hash_map::Entry; use std::collections::{BTreeMap, HashMap, HashSet}; use std::fmt; use std::path::Path; -use std::sync::{Arc, LazyLock}; +use std::sync::Arc; use std::time::{Duration, Instant}; use async_trait::async_trait; @@ -87,10 +87,12 @@ pub(super) const PERSIST_REQUEST_TIMEOUT: Duration = if cfg!(any(test, feature = const DEFAULT_BATCH_NUM_BYTES: usize = 1024 * 1024; // 1 MiB fn get_batch_num_bytes() -> usize { - static BATCH_NUM_BYTES_CELL: LazyLock = LazyLock::new(|| { - quickwit_common::get_from_env("QW_INGEST_BATCH_NUM_BYTES", DEFAULT_BATCH_NUM_BYTES, false) - }); - *BATCH_NUM_BYTES_CELL + quickwit_common::get_from_env_cached!( + usize, + "QW_INGEST_BATCH_NUM_BYTES", + DEFAULT_BATCH_NUM_BYTES, + false + ) } #[derive(Clone)] diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs index 43bd995e1ba..65963dd48bf 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs @@ -23,7 +23,7 @@ use futures::StreamExt; use itertools::Itertools; use quickwit_common::pretty::PrettySample; use quickwit_common::uri::Uri; -use quickwit_common::{ServiceStream, get_bool_from_env, rate_limited_error}; +use quickwit_common::{ServiceStream, get_bool_from_env_cached, rate_limited_error}; use quickwit_config::{ IndexTemplate, IndexTemplateId, PostgresMetastoreConfig, validate_index_id_pattern, }; @@ -122,9 +122,13 @@ impl PostgresqlMetastore { .max_connection_lifetime_opt() .expect("PostgreSQL metastore config should have been validated"); - let read_only = get_bool_from_env(QW_POSTGRES_READ_ONLY_ENV_KEY, false); - let skip_migrations = get_bool_from_env(QW_POSTGRES_SKIP_MIGRATIONS_ENV_KEY, false); - let skip_locking = get_bool_from_env(QW_POSTGRES_SKIP_MIGRATION_LOCKING_ENV_KEY, false); + // These environment variables are process-global and don't change over a process's + // lifetime, so they're read (and logged) once rather than on every metastore + // construction, which can recur frequently. + let read_only = get_bool_from_env_cached!(QW_POSTGRES_READ_ONLY_ENV_KEY, false); + let skip_migrations = get_bool_from_env_cached!(QW_POSTGRES_SKIP_MIGRATIONS_ENV_KEY, false); + let skip_locking = + get_bool_from_env_cached!(QW_POSTGRES_SKIP_MIGRATION_LOCKING_ENV_KEY, false); let connection_pool = establish_connection( connection_uri, diff --git a/quickwit/quickwit-search/src/list_fields/mod.rs b/quickwit/quickwit-search/src/list_fields/mod.rs index 9cc81957c95..f53f7995d69 100644 --- a/quickwit/quickwit-search/src/list_fields/mod.rs +++ b/quickwit/quickwit-search/src/list_fields/mod.rs @@ -18,7 +18,6 @@ mod patterns; mod root; use std::cmp::Ordering; -use std::sync::LazyLock; pub use cache::ListFieldsCache; use itertools::Itertools; @@ -37,8 +36,9 @@ pub use crate::list_fields::root::root_list_fields; /// a JSON type with random field names. This leads to huge memory consumption /// when building the response. This is a workaround until a way is found to /// prune the long tail of rare fields. -static FIELD_LIST_SIZE_LIMIT: LazyLock = - LazyLock::new(|| quickwit_common::get_from_env("QW_FIELD_LIST_SIZE_LIMIT", 100_000, false)); +fn field_list_size_limit() -> usize { + quickwit_common::get_from_env_cached!(usize, "QW_FIELD_LIST_SIZE_LIMIT", 100_000, false) +} // Sorts and deduplicates the list of fields. // @@ -83,10 +83,10 @@ fn merge_entries(entry_groups: Vec>) -> crate::Result= *FIELD_LIST_SIZE_LIMIT { + if entries.len() >= field_list_size_limit() { return Err(SearchError::Internal(format!( "list fields response exceeded {} fields", - *FIELD_LIST_SIZE_LIMIT + field_list_size_limit() ))); } current_group.push(entry); diff --git a/quickwit/quickwit-search/src/search_job_placer.rs b/quickwit/quickwit-search/src/search_job_placer.rs index de9e816cf19..dc1bd53c120 100644 --- a/quickwit/quickwit-search/src/search_job_placer.rs +++ b/quickwit/quickwit-search/src/search_job_placer.rs @@ -17,7 +17,6 @@ use std::collections::{HashMap, HashSet}; use std::fmt; use std::hash::{Hash, Hasher}; use std::net::SocketAddr; -use std::sync::LazyLock; use std::time::Duration; use anyhow::bail; @@ -25,7 +24,7 @@ use async_trait::async_trait; use futures::future::join_all; use quickwit_common::pubsub::EventSubscriber; use quickwit_common::rendezvous_hasher::{node_affinity, sort_by_rendez_vous_hash}; -use quickwit_common::{SocketAddrLegacyHash, get_bool_from_env}; +use quickwit_common::{SocketAddrLegacyHash, get_bool_from_env_cached}; use quickwit_metrics::counter; use quickwit_proto::search::{ReportSplit, ReportSplitsRequest}; use tracing::{info, warn}; @@ -104,8 +103,9 @@ impl fmt::Debug for SearchJobPlacer { } } -static LOAD_ESTIMATION_DISABLED: LazyLock = - LazyLock::new(|| get_bool_from_env("QW_DISABLE_LOAD_ESTIMATION", false)); +fn load_estimation_disabled() -> bool { + get_bool_from_env_cached!("QW_DISABLE_LOAD_ESTIMATION", false) +} impl SearchJobPlacer { /// Returns an [`SearchJobPlacer`] from a search service client pool. @@ -211,7 +211,7 @@ impl SearchJobPlacer { }) .collect(); - if load_aware && !*LOAD_ESTIMATION_DISABLED { + if load_aware && !load_estimation_disabled() { // Seed each candidate node with its current load so the placer avoids // routing work to already-loaded nodes. If a node fails to report its // load (error or timeout), `load` stays `None`: we still route work diff --git a/quickwit/quickwit-serve/src/rest.rs b/quickwit/quickwit-serve/src/rest.rs index 39bfb0e2580..a17b07dbba6 100644 --- a/quickwit/quickwit-serve/src/rest.rs +++ b/quickwit/quickwit-serve/src/rest.rs @@ -577,7 +577,7 @@ fn get_status_with_error(rejection: Rejection) -> Result CorsLayer { - let debug_mode = quickwit_common::get_bool_from_env("QW_ENABLE_CORS_DEBUG", false); + let debug_mode = quickwit_common::get_bool_from_env_cached!("QW_ENABLE_CORS_DEBUG", false); if debug_mode { info!("CORS debug mode is enabled, localhost and 127.0.0.1 origins will be allowed"); return CorsLayer::new()