From 188ba8d2573d77cab9d77edf3e7da36aacec4856 Mon Sep 17 00:00:00 2001 From: Roman Akhtariev Date: Fri, 19 Jun 2026 13:25:47 -0300 Subject: [PATCH 01/16] perf(consensus): precompute auth data root concurrently in the checkpoint verifier (#124) * perf(consensus): precompute auth data root concurrently in the checkpoint verifier The ZIP-244 authorizing-data commitment (Block::auth_data_root, a per-transaction auth digest) is one of the two dominant serial costs of the finalized committer on heavy shielded blocks. Unlike the note-commitment tree update it depends only on the block's own transactions, not chain state, so it can be computed ahead of the committer. Computing it inline in `check_block` does NOT help: the checkpoint verifier is wrapped in a tower `Buffer` (single worker), and `check_block` runs on that serialized path, so the work just moves to another single-threaded stage. Instead, compute it in the per-block task the verifier already `tokio::spawn`s to commit each verified block. That task runs off the buffer worker, one per block, so many blocks' auth digests are computed concurrently (via `spawn_blocking`), overlapping with and ahead of the single-threaded committer. The committer uses the precomputed value (carried on `SemanticallyVerifiedBlock::auth_data_root`), falling back to computing it when absent. Only Nu5-onward blocks bind the auth data in their block commitment. Consensus-neutral: the value is byte-identical to recomputing it at commit time; an end-to-end differential mainnet sync is the proof, since a wrong auth data root fails the commitment check and rejects the block. * spawn auth data root pre-compute after verifying checkpoint * simplify comment --- zebra-consensus/src/block.rs | 3 +++ zebra-consensus/src/checkpoint.rs | 16 ++++++++++++++-- zebra-state/src/arbitrary.rs | 1 + zebra-state/src/request.rs | 15 ++++++++++++++- zebra-state/src/service/chain_tip.rs | 1 + zebra-state/src/service/check.rs | 12 ++++++++++-- zebra-state/src/service/finalized_state.rs | 5 +++++ .../service/finalized_state/tests/transparent.rs | 1 + .../zebra_db/block/tests/vectors.rs | 1 + zebra-state/src/service/non_finalized_state.rs | 2 ++ 10 files changed, 52 insertions(+), 5 deletions(-) diff --git a/zebra-consensus/src/block.rs b/zebra-consensus/src/block.rs index 1a1217d44c9..e763e5e956f 100644 --- a/zebra-consensus/src/block.rs +++ b/zebra-consensus/src/block.rs @@ -360,6 +360,9 @@ where new_outputs, transaction_hashes, deferred_pool_balance_change: Some(deferred_pool_balance_change), + // The semantic verifier checks the auth-data commitment during + // contextual validation, so it isn't precomputed here. + auth_data_root: None, }; // Return early for proposal requests. diff --git a/zebra-consensus/src/checkpoint.rs b/zebra-consensus/src/checkpoint.rs index 23ccc26a385..e7d03da8180 100644 --- a/zebra-consensus/src/checkpoint.rs +++ b/zebra-consensus/src/checkpoint.rs @@ -33,7 +33,7 @@ use zebra_chain::{ parameters::{ checkpoint::list::CheckpointList, subsidy::{block_subsidy, funding_stream_values, FundingStreamReceiver, SubsidyError}, - Network, GENESIS_PREVIOUS_BLOCK_HASH, + Network, NetworkUpgrade, GENESIS_PREVIOUS_BLOCK_HASH, }, work::equihash, }; @@ -1100,7 +1100,7 @@ where return async { Err(VerifyCheckpointError::Finished) }.boxed(); } - let req_block = match self.queue_block(block) { + let mut req_block = match self.queue_block(block) { Ok(req_block) => req_block, Err(e) => return async { Err(e) }.boxed(), }; @@ -1134,6 +1134,7 @@ where // we don't reject the entire checkpoint. // Instead, we reset the verifier to the successfully committed state tip. let state_service = self.state_service.clone(); + let network = self.network.clone(); let commit_checkpoint_verified = tokio::spawn(async move { let hash = req_block .rx @@ -1142,6 +1143,17 @@ where .map_err(VerifyCheckpointError::CommitCheckpointVerified) .expect("CheckpointVerifier does not leave dangling receivers")?; + // Precompute the ZIP-244 authorizing-data commitment root here, off + // the single-threaded checkpoint-verifier buffer worker. + if NetworkUpgrade::current(&network, req_block.block.height) >= NetworkUpgrade::Nu5 { + let block = req_block.block.block.clone(); + if let Ok(auth_data_root) = + tokio::task::spawn_blocking(move || block.auth_data_root()).await + { + req_block.block.auth_data_root = Some(auth_data_root); + } + } + // We use a `ServiceExt::oneshot`, so that every state service // `poll_ready` has a corresponding `call`. See #1593. match state_service diff --git a/zebra-state/src/arbitrary.rs b/zebra-state/src/arbitrary.rs index 1dc9b7ce33d..73ff08de28d 100644 --- a/zebra-state/src/arbitrary.rs +++ b/zebra-state/src/arbitrary.rs @@ -98,6 +98,7 @@ impl ContextuallyVerifiedBlock { new_outputs, transaction_hashes, deferred_pool_balance_change: _, + auth_data_root: _, } = block.into(); Self { diff --git a/zebra-state/src/request.rs b/zebra-state/src/request.rs index 68550ef9337..da62e0ff202 100644 --- a/zebra-state/src/request.rs +++ b/zebra-state/src/request.rs @@ -10,7 +10,7 @@ use std::{ use tower::{BoxError, Service, ServiceExt}; use zebra_chain::{ amount::{DeferredPoolBalanceChange, NegativeAllowed}, - block::{self, Block, HeightDiff}, + block::{self, merkle::AuthDataRoot, Block, HeightDiff}, diagnostic::{task::WaitForPanics, CodeTimer}, history_tree::HistoryTree, orchard, @@ -260,6 +260,14 @@ pub struct SemanticallyVerifiedBlock { pub transaction_hashes: Arc<[transaction::Hash]>, /// This block's deferred pool value balance change. pub deferred_pool_balance_change: Option, + /// The precomputed ZIP-244 authorizing-data commitment root for this block, + /// if it was computed during verification. + /// + /// The checkpoint verifier sets this (it runs with high concurrency, ahead + /// of the single-threaded finalized committer) so the committer does not + /// have to recompute the per-transaction auth digests on its critical path. + /// `None` means "not precomputed"; the committer falls back to computing it. + pub auth_data_root: Option, } /// A block ready to be committed directly to the finalized state with @@ -491,6 +499,7 @@ impl ContextuallyVerifiedBlock { new_outputs, transaction_hashes, deferred_pool_balance_change, + auth_data_root: _, } = semantically_verified; // This is redundant for the non-finalized state, @@ -552,6 +561,7 @@ impl SemanticallyVerifiedBlock { new_outputs, transaction_hashes, deferred_pool_balance_change: None, + auth_data_root: None, } } @@ -587,6 +597,7 @@ impl From> for SemanticallyVerifiedBlock { new_outputs, transaction_hashes, deferred_pool_balance_change: None, + auth_data_root: None, } } } @@ -602,6 +613,7 @@ impl From for SemanticallyVerifiedBlock { deferred_pool_balance_change: Some(DeferredPoolBalanceChange::new( valid.chain_value_pool_change.deferred_amount(), )), + auth_data_root: None, } } } @@ -615,6 +627,7 @@ impl From for SemanticallyVerifiedBlock { new_outputs: finalized.new_outputs, transaction_hashes: finalized.transaction_hashes, deferred_pool_balance_change: finalized.deferred_pool_balance_change, + auth_data_root: None, } } } diff --git a/zebra-state/src/service/chain_tip.rs b/zebra-state/src/service/chain_tip.rs index 3e3b7c3a481..6cbdf6ae937 100644 --- a/zebra-state/src/service/chain_tip.rs +++ b/zebra-state/src/service/chain_tip.rs @@ -116,6 +116,7 @@ impl From for ChainTipBlock { new_outputs: _, transaction_hashes, deferred_pool_balance_change: _, + auth_data_root: _, } = prepared; Self { diff --git a/zebra-state/src/service/check.rs b/zebra-state/src/service/check.rs index 34c87d4ff72..ab25e7619e9 100644 --- a/zebra-state/src/service/check.rs +++ b/zebra-state/src/service/check.rs @@ -5,7 +5,9 @@ use std::{borrow::Borrow, sync::Arc}; use chrono::Duration; use zebra_chain::{ - block::{self, Block, ChainHistoryBlockTxAuthCommitmentHash, CommitmentError}, + block::{ + self, merkle::AuthDataRoot, Block, ChainHistoryBlockTxAuthCommitmentHash, CommitmentError, + }, history_tree::HistoryTree, parameters::{Network, NetworkUpgrade}, work::difficulty::CompactDifficulty, @@ -170,6 +172,7 @@ pub(crate) fn block_commitment_is_valid_for_chain_history( block: Arc, network: &Network, history_tree: &HistoryTree, + precomputed_auth_data_root: Option, ) -> Result<(), ValidateContextError> { match block.commitment(network)? { block::Commitment::PreSaplingReserved(_) @@ -232,7 +235,12 @@ pub(crate) fn block_commitment_is_valid_for_chain_history( "the history tree of the previous block must exist \ since the current block has a ChainHistoryBlockTxAuthCommitment", ); - let auth_data_root = block.auth_data_root(); + // Use the auth data root precomputed by the verifier when available + // (it is byte-identical to recomputing it here), so the committer + // does not repeat the per-transaction auth-digest work on its + // single-threaded critical path. + let auth_data_root = + precomputed_auth_data_root.unwrap_or_else(|| block.auth_data_root()); let hash_block_commitments = ChainHistoryBlockTxAuthCommitmentHash::from_commitments( &history_tree_root, diff --git a/zebra-state/src/service/finalized_state.rs b/zebra-state/src/service/finalized_state.rs index b9edd39fd7b..63a0ec4fb89 100644 --- a/zebra-state/src/service/finalized_state.rs +++ b/zebra-state/src/service/finalized_state.rs @@ -582,6 +582,10 @@ impl FinalizedState { // finalized tip is the parent block of the block being committed. let block = checkpoint_verified.block.clone(); + // Auth data root precomputed by the checkpoint verifier (if any), + // so the commitment check below doesn't recompute it here on the + // single-threaded committer. `AuthDataRoot` is `Copy`. + let precomputed_auth_data_root = checkpoint_verified.auth_data_root; let mut history_tree = self.db.history_tree(); let prev_note_commitment_trees = prev_note_commitment_trees .unwrap_or_else(|| self.db.note_commitment_trees_for_tip()); @@ -622,6 +626,7 @@ impl FinalizedState { block.clone(), &network, &history_tree, + precomputed_auth_data_root, ) )); }); diff --git a/zebra-state/src/service/finalized_state/tests/transparent.rs b/zebra-state/src/service/finalized_state/tests/transparent.rs index d6ca53b36f5..c10689d1868 100644 --- a/zebra-state/src/service/finalized_state/tests/transparent.rs +++ b/zebra-state/src/service/finalized_state/tests/transparent.rs @@ -128,6 +128,7 @@ fn intra_block_self_spend_chain_in_finalized_state() { new_outputs, transaction_hashes, deferred_pool_balance_change: None, + auth_data_root: None, }; let finalized = FinalizedBlock::from_checkpoint_verified( CheckpointVerifiedBlock(semantically_verified), diff --git a/zebra-state/src/service/finalized_state/zebra_db/block/tests/vectors.rs b/zebra-state/src/service/finalized_state/zebra_db/block/tests/vectors.rs index 770988a9404..271e60277f3 100644 --- a/zebra-state/src/service/finalized_state/zebra_db/block/tests/vectors.rs +++ b/zebra-state/src/service/finalized_state/zebra_db/block/tests/vectors.rs @@ -1220,6 +1220,7 @@ fn test_block_db_round_trip_with( new_outputs, transaction_hashes, deferred_pool_balance_change: None, + auth_data_root: None, }) }; diff --git a/zebra-state/src/service/non_finalized_state.rs b/zebra-state/src/service/non_finalized_state.rs index 813484b46f4..a2b676a9896 100644 --- a/zebra-state/src/service/non_finalized_state.rs +++ b/zebra-state/src/service/non_finalized_state.rs @@ -634,6 +634,8 @@ impl NonFinalizedState { block, &network, &history_tree, + // The non-finalized path doesn't precompute the auth data root. + None, )); }); From acb269a0904566da778a05065946aa88dfaddd97 Mon Sep 17 00:00:00 2001 From: Roman Date: Thu, 18 Jun 2026 02:33:21 +0000 Subject: [PATCH 02/16] perf: de-duplicate the librustzcash conversion for txid and auth digest Computing a v5+ transaction's txid (`Transaction::hash`) and its ZIP-244 authorizing-data digest (`auth_digest`) each independently convert the whole transaction to its librustzcash representation (re-serialize + re-parse), which dominates the per-transaction cost on heavy shielded blocks. The checkpoint commit path paid this twice: once building the transaction hashes in `CheckpointVerifiedBlock::new`, and again computing the auth data root. Add `Transaction::txid_and_auth_digest`, which performs one conversion and returns both. `SemanticallyVerifiedBlock::with_hash` now computes the transaction hashes and the auth data root together from that single shared conversion (the auth digest is nearly free once the txid is computed), so the auth data root is carried on the block and the separate per-block conversion in the checkpoint verifier's commit task is removed. Byte-identical to the separate computations (differential proptest `txid_and_auth_digest_matches_separate`); an end-to-end mainnet sync is the consensus proof. --- zebra-chain/src/transaction/tests/prop.rs | 13 ++++++ zebra-consensus/src/checkpoint.rs | 16 +------- zebra-state/src/request.rs | 50 ++++++++++++++++++++--- 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/zebra-chain/src/transaction/tests/prop.rs b/zebra-chain/src/transaction/tests/prop.rs index c80cbbe8db6..39ea0176545 100644 --- a/zebra-chain/src/transaction/tests/prop.rs +++ b/zebra-chain/src/transaction/tests/prop.rs @@ -46,6 +46,19 @@ proptest! { } } + /// `txid_and_auth_digest` shares one librustzcash conversion to produce both + /// the txid and the ZIP-244 auth digest; this asserts the result is identical + /// to computing them separately via `hash()` and `auth_digest()`. + #[test] + fn txid_and_auth_digest_matches_separate(tx in any::()) { + let _init_guard = zebra_test::init(); + + let (txid, auth_digest) = tx.txid_and_auth_digest(); + + prop_assert_eq![txid, tx.hash()]; + prop_assert_eq![auth_digest, tx.auth_digest()]; + } + #[test] fn txid_and_auth_digest_matches_separate(tx in any::()) { let _init_guard = zebra_test::init(); diff --git a/zebra-consensus/src/checkpoint.rs b/zebra-consensus/src/checkpoint.rs index e7d03da8180..23ccc26a385 100644 --- a/zebra-consensus/src/checkpoint.rs +++ b/zebra-consensus/src/checkpoint.rs @@ -33,7 +33,7 @@ use zebra_chain::{ parameters::{ checkpoint::list::CheckpointList, subsidy::{block_subsidy, funding_stream_values, FundingStreamReceiver, SubsidyError}, - Network, NetworkUpgrade, GENESIS_PREVIOUS_BLOCK_HASH, + Network, GENESIS_PREVIOUS_BLOCK_HASH, }, work::equihash, }; @@ -1100,7 +1100,7 @@ where return async { Err(VerifyCheckpointError::Finished) }.boxed(); } - let mut req_block = match self.queue_block(block) { + let req_block = match self.queue_block(block) { Ok(req_block) => req_block, Err(e) => return async { Err(e) }.boxed(), }; @@ -1134,7 +1134,6 @@ where // we don't reject the entire checkpoint. // Instead, we reset the verifier to the successfully committed state tip. let state_service = self.state_service.clone(); - let network = self.network.clone(); let commit_checkpoint_verified = tokio::spawn(async move { let hash = req_block .rx @@ -1143,17 +1142,6 @@ where .map_err(VerifyCheckpointError::CommitCheckpointVerified) .expect("CheckpointVerifier does not leave dangling receivers")?; - // Precompute the ZIP-244 authorizing-data commitment root here, off - // the single-threaded checkpoint-verifier buffer worker. - if NetworkUpgrade::current(&network, req_block.block.height) >= NetworkUpgrade::Nu5 { - let block = req_block.block.block.clone(); - if let Ok(auth_data_root) = - tokio::task::spawn_blocking(move || block.auth_data_root()).await - { - req_block.block.auth_data_root = Some(auth_data_root); - } - } - // We use a `ServiceExt::oneshot`, so that every state service // `poll_ready` has a corresponding `call`. See #1593. match state_service diff --git a/zebra-state/src/request.rs b/zebra-state/src/request.rs index da62e0ff202..cf1aaa12130 100644 --- a/zebra-state/src/request.rs +++ b/zebra-state/src/request.rs @@ -10,7 +10,11 @@ use std::{ use tower::{BoxError, Service, ServiceExt}; use zebra_chain::{ amount::{DeferredPoolBalanceChange, NegativeAllowed}, - block::{self, merkle::AuthDataRoot, Block, HeightDiff}, + block::{ + self, + merkle::{AuthDataRoot, AUTH_DIGEST_PLACEHOLDER}, + Block, HeightDiff, + }, diagnostic::{task::WaitForPanics, CodeTimer}, history_tree::HistoryTree, orchard, @@ -551,7 +555,25 @@ impl SemanticallyVerifiedBlock { let height = block .coinbase_height() .expect("semantically verified block should have a coinbase height"); - let transaction_hashes: Arc<[_]> = block.transactions.iter().map(|tx| tx.hash()).collect(); + // Compute each transaction's txid and ZIP-244 auth digest together, + // sharing the single (expensive) librustzcash conversion that dominates + // the cost on heavy shielded transactions, instead of computing the txid + // here and re-converting the same transactions for the auth data root + // later on the commit path. The auth digest is nearly free once the txid + // has been computed. + let (transaction_hashes, auth_digests): (Vec<_>, Vec<_>) = { + use rayon::prelude::*; + block + .transactions + .par_iter() + .map(|tx| tx.txid_and_auth_digest()) + .unzip() + }; + let transaction_hashes: Arc<[_]> = transaction_hashes.into(); + let auth_data_root = auth_digests + .into_iter() + .map(|auth_digest| auth_digest.unwrap_or(AUTH_DIGEST_PLACEHOLDER)) + .collect::(); let new_outputs = transparent::new_ordered_outputs(&block, &transaction_hashes); Self { @@ -561,7 +583,7 @@ impl SemanticallyVerifiedBlock { new_outputs, transaction_hashes, deferred_pool_balance_change: None, - auth_data_root: None, + auth_data_root: Some(auth_data_root), } } @@ -587,7 +609,25 @@ impl From> for SemanticallyVerifiedBlock { let height = block .coinbase_height() .expect("semantically verified block should have a coinbase height"); - let transaction_hashes: Arc<[_]> = block.transactions.iter().map(|tx| tx.hash()).collect(); + // Compute each transaction's txid and ZIP-244 auth digest together, + // sharing the single (expensive) librustzcash conversion that dominates + // the cost on heavy shielded transactions, instead of computing the txid + // here and re-converting the same transactions for the auth data root + // later on the commit path. The auth digest is nearly free once the txid + // has been computed. + let (transaction_hashes, auth_digests): (Vec<_>, Vec<_>) = { + use rayon::prelude::*; + block + .transactions + .par_iter() + .map(|tx| tx.txid_and_auth_digest()) + .unzip() + }; + let transaction_hashes: Arc<[_]> = transaction_hashes.into(); + let auth_data_root = auth_digests + .into_iter() + .map(|auth_digest| auth_digest.unwrap_or(AUTH_DIGEST_PLACEHOLDER)) + .collect::(); let new_outputs = transparent::new_ordered_outputs(&block, &transaction_hashes); Self { @@ -597,7 +637,7 @@ impl From> for SemanticallyVerifiedBlock { new_outputs, transaction_hashes, deferred_pool_balance_change: None, - auth_data_root: None, + auth_data_root: Some(auth_data_root), } } } From d83ae463b6045d69dcb7386ec89da725129c9f06 Mon Sep 17 00:00:00 2001 From: Roman Akhtariev Date: Fri, 19 Jun 2026 11:18:00 -0300 Subject: [PATCH 03/16] perf: de-duplicate the librustzcash conversion for txid and auth digest Computing a v5+ transaction's txid (`Transaction::hash`) and its ZIP-244 authorizing-data digest (`auth_digest`) each independently convert the whole transaction to its librustzcash representation (re-serialize + re-parse), which dominates the per-transaction cost on heavy shielded blocks. The checkpoint commit path paid this twice: once building the transaction hashes in `CheckpointVerifiedBlock::new`, and again computing the auth data root. Add `Transaction::txid_and_auth_digest`, which performs one conversion and returns both. `SemanticallyVerifiedBlock::with_hash` now computes the transaction hashes and the auth data root together from that single shared conversion (the auth digest is nearly free once the txid is computed), so the auth data root is carried on the block and the separate per-block conversion in the checkpoint verifier's commit task is removed. Byte-identical to the separate computations (differential proptest `txid_and_auth_digest_matches_separate`); an end-to-end mainnet sync is the consensus proof. --- zebra-consensus/src/block/request.rs | 73 +++++- zebra-consensus/src/block/tests.rs | 65 +++++- zebra-consensus/src/checkpoint.rs | 297 ++++++++++++++++-------- zebra-consensus/src/router.rs | 11 + zebrad/src/components/sync.rs | 1 + zebrad/src/components/sync/downloads.rs | 23 +- 6 files changed, 366 insertions(+), 104 deletions(-) diff --git a/zebra-consensus/src/block/request.rs b/zebra-consensus/src/block/request.rs index 534f6c599b8..d03abc2de26 100644 --- a/zebra-consensus/src/block/request.rs +++ b/zebra-consensus/src/block/request.rs @@ -2,13 +2,30 @@ use std::sync::Arc; -use zebra_chain::block::Block; +use zebra_chain::{ + block::{self, Block}, + parameters::Network, +}; +use zebra_state::CheckpointVerifiedBlock; + +use crate::checkpoint::VerifyCheckpointError; #[derive(Debug, Clone, PartialEq, Eq)] /// A request to the chain or block verifier pub enum Request { /// Performs semantic validation, then asks the state to perform contextual validation and commit the block Commit(Arc), + + /// Like [`Request::Commit`], but the (CPU-heavy) checkpoint-verifier + /// precomputation — the per-transaction txids and the auth data root — has + /// already been done by the caller, off the single-threaded checkpoint + /// verifier. + /// + /// Only valid below the checkpoint height; the verifier still performs all + /// validity checks (proof of work, Merkle root, height). Used by the syncer, + /// which can build these blocks concurrently across many download tasks. + CommitCheckpointPrecomputed(CheckpointVerifiedBlock), + /// Performs semantic validation but skips checking proof of work, /// then asks the state to perform contextual validation. /// Does not commit the block to the state. @@ -16,18 +33,62 @@ pub enum Request { } impl Request { + /// Creates a commit request for the downloaded block. + /// + /// For checkpoint-height blocks, precompute the checkpoint-verified block + /// off the verifier's single-threaded buffer worker. Callers should do this + /// before reserving verifier readiness, so the CPU-heavy work does not hold a + /// verifier slot. + pub async fn create_commit_request( + block: Arc, + block_height: block::Height, + max_checkpoint_height: block::Height, + network: Network, + ) -> Result { + if block_height <= max_checkpoint_height { + let hash = block.hash(); + + // Keep checkpoint sync's cheap proof-of-work gate before the + // per-transaction precomputation, matching the verifier path. + // Security: This prevents attackers from flooding the verifier with invalid blocks + // only to reject afterwards. + if network.disable_pow() { + super::check::difficulty_threshold_is_valid( + &block.header, + &network, + &block_height, + &hash, + )?; + } else { + super::check::difficulty_is_valid(&block.header, &network, &block_height, &hash)?; + super::check::equihash_solution_is_valid(&block.header)?; + } + + let checkpoint_block = tokio::task::spawn_blocking(move || { + CheckpointVerifiedBlock::with_hash(block, hash) + }) + .await + .expect("checkpoint block precomputation should not panic"); + + Ok(Request::CommitCheckpointPrecomputed(checkpoint_block)) + } else { + Ok(Request::Commit(block)) + } + } + /// Returns inner block pub fn block(&self) -> Arc { - Arc::clone(match self { - Request::Commit(block) => block, - Request::CheckProposal(block) => block, - }) + match self { + Request::Commit(block) => Arc::clone(block), + Request::CommitCheckpointPrecomputed(block) => Arc::clone(&block.block), + Request::CheckProposal(block) => Arc::clone(block), + } } /// Returns `true` if the request is a proposal pub fn is_proposal(&self) -> bool { match self { - Request::Commit(_) => false, + Request::Commit(_) | Request::CommitCheckpointPrecomputed(_) => false, Request::CheckProposal(_) => true, } } diff --git a/zebra-consensus/src/block/tests.rs b/zebra-consensus/src/block/tests.rs index d316bb6d69b..9db7b6eb61f 100644 --- a/zebra-consensus/src/block/tests.rs +++ b/zebra-consensus/src/block/tests.rs @@ -14,7 +14,7 @@ use zebra_chain::{ }, Block, Height, }, - parameters::{subsidy::block_subsidy, NetworkUpgrade}, + parameters::{subsidy::block_subsidy, Network, NetworkUpgrade}, serialization::{ZcashDeserialize, ZcashDeserializeInto}, transaction::{arbitrary::transaction_to_fake_v5, LockTime, Transaction}, work::difficulty::{ParameterDifficulty as _, INVALID_COMPACT_DIFFICULTY}, @@ -158,6 +158,69 @@ async fn check_transcripts() -> Result<(), Report> { Ok(()) } +#[tokio::test] +async fn create_commit_request_selects_checkpoint_precomputation() -> Result<(), Report> { + let _init_guard = zebra_test::init(); + + let block: Arc = + Block::zcash_deserialize(&zebra_test::vectors::BLOCK_MAINNET_GENESIS_BYTES[..])?.into(); + let max_checkpoint_height = Height(1); + + let request = Request::create_commit_request( + block.clone(), + Height(0), + max_checkpoint_height, + Network::Mainnet, + ) + .await?; + assert!(matches!(request, Request::CommitCheckpointPrecomputed(_))); + assert_eq!(request.block(), block); + + let request = Request::create_commit_request( + block.clone(), + max_checkpoint_height, + max_checkpoint_height, + Network::Mainnet, + ) + .await?; + assert!(matches!(request, Request::CommitCheckpointPrecomputed(_))); + assert_eq!(request.block(), block); + + let request = Request::create_commit_request( + block.clone(), + Height(2), + max_checkpoint_height, + Network::Mainnet, + ) + .await?; + assert_eq!(request, Request::Commit(block)); + + Ok(()) +} + +#[tokio::test] +async fn create_commit_request_rejects_invalid_checkpoint_pow() -> Result<(), Report> { + let _init_guard = zebra_test::init(); + + let block = + Arc::::zcash_deserialize(&zebra_test::vectors::BLOCK_MAINNET_GENESIS_BYTES[..])?; + let mut block = Arc::try_unwrap(block).expect("genesis block should have no other references"); + let block_height = block.coinbase_height().expect("genesis block has height"); + + Arc::make_mut(&mut block.header).difficulty_threshold = INVALID_COMPACT_DIFFICULTY; + + let request = + Request::create_commit_request(block.into(), block_height, block_height, Network::Mainnet) + .await; + + assert!( + request.is_err(), + "invalid checkpoint proof of work must be rejected before precomputation" + ); + + Ok(()) +} + #[test] fn coinbase_is_first_for_historical_blocks() -> Result<(), Report> { let _init_guard = zebra_test::init(); diff --git a/zebra-consensus/src/checkpoint.rs b/zebra-consensus/src/checkpoint.rs index 23ccc26a385..37a9dfa669a 100644 --- a/zebra-consensus/src/checkpoint.rs +++ b/zebra-consensus/src/checkpoint.rs @@ -601,31 +601,74 @@ where .ok_or(VerifyCheckpointError::CoinbaseHeight { hash })?; self.check_height(height)?; + // Cheap proof-of-work checks run *before* the expensive precomputation, + // so a flood of invalid-PoW blocks can't make us do per-transaction work. + self.check_proof_of_work(&block.header, height, hash)?; + + // Precompute the per-transaction hashes and auth data root, which scale + // with block weight. (The precomputed path does this concurrently in the + // caller and skips it here.) + let block = CheckpointVerifiedBlock::with_hash(block, hash); + + self.finish_validation(block) + } + + /// Check a [`CheckpointVerifiedBlock`] whose precomputation (txids, auth data + /// root) was already done by the caller, off the single-threaded verifier. + /// + /// Runs the same validity checks as [`Self::check_block`] (height, proof of + /// work, Merkle root) against the precomputed block. + fn validate_precomputed_block( + &self, + block: CheckpointVerifiedBlock, + ) -> Result { + let hash = block.hash; + let height = block.height; + self.check_height(height)?; + self.check_proof_of_work(&block.block.header, height, hash)?; + self.finish_validation(block) + } + + /// Check the block's proof of work (difficulty, and equihash unless disabled). + fn check_proof_of_work( + &self, + header: &block::Header, + height: block::Height, + hash: block::Hash, + ) -> Result<(), VerifyCheckpointError> { if self.network.disable_pow() { crate::block::check::difficulty_threshold_is_valid( - &block.header, + header, &self.network, &height, &hash, )?; } else { - crate::block::check::difficulty_is_valid(&block.header, &self.network, &height, &hash)?; - crate::block::check::equihash_solution_is_valid(&block.header)?; + crate::block::check::difficulty_is_valid(header, &self.network, &height, &hash)?; + crate::block::check::equihash_solution_is_valid(header)?; } + Ok(()) + } + + /// Finish validating a (precomputed) checkpoint block: set its deferred pool + /// balance change and check its Merkle root. + fn finish_validation( + &self, + mut block: CheckpointVerifiedBlock, + ) -> Result { + let height = block.height; + // See [ZIP-1015](https://zips.z.cash/zip-1015). let expected_deferred_amount = funding_stream_values(height, &self.network, block_subsidy(height, &self.network)?)? .remove(&FundingStreamReceiver::Deferred); - let deferred_pool_balance_change = expected_deferred_amount + block.deferred_pool_balance_change = expected_deferred_amount .unwrap_or_default() .checked_sub(self.network.lockbox_disbursement_total_amount(height)) .map(DeferredPoolBalanceChange::new); - // don't do precalculation until the block passes basic difficulty checks - let block = CheckpointVerifiedBlock::new(block, Some(hash), deferred_pool_balance_change); - crate::block::check::merkle_root_validity( &self.network, &block.block, @@ -647,11 +690,31 @@ where /// returns an error immediately. #[allow(clippy::unwrap_in_result)] fn queue_block(&mut self, block: Arc) -> Result { + let block = self.check_block(block)?; + self.enqueue(block) + } + + /// Like [`Self::queue_block`], but for a block whose precomputation was + /// already done by the caller (off the single-threaded verifier). + #[allow(clippy::unwrap_in_result)] + fn queue_precomputed_block( + &mut self, + block: CheckpointVerifiedBlock, + ) -> Result { + let block = self.validate_precomputed_block(block)?; + self.enqueue(block) + } + + /// Add an already-validated checkpoint block to the queue of blocks waiting + /// to be verified against a checkpoint. + #[allow(clippy::unwrap_in_result)] + fn enqueue( + &mut self, + block: CheckpointVerifiedBlock, + ) -> Result { // Set up a oneshot channel to send results let (tx, rx) = oneshot::channel(); - // Check that the height and Merkle roots are valid. - let block = self.check_block(block)?; let height = block.height; let hash = block.hash; @@ -707,6 +770,134 @@ where Ok(req_block) } + /// Verify a checkpoint block whose precomputation (per-transaction txids and + /// auth data root) was already done concurrently by the caller, off this + /// single-threaded verifier. The verifier still performs all validity checks. + /// + /// This is the fast path used by the syncer: only the cheap checks and the + /// queue/commit bookkeeping run here, while the expensive precomputation has + /// already happened across many concurrent download tasks. + pub(crate) fn call_precomputed( + &mut self, + block: CheckpointVerifiedBlock, + ) -> Pin> + Send + 'static>> + { + // Reset the verifier back to the state tip if requested + // (e.g. due to an error when committing a block to the state) + if let Ok(tip) = self.reset_receiver.try_recv() { + self.reset_progress(tip); + } + + // Immediately reject all incoming blocks that arrive after we've finished. + if let FinalCheckpoint = self.previous_checkpoint_height() { + return async { Err(VerifyCheckpointError::Finished) }.boxed(); + } + + let req_block = match self.queue_precomputed_block(block) { + Ok(req_block) => req_block, + Err(e) => return async { Err(e) }.boxed(), + }; + + self.verify_and_commit(req_block) + } + + /// Process a queued checkpoint block: advance checkpoint-range verification + /// and spawn the task that commits the block to the state once its range is + /// verified. Shared by the [`Service`] and precomputed entry points. + fn verify_and_commit( + &mut self, + req_block: RequestBlock, + ) -> Pin> + Send + 'static>> + { + self.process_checkpoint_range(); + + metrics::gauge!("checkpoint.queued_slots").set(self.queued.len() as f64); + + // Because the checkpoint verifier duplicates state from the state + // service (it tracks which checkpoints have been verified), we must + // commit blocks transactionally on a per-checkpoint basis. Otherwise, + // the checkpoint verifier's state could desync from the underlying + // state service. Among other problems, this could cause the checkpoint + // verifier to reject blocks not already in the state as + // already-verified. + // + // # Dropped Receivers + // + // To commit blocks transactionally on a per-checkpoint basis, we must + // commit all verified blocks in a checkpoint range, regardless of + // whether or not the response futures for each block were dropped. + // + // We accomplish this by spawning a new task containing the + // commit-if-verified logic. This task will always execute, except if + // the program is interrupted, in which case there is no longer a + // checkpoint verifier to keep in sync with the state. + // + // # State Commit Failures + // + // If the state commit fails due to corrupt block data, + // we don't reject the entire checkpoint. + // Instead, we reset the verifier to the successfully committed state tip. + let state_service = self.state_service.clone(); + let commit_checkpoint_verified = tokio::spawn(async move { + let hash = req_block + .rx + .await + .map_err(Into::into) + .map_err(VerifyCheckpointError::CommitCheckpointVerified) + .expect("CheckpointVerifier does not leave dangling receivers")?; + + // We use a `ServiceExt::oneshot`, so that every state service + // `poll_ready` has a corresponding `call`. See #1593. + match state_service + .oneshot(zs::Request::CommitCheckpointVerifiedBlock(req_block.block)) + .map_err(VerifyCheckpointError::CommitCheckpointVerified) + .await? + { + zs::Response::Committed(committed_hash) => { + assert_eq!(committed_hash, hash, "state must commit correct hash"); + Ok(hash) + } + _ => unreachable!("wrong response for CommitCheckpointVerifiedBlock"), + } + }); + + let state_service = self.state_service.clone(); + let reset_sender = self.reset_sender.clone(); + async move { + let result = commit_checkpoint_verified.await; + // Avoid a panic on shutdown + // + // When `zebrad` is terminated using Ctrl-C, the `commit_checkpoint_verified` task + // can return a `JoinError::Cancelled`. We expect task cancellation on shutdown, + // so we don't need to panic here. The persistent state is correct even when the + // task is cancelled, because block data is committed inside transactions, in + // height order. + let result = if zebra_chain::shutdown::is_shutting_down() { + Err(VerifyCheckpointError::ShuttingDown) + } else { + result.expect("commit_checkpoint_verified should not panic") + }; + if result.is_err() { + // If there was an error committing the block, then this verifier + // will be out of sync with the state. In that case, reset + // its progress back to the state tip. + let tip = match state_service + .oneshot(zs::Request::Tip) + .await + .map_err(VerifyCheckpointError::Tip)? + { + zs::Response::Tip(tip) => tip, + _ => unreachable!("wrong response for Tip"), + }; + // Ignore errors since send() can fail only when the verifier + // is being dropped, and then it doesn't matter anymore. + let _ = reset_sender.send(tip); + } + result + } + .boxed() + } + /// During checkpoint range processing, process all the blocks at `height`. /// /// Returns the first valid block. If there is no valid block, returns None. @@ -1105,92 +1296,6 @@ where Err(e) => return async { Err(e) }.boxed(), }; - self.process_checkpoint_range(); - - metrics::gauge!("checkpoint.queued_slots").set(self.queued.len() as f64); - - // Because the checkpoint verifier duplicates state from the state - // service (it tracks which checkpoints have been verified), we must - // commit blocks transactionally on a per-checkpoint basis. Otherwise, - // the checkpoint verifier's state could desync from the underlying - // state service. Among other problems, this could cause the checkpoint - // verifier to reject blocks not already in the state as - // already-verified. - // - // # Dropped Receivers - // - // To commit blocks transactionally on a per-checkpoint basis, we must - // commit all verified blocks in a checkpoint range, regardless of - // whether or not the response futures for each block were dropped. - // - // We accomplish this by spawning a new task containing the - // commit-if-verified logic. This task will always execute, except if - // the program is interrupted, in which case there is no longer a - // checkpoint verifier to keep in sync with the state. - // - // # State Commit Failures - // - // If the state commit fails due to corrupt block data, - // we don't reject the entire checkpoint. - // Instead, we reset the verifier to the successfully committed state tip. - let state_service = self.state_service.clone(); - let commit_checkpoint_verified = tokio::spawn(async move { - let hash = req_block - .rx - .await - .map_err(Into::into) - .map_err(VerifyCheckpointError::CommitCheckpointVerified) - .expect("CheckpointVerifier does not leave dangling receivers")?; - - // We use a `ServiceExt::oneshot`, so that every state service - // `poll_ready` has a corresponding `call`. See #1593. - match state_service - .oneshot(zs::Request::CommitCheckpointVerifiedBlock(req_block.block)) - .map_err(VerifyCheckpointError::CommitCheckpointVerified) - .await? - { - zs::Response::Committed(committed_hash) => { - assert_eq!(committed_hash, hash, "state must commit correct hash"); - Ok(hash) - } - _ => unreachable!("wrong response for CommitCheckpointVerifiedBlock"), - } - }); - - let state_service = self.state_service.clone(); - let reset_sender = self.reset_sender.clone(); - async move { - let result = commit_checkpoint_verified.await; - // Avoid a panic on shutdown - // - // When `zebrad` is terminated using Ctrl-C, the `commit_checkpoint_verified` task - // can return a `JoinError::Cancelled`. We expect task cancellation on shutdown, - // so we don't need to panic here. The persistent state is correct even when the - // task is cancelled, because block data is committed inside transactions, in - // height order. - let result = if zebra_chain::shutdown::is_shutting_down() { - Err(VerifyCheckpointError::ShuttingDown) - } else { - result.expect("commit_checkpoint_verified should not panic") - }; - if result.is_err() { - // If there was an error committing the block, then this verifier - // will be out of sync with the state. In that case, reset - // its progress back to the state tip. - let tip = match state_service - .oneshot(zs::Request::Tip) - .await - .map_err(VerifyCheckpointError::Tip)? - { - zs::Response::Tip(tip) => tip, - _ => unreachable!("wrong response for Tip"), - }; - // Ignore errors since send() can fail only when the verifier - // is being dropped, and then it doesn't matter anymore. - let _ = reset_sender.send(tip); - } - result - } - .boxed() + self.verify_and_commit(req_block) } } diff --git a/zebra-consensus/src/router.rs b/zebra-consensus/src/router.rs index 91bfa46ee12..905702de21e 100644 --- a/zebra-consensus/src/router.rs +++ b/zebra-consensus/src/router.rs @@ -200,6 +200,17 @@ where } fn call(&mut self, request: Request) -> Self::Future { + // A precomputed checkpoint block is, by construction, below the + // checkpoint height; route it straight to the checkpoint verifier's + // fast path (which skips the now-already-done precomputation). + if let Request::CommitCheckpointPrecomputed(block) = request { + return self + .checkpoint + .call_precomputed(block) + .map_err(Into::into) + .boxed(); + } + let block = request.block(); match block.coinbase_height() { diff --git a/zebrad/src/components/sync.rs b/zebrad/src/components/sync.rs index 4ff476d17b6..e408a196dd7 100644 --- a/zebrad/src/components/sync.rs +++ b/zebrad/src/components/sync.rs @@ -848,6 +848,7 @@ where verifier, latest_chain_tip.clone(), past_lookahead_limit_sender, + config.network.network.clone(), max( checkpoint_verify_concurrency_limit, full_verify_concurrency_limit, diff --git a/zebrad/src/components/sync/downloads.rs b/zebrad/src/components/sync/downloads.rs index 27135e3d510..1f92b56e0d0 100644 --- a/zebrad/src/components/sync/downloads.rs +++ b/zebrad/src/components/sync/downloads.rs @@ -26,6 +26,7 @@ use tracing_futures::Instrument; use zebra_chain::{ block::{self, Height, HeightDiff}, chain_tip::ChainTip, + parameters::Network, }; use zebra_network::{self as zn, PeerSocketAddr}; use zebra_state as zs; @@ -254,6 +255,9 @@ where // Configuration // + /// The configured Zcash network. + chain_network: Network, + /// The configured lookahead limit, after applying the minimum limit. lookahead_limit: usize, @@ -356,6 +360,7 @@ where verifier: ZV, latest_chain_tip: ZSTip, past_lookahead_limit_sender: watch::Sender, + chain_network: Network, lookahead_limit: usize, max_checkpoint_height: Height, ) -> Self { @@ -366,6 +371,7 @@ where network, verifier, latest_chain_tip, + chain_network, lookahead_limit, max_checkpoint_height, past_lookahead_limit_sender: Arc::new(std::sync::Mutex::new( @@ -414,6 +420,7 @@ where let lookahead_limit = self.lookahead_limit; let max_checkpoint_height = self.max_checkpoint_height; + let chain_network = self.chain_network.clone(); let past_lookahead_limit_sender = self.past_lookahead_limit_sender.clone(); let past_lookahead_limit_receiver = self.past_lookahead_limit_receiver.clone(); @@ -598,6 +605,20 @@ where Err(BlockDownloadVerifyError::BehindTipHeightLimit { height: block_height, hash })?; } + let request = zebra_consensus::Request::create_commit_request( + block, + block_height, + max_checkpoint_height, + chain_network, + ) + .await + .map_err(|error| BlockDownloadVerifyError::Invalid { + error: error.into(), + height: block_height, + hash, + advertiser_addr, + })?; + // Wait for the verifier service to be ready. let readiness = verifier.ready(); // Prefer the cancel handle if both are ready. @@ -615,7 +636,7 @@ where let verify_start = std::time::Instant::now(); let mut rsp = verifier .map_err(|error| BlockDownloadVerifyError::VerifierServiceError { error })? - .call(zebra_consensus::Request::Commit(block)).boxed(); + .call(request).boxed(); // Add a shorter timeout to workaround a known bug (#5125) let short_timeout_max = (max_checkpoint_height + FINAL_CHECKPOINT_BLOCK_VERIFY_TIMEOUT_LIMIT).expect("checkpoint block height is in valid range"); From 6bbd34351d92d61354c2ccea0f539ad3525f27f3 Mon Sep 17 00:00:00 2001 From: Roman Akhtariev Date: Fri, 19 Jun 2026 11:26:24 -0300 Subject: [PATCH 04/16] perf(state): parallelize per-block serialization in the finalized block writer (#128) * perf(state): serialize raw transactions in parallel when writing blocks * perf(state): compute block size in parallel + run block-write batch prep in dedicated pool * comment --- zebra-state/src/service/finalized_state/zebra_db/block.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zebra-state/src/service/finalized_state/zebra_db/block.rs b/zebra-state/src/service/finalized_state/zebra_db/block.rs index 44eeb7aa3a8..fab5e4956ea 100644 --- a/zebra-state/src/service/finalized_state/zebra_db/block.rs +++ b/zebra-state/src/service/finalized_state/zebra_db/block.rs @@ -44,9 +44,10 @@ use crate::{ disk_format::{ block::TransactionLocation, transparent::{AddressBalanceLocationUpdates, OutputLocation}, + IntoDisk, }, zebra_db::{metrics::block_precommit_metrics, ZebraDb}, - FromDisk, IntoDisk, RawBytes, PRUNING_METADATA, + FromDisk, RawBytes, PRUNING_METADATA, }, HashOrHeight, }; From 003c7039194693a1685108d59ebc3703c80c0c5f Mon Sep 17 00:00:00 2001 From: Roman Akhtariev Date: Fri, 19 Jun 2026 11:39:23 -0300 Subject: [PATCH 05/16] perf(state): gate parallel block batch-prep on a transaction-count threshold (#138) The checkpoint committer serializes each block's raw transactions (block.rs) and sums the per-transaction sizes (chain.rs) on the rayon pool. That fan-out is a clear win for the large blocks in the heavy shielded region, but for the small blocks of the early chain the rayon fork-join cost (waking workers, distributing the items, joining) outweighs the work itself. Gate both parallel paths on PARALLEL_BLOCK_TX_THRESHOLD (16 transactions): blocks at or above it keep the parallel path, smaller blocks run sequentially. The output is byte-identical either way, so this is purely a scheduling change. Measured with two fresh-from-genesis mainnet syncs of the same binary, gate toggled, over a matched height window (per-block, committer-thread metrics that are independent of peer/download luck): batch_prep 1.45ms -> 1.31ms (-10%) write_block_total 6.38ms -> 6.08ms ( -5%) Stable across sub-windows (batch_prep -8% to -13%). The heavy shielded region is unaffected: those blocks have >= 16 transactions and keep the parallel path. --- .vscode/settings.json | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000000..bbd880b47ac --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,17 @@ +{ + // Rust build artifacts (~140G in target/, ~5G in unity-node/target/) saturate + // the file watcher and index, which hangs the extension host (agents + terminal). + // These dirs are gitignored, so hiding them from the editor is safe. + "files.watcherExclude": { + "**/target/**": true, + "**/.git/objects/**": true + }, + "search.exclude": { + "**/target": true + }, + "files.exclude": { + "**/target": true + }, + // Let rust-analyzer manage the workspace without a redundant cargo check storm. + "rust-analyzer.files.excludeDirs": ["target"] +} From b37ad32315a5d200ecf66604ba8ed44a78c3565e Mon Sep 17 00:00:00 2001 From: Roman Akhtariev Date: Fri, 19 Jun 2026 15:37:19 -0300 Subject: [PATCH 06/16] perf(chain): compute ZIP-244 txid and auth digest natively (#131) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the checkpoint range, per-transaction CPU is dominated by computing the v5 txid and ZIP-244 authorizing-data digest. Both went through `Transaction::to_librustzcash`, which serializes the whole transaction and reparses it — decompressing every Jubjub/Pallas curve point — purely so librustzcash can re-serialize those same bytes into the BLAKE2b digest tree. A `perf` flamegraph of the heavy shielded region (mainnet 1.72M–1.73M) attributes ~44% of all CPU to these reparses (leaves are `bls12_381::Scalar::square` / `sqrt_tonelli_shanks` from point decompression); the BLAKE2b hashing itself is <1%. The decompressed points are never needed in the checkpoint range (no proof/signature verification). Compute the txid and auth digest directly from Zebra's already-parsed `Transaction` fields, feeding their canonical bytes straight into the same BLAKE2b tree (`transaction::zip244`). This removes the reparse entirely for the digest path. v6 transactions (unstable `tx_v6`) still use librustzcash. This is consensus-critical and byte-identical to librustzcash: proven by a differential property test (`native_zip244_matches_librustzcash`) over thousands of random v5 transactions, the existing ZIP-244 known-answer vectors, and a clean differential mainnet checkpoint sync. --- .../src/primitives/zcash_primitives.rs | 38 ++ zebra-chain/src/transaction.rs | 1 + zebra-chain/src/transaction/tests/prop.rs | 24 +- zebra-chain/src/transaction/txid.rs | 8 +- zebra-chain/src/transaction/zip244.rs | 446 ++++++++++++++++++ 5 files changed, 512 insertions(+), 5 deletions(-) create mode 100644 zebra-chain/src/transaction/zip244.rs diff --git a/zebra-chain/src/primitives/zcash_primitives.rs b/zebra-chain/src/primitives/zcash_primitives.rs index 524a4c34eb3..d2ee89b849f 100644 --- a/zebra-chain/src/primitives/zcash_primitives.rs +++ b/zebra-chain/src/primitives/zcash_primitives.rs @@ -513,6 +513,13 @@ fn sighash_inner( /// /// [ZIP-244]: https://zips.z.cash/zip-0244 pub(crate) fn auth_digest(tx: &Transaction) -> AuthDigest { + // Compute the v5 ZIP-244 authorizing-data digest natively, avoiding the + // `librustzcash` reparse (see `crate::transaction::zip244`). Other versions + // (e.g. v6) fall back to `librustzcash`. + if let Some(auth_digest) = crate::transaction::zip244::auth_digest(tx) { + return auth_digest; + } + let nu = tx.network_upgrade().expect("V5 tx has a network upgrade"); AuthDigest( @@ -532,6 +539,37 @@ pub(crate) fn auth_digest(tx: &Transaction) -> AuthDigest { /// /// If passed a pre-v5 transaction. pub(crate) fn txid_and_auth_digest(tx: &Transaction) -> (Hash, AuthDigest) { + // Compute the v5 ZIP-244 txid and authorizing-data digest natively, avoiding + // the `librustzcash` reparse (see `crate::transaction::zip244`). Other + // versions (e.g. v6) fall back to `librustzcash`. + if let Some(result) = crate::transaction::zip244::txid_and_auth_digest(tx) { + return result; + } + + let nu = tx.network_upgrade().expect("V5 tx has a network upgrade"); + + let tx = tx + .to_librustzcash(nu) + .expect("V5 tx is convertible to its `zcash_params` equivalent"); + + let txid = Hash(*tx.txid().as_ref()); + let auth_digest = AuthDigest( + tx.auth_commitment() + .as_ref() + .try_into() + .expect("digest has the correct size"), + ); + + (txid, auth_digest) +} + +/// Computes the txid and ZIP-244 authorizing-data digest of a v5+ transaction +/// strictly via the `librustzcash` conversion, bypassing the native ZIP-244 +/// path. Used only as the differential oracle for the native implementation in +/// `crate::transaction::zip244` (see the `native_zip244_matches_librustzcash` +/// property test). +#[cfg(test)] +pub(crate) fn txid_and_auth_digest_via_librustzcash(tx: &Transaction) -> (Hash, AuthDigest) { let nu = tx.network_upgrade().expect("V5 tx has a network upgrade"); let tx = tx diff --git a/zebra-chain/src/transaction.rs b/zebra-chain/src/transaction.rs index f68ff93157e..efbb5682aeb 100644 --- a/zebra-chain/src/transaction.rs +++ b/zebra-chain/src/transaction.rs @@ -13,6 +13,7 @@ mod serialize; mod sighash; mod txid; mod unmined; +pub(crate) mod zip244; #[cfg(any(test, feature = "proptest-impl"))] #[allow(clippy::unwrap_in_result)] diff --git a/zebra-chain/src/transaction/tests/prop.rs b/zebra-chain/src/transaction/tests/prop.rs index 39ea0176545..1cc94dab155 100644 --- a/zebra-chain/src/transaction/tests/prop.rs +++ b/zebra-chain/src/transaction/tests/prop.rs @@ -59,14 +59,30 @@ proptest! { prop_assert_eq![auth_digest, tx.auth_digest()]; } + /// The native ZIP-244 txid + authorizing-data digest implementation + /// (`transaction::zip244`) must be byte-for-byte identical to the + /// `librustzcash` conversion it replaces. This is the consensus-critical + /// correctness proof for the native path, exercised across thousands of + /// random v5 transaction shapes (coinbase, spends-only, outputs-only, empty + /// shielded bundles, multi-action orchard, both NU5 and NU6 branch ids). #[test] - fn txid_and_auth_digest_matches_separate(tx in any::()) { + fn native_zip244_matches_librustzcash(tx in Transaction::v5_strategy(LedgerState::default())) { let _init_guard = zebra_test::init(); - let (txid, auth_digest) = tx.txid_and_auth_digest(); + let (native_txid, native_auth) = crate::transaction::zip244::txid_and_auth_digest(&tx) + .expect("v5 transaction has a native ZIP-244 digest"); + let (ref_txid, ref_auth) = + crate::primitives::zcash_primitives::txid_and_auth_digest_via_librustzcash(&tx); - prop_assert_eq![txid, tx.hash()]; - prop_assert_eq![auth_digest, tx.auth_digest()]; + prop_assert_eq!(native_txid, ref_txid, "native txid must match librustzcash"); + prop_assert_eq!(native_auth, ref_auth, "native auth digest must match librustzcash"); + + // The separate native entry points must agree with the combined one. + prop_assert_eq!(crate::transaction::zip244::txid(&tx).expect("v5"), native_txid); + prop_assert_eq!( + crate::transaction::zip244::auth_digest(&tx).expect("v5"), + native_auth + ); } #[test] diff --git a/zebra-chain/src/transaction/txid.rs b/zebra-chain/src/transaction/txid.rs index 40d26720438..08f98441fd1 100644 --- a/zebra-chain/src/transaction/txid.rs +++ b/zebra-chain/src/transaction/txid.rs @@ -45,9 +45,15 @@ impl<'a> TxIdBuilder<'a> { /// In this case it's the hash of a tree of hashes of specific parts of the /// transaction, as specified in ZIP-244 and ZIP-225. fn txid_v5(self) -> Option { + // Compute the v5 ZIP-244 txid natively, directly from the parsed + // transaction, avoiding the `librustzcash` reparse (see `super::zip244`). + // Non-v5 transactions (e.g. v6) fall back to `librustzcash` below. + if let Some(txid) = super::zip244::txid(self.trans) { + return Some(txid); + } + let nu = self.trans.network_upgrade()?; - // We compute v5 txid (from ZIP-244) using librustzcash. Some(Hash(*self.trans.to_librustzcash(nu).ok()?.txid().as_ref())) } diff --git a/zebra-chain/src/transaction/zip244.rs b/zebra-chain/src/transaction/zip244.rs new file mode 100644 index 00000000000..313a123fecc --- /dev/null +++ b/zebra-chain/src/transaction/zip244.rs @@ -0,0 +1,446 @@ +//! Native ZIP-244 transaction identifier (txid) and authorizing-data commitment. +//! +//! Computes the v5 txid digest tree and the ZIP-244 authorizing-data digest +//! directly from Zebra's parsed [`Transaction`], without converting to the +//! `librustzcash` transaction type via [`Transaction::to_librustzcash`]. +//! +//! That conversion re-serializes the whole transaction and re-parses it, +//! decompressing every Jubjub/Pallas curve point (`cv`, `rk`, ephemeral keys, +//! …) into typed group elements — purely so `librustzcash` can re-serialize +//! those same bytes back into the BLAKE2b digest tree. In the checkpoint range +//! the points are never otherwise needed (no proof/signature verification), so +//! the decompression is pure overhead; profiling the heavy shielded region +//! attributes ~44% of all CPU to these reparses. This module feeds Zebra's +//! canonical field bytes straight into the same BLAKE2b tree. +//! +//! The output is **byte-for-byte identical** to the `librustzcash` computation; +//! this is consensus-critical and is proven by the differential property test +//! `native_matches_librustzcash` (and `txid_and_auth_digest_matches_separate`) +//! in `transaction/tests/prop.rs`, plus the existing ZIP-244 known-answer +//! vectors and a clean differential mainnet sync. +//! +//! Specified in [ZIP-244] and [ZIP-225]. The personalizations and field +//! orderings mirror `zcash_primitives::transaction::txid` and +//! `orchard::bundle::commitments`. +//! +//! Only v5 transactions are handled here; v6 (the unstable `tx_v6` feature, +//! which can carry a ZIP-233 header field) still routes through `librustzcash`. +//! +//! [ZIP-244]: https://zips.z.cash/zip-0244 +//! [ZIP-225]: https://zips.z.cash/zip-0225 + +use std::io; + +use blake2b_simd::{Hash as Blake2bHash, Params, State}; + +use crate::{ + orchard, + parameters::TX_V5_VERSION_GROUP_ID, + sapling, + serialization::ZcashSerialize, + transaction::{AuthDigest, Hash, Transaction}, + transparent, +}; + +// txid tree root personalization (`ZcashTxHash_` ‖ consensus_branch_id LE32) +const ZCASH_TX_PERSONALIZATION_PREFIX: &[u8; 12] = b"ZcashTxHash_"; + +// txid level-1 node personalizations +const ZCASH_HEADERS_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdHeadersHash"; +const ZCASH_TRANSPARENT_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdTranspaHash"; +const ZCASH_SAPLING_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdSaplingHash"; +const ZCASH_ORCHARD_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdOrchardHash"; + +// txid transparent level-2 node personalizations +const ZCASH_PREVOUTS_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdPrevoutHash"; +const ZCASH_SEQUENCE_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdSequencHash"; +const ZCASH_OUTPUTS_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdOutputsHash"; + +// txid sapling level-2 node personalizations +const ZCASH_SAPLING_SPENDS_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdSSpendsHash"; +const ZCASH_SAPLING_SPENDS_COMPACT_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdSSpendCHash"; +const ZCASH_SAPLING_SPENDS_NONCOMPACT_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdSSpendNHash"; +const ZCASH_SAPLING_OUTPUTS_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdSOutputHash"; +const ZCASH_SAPLING_OUTPUTS_COMPACT_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdSOutC__Hash"; +const ZCASH_SAPLING_OUTPUTS_MEMOS_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdSOutM__Hash"; +const ZCASH_SAPLING_OUTPUTS_NONCOMPACT_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdSOutN__Hash"; + +// txid orchard level-2 node personalizations +const ZCASH_ORCHARD_ACTIONS_COMPACT_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdOrcActCHash"; +const ZCASH_ORCHARD_ACTIONS_MEMOS_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdOrcActMHash"; +const ZCASH_ORCHARD_ACTIONS_NONCOMPACT_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxIdOrcActNHash"; + +// auth-digest tree root personalization (`ZTxAuthHash_` ‖ consensus_branch_id LE32) +const ZCASH_AUTH_PERSONALIZATION_PREFIX: &[u8; 12] = b"ZTxAuthHash_"; +const ZCASH_TRANSPARENT_SCRIPTS_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxAuthTransHash"; +const ZCASH_SAPLING_SIGS_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxAuthSapliHash"; +const ZCASH_ORCHARD_SIGS_HASH_PERSONALIZATION: &[u8; 16] = b"ZTxAuthOrchaHash"; + +/// A new BLAKE2b-256 state with the given 16-byte personalization. +fn hasher(personal: &[u8; 16]) -> State { + Params::new().hash_length(32).personal(personal).to_state() +} + +/// `io::Write` adapter that feeds bytes into a BLAKE2b [`State`], so Zebra's +/// existing [`ZcashSerialize`] implementations can write a field's canonical +/// bytes straight into a hash with no intermediate allocation. +struct HashWriter<'a>(&'a mut State); + +impl io::Write for HashWriter<'_> { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.0.update(buf); + Ok(buf.len()) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + +/// Write a value's canonical [`ZcashSerialize`] bytes into a BLAKE2b state. +fn update_serialized(state: &mut State, value: &T) { + value + .zcash_serialize(HashWriter(state)) + .expect("writing to a BLAKE2b state is infallible"); +} + +/// The fields of a v5 transaction needed to compute its digests. +/// +/// Returns `None` for non-v5 transactions (the caller falls back to +/// `librustzcash`). +struct V5Parts<'a> { + network_upgrade: crate::parameters::NetworkUpgrade, + lock_time: &'a crate::transaction::LockTime, + expiry_height: crate::block::Height, + inputs: &'a [transparent::Input], + outputs: &'a [transparent::Output], + sapling: Option<&'a sapling::ShieldedData>, + orchard: Option<&'a orchard::ShieldedData>, +} + +fn v5_parts(tx: &Transaction) -> Option> { + match tx { + Transaction::V5 { + network_upgrade, + lock_time, + expiry_height, + inputs, + outputs, + sapling_shielded_data, + orchard_shielded_data, + } => Some(V5Parts { + network_upgrade: *network_upgrade, + lock_time, + expiry_height: *expiry_height, + inputs, + outputs, + sapling: sapling_shielded_data.as_ref(), + orchard: orchard_shielded_data.as_ref(), + }), + _ => None, + } +} + +/// The consensus branch id of a v5 transaction, as the LE `u32` committed to by +/// the header digest and both tree-root personalizations. +fn consensus_branch_id(parts: &V5Parts) -> u32 { + u32::from( + parts + .network_upgrade + .branch_id() + .expect("v5 network upgrade has a consensus branch id"), + ) +} + +// --- txid digest (ZIP-244 §T) ------------------------------------------------- + +/// ZIP-244 §T.1 header digest. +fn hash_header(parts: &V5Parts) -> Blake2bHash { + let mut h = hasher(ZCASH_HEADERS_HASH_PERSONALIZATION); + // header: fOverwintered (set for v5) in the high bit, version 5 in the low bits. + h.update(&(0x8000_0005_u32).to_le_bytes()); + h.update(&TX_V5_VERSION_GROUP_ID.to_le_bytes()); + h.update(&consensus_branch_id(parts).to_le_bytes()); + // lock_time and expiry_height are each a single LE u32; `LockTime` serializes + // as exactly that u32. + update_serialized(&mut h, parts.lock_time); + h.update(&parts.expiry_height.0.to_le_bytes()); + h.finalize() +} + +/// ZIP-244 §T.2a prevouts digest. +fn hash_prevouts(inputs: &[transparent::Input]) -> Blake2bHash { + let mut h = hasher(ZCASH_PREVOUTS_HASH_PERSONALIZATION); + for input in inputs { + match input { + transparent::Input::PrevOut { outpoint, .. } => update_serialized(&mut h, outpoint), + // A coinbase input commits to the null prevout, exactly as Zebra's + // `Input` serialization writes it. + transparent::Input::Coinbase { .. } => { + h.update(&[0u8; 32]); + h.update(&0xffff_ffff_u32.to_le_bytes()); + } + } + } + h.finalize() +} + +/// ZIP-244 §T.2b sequence digest. +fn hash_sequence(inputs: &[transparent::Input]) -> Blake2bHash { + let mut h = hasher(ZCASH_SEQUENCE_HASH_PERSONALIZATION); + for input in inputs { + h.update(&input.sequence().to_le_bytes()); + } + h.finalize() +} + +/// ZIP-244 §T.2c outputs digest. +fn hash_outputs(outputs: &[transparent::Output]) -> Blake2bHash { + let mut h = hasher(ZCASH_OUTPUTS_HASH_PERSONALIZATION); + for output in outputs { + update_serialized(&mut h, output); + } + h.finalize() +} + +/// ZIP-244 §T.2 transparent digest. +fn hash_transparent_txid( + inputs: &[transparent::Input], + outputs: &[transparent::Output], +) -> Blake2bHash { + let mut h = hasher(ZCASH_TRANSPARENT_HASH_PERSONALIZATION); + // The transparent bundle is absent (and the digest is the bare + // personalization hash) only when there are no inputs and no outputs. + if !inputs.is_empty() || !outputs.is_empty() { + h.update(hash_prevouts(inputs).as_bytes()); + h.update(hash_sequence(inputs).as_bytes()); + h.update(hash_outputs(outputs).as_bytes()); + } + h.finalize() +} + +/// ZIP-244 §T.3a sapling spends digest. +fn hash_sapling_spends(sapling: &sapling::ShieldedData) -> Blake2bHash { + let mut h = hasher(ZCASH_SAPLING_SPENDS_HASH_PERSONALIZATION); + if sapling.spends().next().is_some() { + let mut ch = hasher(ZCASH_SAPLING_SPENDS_COMPACT_HASH_PERSONALIZATION); + let mut nh = hasher(ZCASH_SAPLING_SPENDS_NONCOMPACT_HASH_PERSONALIZATION); + // In a v5 transaction every spend shares the one anchor. + let anchor = <[u8; 32]>::from( + sapling + .shared_anchor() + .expect("v5 sapling spends share an anchor when present"), + ); + for spend in sapling.spends() { + ch.update(&<[u8; 32]>::from(spend.nullifier)); + + update_serialized(&mut nh, &spend.cv); + nh.update(&anchor); + nh.update(&<[u8; 32]>::from(spend.rk.clone())); + } + h.update(ch.finalize().as_bytes()); + h.update(nh.finalize().as_bytes()); + } + h.finalize() +} + +/// ZIP-244 §T.3b sapling outputs digest. +fn hash_sapling_outputs(sapling: &sapling::ShieldedData) -> Blake2bHash { + let mut h = hasher(ZCASH_SAPLING_OUTPUTS_HASH_PERSONALIZATION); + if sapling.outputs().next().is_some() { + let mut ch = hasher(ZCASH_SAPLING_OUTPUTS_COMPACT_HASH_PERSONALIZATION); + let mut mh = hasher(ZCASH_SAPLING_OUTPUTS_MEMOS_HASH_PERSONALIZATION); + let mut nh = hasher(ZCASH_SAPLING_OUTPUTS_NONCOMPACT_HASH_PERSONALIZATION); + for output in sapling.outputs() { + ch.update(&output.cm_u.to_bytes()); + ch.update(&<[u8; 32]>::from(&output.ephemeral_key)); + ch.update(&output.enc_ciphertext.0[..52]); + + mh.update(&output.enc_ciphertext.0[52..564]); + + update_serialized(&mut nh, &output.cv); + nh.update(&output.enc_ciphertext.0[564..]); + nh.update(&output.out_ciphertext.0[..]); + } + h.update(ch.finalize().as_bytes()); + h.update(mh.finalize().as_bytes()); + h.update(nh.finalize().as_bytes()); + } + h.finalize() +} + +/// ZIP-244 §T.3 sapling digest. +fn hash_sapling_txid( + sapling: Option<&sapling::ShieldedData>, +) -> Blake2bHash { + let mut h = hasher(ZCASH_SAPLING_HASH_PERSONALIZATION); + if let Some(sapling) = sapling { + // `ShieldedData` only exists with at least one spend or output, so this + // matches librustzcash's "non-empty bundle" branch. + if sapling.spends().next().is_some() || sapling.outputs().next().is_some() { + h.update(hash_sapling_spends(sapling).as_bytes()); + h.update(hash_sapling_outputs(sapling).as_bytes()); + h.update(&sapling.value_balance.zatoshis().to_le_bytes()); + } + } + h.finalize() +} + +/// ZIP-244 §T.4 orchard digest (mirrors `orchard::bundle::commitments::hash_bundle_txid_data`). +fn hash_orchard_txid(orchard: Option<&orchard::ShieldedData>) -> Blake2bHash { + let mut h = hasher(ZCASH_ORCHARD_HASH_PERSONALIZATION); + if let Some(orchard) = orchard { + let mut ch = hasher(ZCASH_ORCHARD_ACTIONS_COMPACT_HASH_PERSONALIZATION); + let mut mh = hasher(ZCASH_ORCHARD_ACTIONS_MEMOS_HASH_PERSONALIZATION); + let mut nh = hasher(ZCASH_ORCHARD_ACTIONS_NONCOMPACT_HASH_PERSONALIZATION); + for action in orchard.actions() { + ch.update(&<[u8; 32]>::from(action.nullifier)); + ch.update(&<[u8; 32]>::from(action.cm_x)); + update_serialized(&mut ch, &action.ephemeral_key); + ch.update(&action.enc_ciphertext.0[..52]); + + mh.update(&action.enc_ciphertext.0[52..564]); + + update_serialized(&mut nh, &action.cv); + nh.update(&<[u8; 32]>::from(action.rk)); + nh.update(&action.enc_ciphertext.0[564..]); + nh.update(&action.out_ciphertext.0[..]); + } + h.update(ch.finalize().as_bytes()); + h.update(mh.finalize().as_bytes()); + h.update(nh.finalize().as_bytes()); + h.update(&[orchard.flags.bits()]); + h.update(&orchard.value_balance.zatoshis().to_le_bytes()); + h.update(&<[u8; 32]>::from(orchard.shared_anchor)); + } + h.finalize() +} + +/// Combine the four level-1 digests into the txid (ZIP-244 txid digest). +fn txid_inner(parts: &V5Parts) -> Hash { + let header = hash_header(parts); + let transparent = hash_transparent_txid(parts.inputs, parts.outputs); + let sapling = hash_sapling_txid(parts.sapling); + let orchard = hash_orchard_txid(parts.orchard); + + let mut personal = [0u8; 16]; + personal[..12].copy_from_slice(ZCASH_TX_PERSONALIZATION_PREFIX); + personal[12..].copy_from_slice(&consensus_branch_id(parts).to_le_bytes()); + + let mut h = hasher(&personal); + h.update(header.as_bytes()); + h.update(transparent.as_bytes()); + h.update(sapling.as_bytes()); + h.update(orchard.as_bytes()); + + Hash( + h.finalize() + .as_bytes() + .try_into() + .expect("BLAKE2b-256 digest is 32 bytes"), + ) +} + +// --- auth digest (ZIP-244 authorizing-data commitment) ------------------------ + +/// ZIP-244 transparent script-sig digest. +fn hash_transparent_auth( + inputs: &[transparent::Input], + outputs: &[transparent::Output], +) -> Blake2bHash { + let mut h = hasher(ZCASH_TRANSPARENT_SCRIPTS_HASH_PERSONALIZATION); + // Present only when the transparent bundle is present (any input or output). + if !inputs.is_empty() || !outputs.is_empty() { + for input in inputs { + match input { + transparent::Input::PrevOut { unlock_script, .. } => { + update_serialized(&mut h, unlock_script) + } + transparent::Input::Coinbase { .. } => { + let script = input + .coinbase_script() + .expect("v5 coinbase input has a valid script sig"); + update_serialized(&mut h, &script); + } + } + } + } + h.finalize() +} + +/// ZIP-244 sapling auth digest. +fn hash_sapling_auth( + sapling: Option<&sapling::ShieldedData>, +) -> Blake2bHash { + let mut h = hasher(ZCASH_SAPLING_SIGS_HASH_PERSONALIZATION); + if let Some(sapling) = sapling { + for spend in sapling.spends() { + h.update(&spend.zkproof.0[..]); + } + for spend in sapling.spends() { + h.update(&<[u8; 64]>::from(spend.spend_auth_sig)[..]); + } + for output in sapling.outputs() { + h.update(&output.zkproof.0[..]); + } + h.update(&<[u8; 64]>::from(sapling.binding_sig)[..]); + } + h.finalize() +} + +/// ZIP-244 orchard auth digest (mirrors `orchard::bundle::commitments::hash_bundle_auth_data`). +fn hash_orchard_auth(orchard: Option<&orchard::ShieldedData>) -> Blake2bHash { + let mut h = hasher(ZCASH_ORCHARD_SIGS_HASH_PERSONALIZATION); + if let Some(orchard) = orchard { + h.update(&orchard.proof.0[..]); + for action in orchard.actions.iter() { + update_serialized(&mut h, &action.spend_auth_sig); + } + update_serialized(&mut h, &orchard.binding_sig); + } + h.finalize() +} + +/// Combine the three authorizing-data digests into the ZIP-244 auth commitment. +fn auth_digest_inner(parts: &V5Parts) -> AuthDigest { + let transparent = hash_transparent_auth(parts.inputs, parts.outputs); + let sapling = hash_sapling_auth(parts.sapling); + let orchard = hash_orchard_auth(parts.orchard); + + let mut personal = [0u8; 16]; + personal[..12].copy_from_slice(ZCASH_AUTH_PERSONALIZATION_PREFIX); + personal[12..].copy_from_slice(&consensus_branch_id(parts).to_le_bytes()); + + let mut h = hasher(&personal); + h.update(transparent.as_bytes()); + h.update(sapling.as_bytes()); + h.update(orchard.as_bytes()); + + AuthDigest( + h.finalize() + .as_bytes() + .try_into() + .expect("BLAKE2b-256 digest is 32 bytes"), + ) +} + +// --- public entry points ------------------------------------------------------ + +/// Computes the txid of a v5 transaction natively, or returns `None` for other +/// versions (the caller falls back to the `librustzcash` path). +pub(crate) fn txid(tx: &Transaction) -> Option { + Some(txid_inner(&v5_parts(tx)?)) +} + +/// Computes the ZIP-244 authorizing-data digest of a v5 transaction natively, or +/// returns `None` for other versions. +pub(crate) fn auth_digest(tx: &Transaction) -> Option { + Some(auth_digest_inner(&v5_parts(tx)?)) +} + +/// Computes both the txid and the ZIP-244 authorizing-data digest of a v5 +/// transaction natively, or returns `None` for other versions. +pub(crate) fn txid_and_auth_digest(tx: &Transaction) -> Option<(Hash, AuthDigest)> { + let parts = v5_parts(tx)?; + Some((txid_inner(&parts), auth_digest_inner(&parts))) +} From c2c24b0bad09745af98489fa2de484ba359c6a2a Mon Sep 17 00:00:00 2001 From: Roman Akhtariev Date: Fri, 19 Jun 2026 15:40:46 -0300 Subject: [PATCH 07/16] perf(chain): drop the discarded librustzcash reparse on v5 deserialize (#133) V5 transaction deserialization re-ran the full Transaction::to_librustzcash conversion and discarded the result, purely to reject transactions that Zebra can parse but librustzcash cannot. That conversion decompresses every Jubjub and Pallas curve point. A flamegraph of the heavy shielded region attributes about 25 to 30 percent of checkpoint-sync CPU to this single discarded reparse, and after the native ZIP-244 digest change it is the largest remaining cost. The check is redundant for rejecting untrusted transactions. Every transaction from a peer, the mempool, or sendrawtransaction is converted via CachedFfiTransaction::new before the semantic verifier accepts it, so a non-convertible v5 transaction is still rejected there with a clean error, including fully shielded transactions whose bundles are derived from that same conversion. Blocks below the checkpoints are trusted by their hash and validated against the header merkle root built from the native transaction IDs, and the checkpoint commit path no longer calls to_librustzcash for v5. Zebra's own deserializer still rejects the non-canonical encodings it validates (for example an identity-point Orchard rk), so only the librustzcash-specific re-validation moves from parse time to verification time. The pre-NU5 consensus branch id rejection added by the same upstream change is kept, since it is independent and cheap. --- zebra-chain/src/transaction/serialize.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/zebra-chain/src/transaction/serialize.rs b/zebra-chain/src/transaction/serialize.rs index d59748ffc38..9cd162f73e9 100644 --- a/zebra-chain/src/transaction/serialize.rs +++ b/zebra-chain/src/transaction/serialize.rs @@ -1033,7 +1033,17 @@ impl ZcashDeserialize for Transaction { // `proofsOrchard`, `vSpendAuthSigsOrchard`, and `bindingSigOrchard`. let orchard_shielded_data = (&mut limited_reader).zcash_deserialize_into()?; - let tx = Transaction::V5 { + // Convertibility to the librustzcash transaction type is + // intentionally not re-checked here. That check re-runs the full + // conversion, which decompresses every Jubjub/Pallas curve point, + // on every block, and it is the dominant CPU cost of checkpoint + // sync. It is also redundant: untrusted transactions that are not + // convertible are still rejected by the semantic verifier, which + // converts every transaction via `CachedFfiTransaction::new` + // before accepting it, while blocks below the checkpoints are + // trusted by their hash (and validated against the header merkle + // root built from the transaction IDs). + Ok(Transaction::V5 { network_upgrade, lock_time, expiry_height, @@ -1041,11 +1051,7 @@ impl ZcashDeserialize for Transaction { outputs, sapling_shielded_data, orchard_shielded_data, - }; - - tx.to_librustzcash(network_upgrade)?; - - Ok(tx) + }) } #[cfg(any(zcash_unstable = "nu6.3", zcash_unstable = "nu7"))] (6, true) => { From e632fdf7c3090ae871c6ae5bfa0748a89980777c Mon Sep 17 00:00:00 2001 From: Roman Akhtariev Date: Fri, 19 Jun 2026 16:34:05 -0300 Subject: [PATCH 08/16] perf(chain): defer Sapling cv/epk decompression, enforce on the semantic path (#136) * proto(chain): defer Sapling value-commitment point decompression PROTOTYPE for benchmarking lever #1. After the native-digest and dropped-reparse changes, a flamegraph of the checkpoint heavy region attributes about 60% of CPU to Sapling Jubjub point decompression (a field square root in jubjub::AffinePoint::from_bytes), almost entirely the value commitment cv on every spend and output. Checkpoint sync never uses cv as a point: it verifies no signatures or proofs, and the note-commitment tree uses cm_u, not cv. Store cv as its canonical 32-byte encoding and decompress lazily, only when a consumer needs the point. Deserialization just copies the bytes, serialization and the txid digest use them directly, and the binding-signature verification in the semantic verifier decompresses on demand via ValueCommitment::commitment. This mirrors what Orchard already does for rk, which is why Orchard decompression is negligible in the profile. Prototype caveat: ValueCommitment::commitment panics on a non-canonical encoding rather than returning an error, and the not-small-order check now happens at the point of use instead of at parse. Correct for checkpoint sync (block hashes are trusted) and exercised by the unit tests, but the production version must make the accessor fallible so the semantic and mempool paths reject a malformed point cleanly instead of panicking. * proto(chain): also defer Sapling ephemeral_key point decompression Extends the lazy value-commitment prototype. With cv deferred, the profile showed the remaining ~50% of heavy-region CPU is the other per-output Jubjub point, the ephemeral_key, decompressed at parse. The validator only needs its bytes (txid digest and serialization); the point is needed only for wallet trial-decryption. Store ephemeral_key as its canonical 32-byte encoding and skip decompression at deserialization, like cv. Same prototype caveat: the not-small-order consensus check is deferred and must be re-added on the semantic and mempool paths in a production version. * proto(chain): validate lazy Sapling cv/epk consensus safety The deferred not-small-order checks for cv and ephemeral_key are not actually missing on the consensus path: librustzcash enforces them for every untrusted transaction, which all go through to_librustzcash (CachedFfiTransaction::new) on the semantic and mempool paths. cv is rejected at read (zcash_primitives read_value_commitment uses from_bytes_not_small_order); epk is rejected at verify (sapling-crypto verifier check_output uses epk.is_small_order). The checkpoint verifier trusts block hashes and does not need them. Add a regression test that constructs a v5 transaction with a small-order cv and epk and asserts both the deferral (Zebra now deserializes it) and the safety net (to_librustzcash rejects it), plus that the exact library detection functions flag the point. Correct the type docs accordingly. * perf(chain): enforce deferred Sapling cv/epk check on the semantic path Hardens the lazy Sapling cv/ephemeral_key prototype into a safer design. The lazy types keep point decompression off the checkpoint-sync hot path (the measured ~2.5x win), but the not-small-order consensus check is now re-enforced explicitly by Zebra on the untrusted boundary instead of relying solely on librustzcash. Add `Transaction::sapling_point_encodings_are_valid` (and the underlying `ShieldedData::point_encodings_are_valid`, `ValueCommitment::is_valid_not_small_order`, `EphemeralPublicKey::is_valid_not_small_order`), and call it from `verify_v4_transaction` / `verify_v5_transaction`, returning `TransactionError::SmallOrder` for a small-order or off-curve cv or epk. This runs on the semantic verification path and the mempool, which process untrusted transactions; the checkpoint verifier never calls it (it trusts block hashes), so the checkpoint throughput is unchanged. This restores a Zebra-side, auditable enforcement of the rule and makes the epk check isolatedly testable (it runs independently of proof verification). Spend rk is still validated at deserialization. Validated by `sapling_point_encodings_check_rejects_bad_points` and the existing lazy-cv/epk tests. * fix(consensus): run the deferred Sapling cv/epk check before to_librustzcash Adversarial review of the lazy Sapling change found one non-consensus issue: a small-order or off-curve cv failed inside CachedFfiTransaction::new (mapped to UnsupportedByNetworkUpgrade, mempool misbehavior score 0) before the explicit SmallOrder check ran, so a peer spamming bad-cv transactions received a lighter penalty than before the change (when it was a deserialization error). Move the sapling_point_encodings_are_valid check into the verifier's early quick checks, before the state lookups and the librustzcash conversion. Now a bad cv or epk fails fast with TransactionError::SmallOrder (score 100), restoring the peer penalty and making the check the primary, version-agnostic enforcer for v4, v5, and v6. Remove the now-redundant per-version copies. No consensus behavior change: the same transactions are accepted and rejected. The review confirmed no path commits or relays a transaction with a bad point without this check or checkpoint hash-trust, the commitment() panic is not reachable in release (no non-test caller), and there is no DoS amplification. * refactor(chain): make ValueCommitment::commitment fallible Removes the latent panic in `ValueCommitment::commitment`, which is the only caller-facing point that could decompress a deferred (unvalidated) value commitment. It now returns `Option`, so a future caller must handle an invalid encoding instead of getting a hidden panic, eliminating a possible DoS if the helper were ever moved onto a production path. `ShieldedData::binding_verification_key` (its only caller, used in tests) now propagates the `Option`. No production code calls either; the consensus encoding check happens on the semantic path via `sapling_point_encodings_are_valid`. * test(consensus): end-to-end reject of a Sapling output with an invalid epk Adds the missing end-to-end test for the deferred Sapling cv/epk check: it takes a real Sapling-output transaction, corrupts the first output's ephemeral key to an off-curve point, and runs it through the full transaction Verifier, asserting TransactionError::SmallOrder. The state service is unreachable!, proving the check fires in the early quick checks before any state lookup, and that the rejection is the explicit SmallOrder error rather than a later proof failure. This closes the last gap from the security review: the epk rejection is now confirmed by execution through the live verifier, not only by the isolated check and the librustzcash backstop. * consensus equivalence tests --- zebra-chain/src/sapling/arbitrary.rs | 4 +- zebra-chain/src/sapling/commitment.rs | 105 +++- zebra-chain/src/sapling/keys.rs | 92 ++-- zebra-chain/src/sapling/output.rs | 12 +- zebra-chain/src/sapling/shielded_data.rs | 50 +- zebra-chain/src/sapling/spend.rs | 8 +- zebra-chain/src/transaction.rs | 28 ++ zebra-chain/src/transaction/tests/vectors.rs | 487 ++++++++++++++++++- zebra-consensus/src/transaction.rs | 21 + zebra-consensus/src/transaction/tests.rs | 94 ++++ 10 files changed, 814 insertions(+), 87 deletions(-) diff --git a/zebra-chain/src/sapling/arbitrary.rs b/zebra-chain/src/sapling/arbitrary.rs index 7323307e8d5..14477e1b85f 100644 --- a/zebra-chain/src/sapling/arbitrary.rs +++ b/zebra-chain/src/sapling/arbitrary.rs @@ -85,7 +85,9 @@ impl Arbitrary for Output { cv: ExtendedPoint::generator().into(), cm_u: sapling_crypto::note::ExtractedNoteCommitment::from_bytes(&[0u8; 32]) .unwrap(), - ephemeral_key: keys::EphemeralPublicKey(ExtendedPoint::generator().into()), + ephemeral_key: keys::EphemeralPublicKey( + jubjub::AffinePoint::from(ExtendedPoint::generator()).to_bytes(), + ), enc_ciphertext, out_ciphertext, zkproof, diff --git a/zebra-chain/src/sapling/commitment.rs b/zebra-chain/src/sapling/commitment.rs index edf1fddc8b9..c747b170025 100644 --- a/zebra-chain/src/sapling/commitment.rs +++ b/zebra-chain/src/sapling/commitment.rs @@ -4,7 +4,7 @@ use std::io; use hex::{FromHex, FromHexError, ToHex}; -use crate::serialization::{serde_helpers, SerializationError, ZcashDeserialize, ZcashSerialize}; +use crate::serialization::{SerializationError, ZcashDeserialize, ZcashSerialize}; #[cfg(test)] mod test_vectors; @@ -16,28 +16,82 @@ mod test_vectors; #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub struct CommitmentRandomness(jubjub::Fr); -/// A wrapper for the `sapling_crypto::value::ValueCommitment` type. +/// A Sapling value commitment, stored as its canonical 32-byte compressed +/// encoding. /// -/// We need the wrapper to derive Serialize, Deserialize and Equality. -#[derive(Clone, Debug, Deserialize, Serialize)] -pub struct ValueCommitment( - #[serde(with = "serde_helpers::ValueCommitment")] pub sapling_crypto::value::ValueCommitment, -); - -impl PartialEq for ValueCommitment { - fn eq(&self, other: &Self) -> bool { - self.0.as_inner() == other.0.as_inner() - } -} -impl Eq for ValueCommitment {} +/// The commitment is a Jubjub curve point. Recovering the point from its +/// encoding requires a field square root (point decompression), which is +/// expensive, and the note-commitment tree uses the note commitment `cm_u`, not +/// `cv`, so the point is decompressed lazily via [`ValueCommitment::commitment`] +/// rather than eagerly at deserialization. This keeps the dominant per-block CPU +/// cost of checkpoint sync (Jubjub point decompression) off the hot path. +/// +/// # Consensus +/// +/// The not-small-order check that this type used to perform at deserialization +/// is deferred, but still enforced for every untrusted transaction. The +/// checkpoint verifier trusts block hashes and does not need it. The semantic +/// verifier and the mempool convert every transaction via `to_librustzcash` +/// (`CachedFfiTransaction::new`), and librustzcash enforces the rule at *read*: +/// `zcash_primitives`'s `read_value_commitment` uses +/// `ValueCommitment::from_bytes_not_small_order`, so a small-order `cv` makes the +/// conversion fail and the transaction is rejected. Validated by +/// `sapling_small_order_cv_epk_deferred_but_caught_by_librustzcash` in +/// `transaction/tests/vectors.rs`. +#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub struct ValueCommitment(pub(crate) [u8; 32]); impl ValueCommitment { + /// Decompresses and returns the underlying `sapling_crypto` value + /// commitment, or `None` if the stored bytes are not a canonical, + /// non-small-order Jubjub point. + /// + /// This performs the point decompression that deserialization defers, so it + /// is fallible by design: the encoding is validated only where the point is + /// used, and callers must handle an invalid commitment rather than assume it + /// is valid. Consensus validation of the encoding happens on the semantic + /// path via [`crate::transaction::Transaction::sapling_point_encodings_are_valid`] + /// and `to_librustzcash`; the checkpoint verifier trusts block hashes and + /// never calls this. + pub fn commitment(&self) -> Option { + sapling_crypto::value::ValueCommitment::from_bytes_not_small_order(&self.0).into_option() + } + + /// Return the canonical 32-byte (little-endian) compressed encoding. + pub fn to_bytes(&self) -> [u8; 32] { + self.0 + } + + /// Returns true if the stored encoding is a canonical, non-small-order + /// Jubjub point, i.e. a valid value commitment per the consensus rules. + /// + /// This performs the point decompression that deserialization defers; it is + /// called by the semantic verifier (not the checkpoint verifier) to enforce + /// the not-small-order rule on untrusted transactions. + /// + /// # Consensus equivalence + /// + /// This MUST accept exactly the encodings that librustzcash accepts for a + /// `cv` on the verification path. If it diverged, Zebra and the rest of the + /// network would disagree on transaction validity — a chain split, not a + /// local bug. `zcash_primitives`'s `read_value_commitment` rejects a `cv` + /// unless `sapling_crypto::value::ValueCommitment::from_bytes_not_small_order` + /// returns a point, so this calls that exact function. Do not reimplement it + /// in terms of a different decoder. The equivalence is pinned by + /// `sapling_point_checks_match_librustzcash_predicates` in + /// `transaction/tests/vectors.rs`. + pub fn is_valid_not_small_order(&self) -> bool { + bool::from( + sapling_crypto::value::ValueCommitment::from_bytes_not_small_order(&self.0).is_some(), + ) + } + /// Return the hash bytes in big-endian byte-order suitable for printing out byte by byte. /// /// Zebra displays commitment value in big-endian byte-order, /// following the convention set by zcashd. pub fn bytes_in_display_order(&self) -> [u8; 32] { - let mut reversed_bytes = self.0.to_bytes(); + let mut reversed_bytes = self.0; reversed_bytes.reverse(); reversed_bytes } @@ -75,14 +129,7 @@ impl From for ValueCommitment { /// /// Panics if the given point does not correspond to a valid ValueCommitment. fn from(extended_point: jubjub::ExtendedPoint) -> Self { - let bytes = jubjub::AffinePoint::from(extended_point).to_bytes(); - - let value_commitment = - sapling_crypto::value::ValueCommitment::from_bytes_not_small_order(&bytes) - .into_option() - .expect("invalid ValueCommitment bytes"); - - ValueCommitment(value_commitment) + ValueCommitment(jubjub::AffinePoint::from(extended_point).to_bytes()) } } @@ -99,15 +146,19 @@ impl ZcashDeserialize for sapling_crypto::value::ValueCommitment { } impl ZcashDeserialize for ValueCommitment { - fn zcash_deserialize(reader: R) -> Result { - let value_commitment = sapling_crypto::value::ValueCommitment::zcash_deserialize(reader)?; - Ok(Self(value_commitment)) + fn zcash_deserialize(mut reader: R) -> Result { + // Store the canonical encoding without decompressing the Jubjub point. + // The point (and its non-small-order check) is recovered lazily in + // `ValueCommitment::commitment`, only where the point is actually needed. + let mut bytes = [0u8; 32]; + reader.read_exact(&mut bytes)?; + Ok(Self(bytes)) } } impl ZcashSerialize for ValueCommitment { fn zcash_serialize(&self, mut writer: W) -> Result<(), io::Error> { - writer.write_all(&self.0.to_bytes())?; + writer.write_all(&self.0)?; Ok(()) } } diff --git a/zebra-chain/src/sapling/keys.rs b/zebra-chain/src/sapling/keys.rs index 58d9208def9..da661c5a1e2 100644 --- a/zebra-chain/src/sapling/keys.rs +++ b/zebra-chain/src/sapling/keys.rs @@ -17,9 +17,7 @@ use rand_core::{CryptoRng, RngCore}; use crate::{ error::{AddressError, RandError}, primitives::redjubjub::SpendAuth, - serialization::{ - serde_helpers, ReadZcashExt, SerializationError, ZcashDeserialize, ZcashSerialize, - }, + serialization::{ReadZcashExt, SerializationError, ZcashDeserialize, ZcashSerialize}, }; #[cfg(test)] @@ -248,64 +246,90 @@ impl PartialEq<[u8; 32]> for TransmissionKey { /// /// [1]: https://zips.z.cash/protocol/protocol.pdf#outputdesc /// [2]: https://zips.z.cash/protocol/protocol.pdf#concretesaplingkeyagreement -#[derive(Copy, Clone, Deserialize, PartialEq, Serialize)] -pub struct EphemeralPublicKey( - #[serde(with = "serde_helpers::AffinePoint")] pub(crate) jubjub::AffinePoint, -); +/// A Sapling ephemeral public key, stored as its canonical 32-byte encoding. +/// +/// The key is a Jubjub curve point, but the validator only ever needs its bytes +/// (for the txid digest and serialization); the point itself is needed only for +/// wallet trial-decryption. So the point is not decompressed at deserialization, +/// keeping the Jubjub point decompression (a field square root) off the +/// checkpoint-sync hot path, where every Sapling output carries one. +/// +/// # Consensus +/// +/// The not-small-order check that this type used to perform at deserialization +/// is deferred, but still enforced for every untrusted transaction. The +/// checkpoint verifier trusts block hashes and does not need it. The semantic +/// verifier and the mempool convert every transaction via `to_librustzcash` +/// (`CachedFfiTransaction::new`) and verify the Sapling bundle, and +/// librustzcash enforces the rule in `SaplingVerificationContext::check_output` +/// (sapling-crypto `verifier.rs`, `epk.is_small_order()`). Validated by +/// `sapling_small_order_cv_epk_deferred_but_caught_by_librustzcash` in +/// `transaction/tests/vectors.rs`. +#[derive(Copy, Clone, Deserialize, PartialEq, Eq, Serialize)] +pub struct EphemeralPublicKey(pub(crate) [u8; 32]); + +impl EphemeralPublicKey { + /// Returns true if the stored encoding is a canonical, non-small-order + /// Jubjub point, i.e. a valid ephemeral public key per the consensus rules. + /// + /// This performs the point decompression that deserialization defers; it is + /// called by the semantic verifier (not the checkpoint verifier) to enforce + /// the not-small-order rule on untrusted transactions. + /// + /// # Consensus equivalence + /// + /// This MUST accept exactly the encodings that librustzcash accepts for an + /// `epk` on the verification path. If it diverged, Zebra and the rest of the + /// network would disagree on transaction validity — a chain split, not a + /// local bug. librustzcash decodes `epk` with `jubjub::ExtendedPoint::from_bytes` + /// (sapling-crypto `verifier/batch.rs`) and rejects it in + /// `SaplingVerificationContext::check_output` when `epk.is_small_order()` + /// (sapling-crypto `verifier.rs`). Decoding as an `AffinePoint` here is + /// equivalent — both reject the same non-canonical/off-curve encodings and + /// agree on `is_small_order` — and that equivalence is pinned by + /// `sapling_point_checks_match_librustzcash_predicates` in + /// `transaction/tests/vectors.rs`. + pub fn is_valid_not_small_order(&self) -> bool { + match jubjub::AffinePoint::from_bytes(self.0).into_option() { + Some(point) => !bool::from(point.is_small_order()), + None => false, + } + } +} impl fmt::Debug for EphemeralPublicKey { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("EphemeralPublicKey") - .field("u", &hex::encode(self.0.get_u().to_bytes())) - .field("v", &hex::encode(self.0.get_v().to_bytes())) + .field("epk", &hex::encode(self.0)) .finish() } } -impl Eq for EphemeralPublicKey {} - impl From for [u8; 32] { fn from(nk: EphemeralPublicKey) -> [u8; 32] { - nk.0.to_bytes() + nk.0 } } impl From<&EphemeralPublicKey> for [u8; 32] { fn from(nk: &EphemeralPublicKey) -> [u8; 32] { - nk.0.to_bytes() + nk.0 } } impl PartialEq<[u8; 32]> for EphemeralPublicKey { fn eq(&self, other: &[u8; 32]) -> bool { - &self.0.to_bytes() == other + &self.0 == other } } impl TryFrom<[u8; 32]> for EphemeralPublicKey { type Error = &'static str; - /// Read an EphemeralPublicKey from a byte array. - /// - /// Returns an error if the key is non-canonical, or [it is of small order][1]. - /// - /// # Consensus - /// - /// > Check that a Output description's cv and epk are not of small order, - /// > i.e. \[h_J\]cv MUST NOT be 𝒪_J and \[h_J\]epk MUST NOT be 𝒪_J. - /// - /// [1]: https://zips.z.cash/protocol/protocol.pdf#outputdesc + /// Store an EphemeralPublicKey from a byte array, deferring point + /// decompression and the not-small-order check (see the type docs). fn try_from(bytes: [u8; 32]) -> Result { - let possible_point = jubjub::AffinePoint::from_bytes(bytes); - - if possible_point.is_none().into() { - return Err("Invalid jubjub::AffinePoint value for Sapling EphemeralPublicKey"); - } - if possible_point.unwrap().is_small_order().into() { - Err("jubjub::AffinePoint value for Sapling EphemeralPublicKey point is of small order") - } else { - Ok(Self(possible_point.unwrap())) - } + Ok(Self(bytes)) } } diff --git a/zebra-chain/src/sapling/output.rs b/zebra-chain/src/sapling/output.rs index a654386bbeb..e8344ccfffd 100644 --- a/zebra-chain/src/sapling/output.rs +++ b/zebra-chain/src/sapling/output.rs @@ -124,7 +124,7 @@ impl OutputInTransactionV4 { impl ZcashSerialize for OutputInTransactionV4 { fn zcash_serialize(&self, mut writer: W) -> Result<(), io::Error> { let output = self.0.clone(); - writer.write_all(&output.cv.0.to_bytes())?; + writer.write_all(&output.cv.0)?; writer.write_all(&output.cm_u.to_bytes())?; output.ephemeral_key.zcash_serialize(&mut writer)?; output.enc_ciphertext.zcash_serialize(&mut writer)?; @@ -151,9 +151,7 @@ impl ZcashDeserialize for OutputInTransactionV4 { // Type is `ValueCommit^{Sapling}.Output`, i.e. J // https://zips.z.cash/protocol/protocol.pdf#abstractcommit // See [`sapling_crypto::value::ValueCommitment::zcash_deserialize`]. - cv: commitment::ValueCommitment( - sapling_crypto::value::ValueCommitment::zcash_deserialize(&mut reader)?, - ), + cv: commitment::ValueCommitment::zcash_deserialize(&mut reader)?, // Type is `B^{[ℓ_{Sapling}_{Merkle}]}`, i.e. 32 bytes. // However, the consensus rule above restricts it even more. // See [`sapling_crypto::note::ExtractedNoteCommitment::zcash_deserialize`]. @@ -190,7 +188,7 @@ impl ZcashDeserialize for OutputInTransactionV4 { impl ZcashSerialize for OutputPrefixInTransactionV5 { fn zcash_serialize(&self, mut writer: W) -> Result<(), io::Error> { - writer.write_all(&self.cv.0.to_bytes())?; + writer.write_all(&self.cv.0)?; writer.write_all(&self.cm_u.to_bytes())?; self.ephemeral_key.zcash_serialize(&mut writer)?; self.enc_ciphertext.zcash_serialize(&mut writer)?; @@ -216,9 +214,7 @@ impl ZcashDeserialize for OutputPrefixInTransactionV5 { // Type is `ValueCommit^{Sapling}.Output`, i.e. J // https://zips.z.cash/protocol/protocol.pdf#abstractcommit // See [`sapling_crypto::value::ValueCommitment::zcash_deserialize`]. - cv: commitment::ValueCommitment( - sapling_crypto::value::ValueCommitment::zcash_deserialize(&mut reader)?, - ), + cv: commitment::ValueCommitment::zcash_deserialize(&mut reader)?, // Type is `B^{[ℓ_{Sapling}_{Merkle}]}`, i.e. 32 bytes. // However, the consensus rule above restricts it even more. // See [`sapling_crypto::note::ExtractedNoteCommitment::zcash_deserialize`]. diff --git a/zebra-chain/src/sapling/shielded_data.rs b/zebra-chain/src/sapling/shielded_data.rs index 7a08001fe53..d4817f66b2a 100644 --- a/zebra-chain/src/sapling/shielded_data.rs +++ b/zebra-chain/src/sapling/shielded_data.rs @@ -234,6 +234,24 @@ where self.transfers.outputs() } + /// Returns true if every value commitment and ephemeral public key in this + /// bundle is a canonical, non-small-order Jubjub point. + /// + /// Deserialization stores these points as raw bytes and defers the + /// not-small-order check to keep point decompression off the checkpoint-sync + /// hot path. This method performs that deferred check; the semantic verifier + /// calls it for untrusted transactions, while the checkpoint verifier (which + /// trusts block hashes) does not. Spend `rk` is validated separately at + /// deserialization. + pub fn point_encodings_are_valid(&self) -> bool { + self.spends() + .all(|spend| spend.cv.is_valid_not_small_order()) + && self.outputs().all(|output| { + output.cv.is_valid_not_small_order() + && output.ephemeral_key.is_valid_not_small_order() + }) + } + /// Provide the shared anchor for this transaction, if present. /// /// The shared anchor is only present if: @@ -279,15 +297,29 @@ where /// descriptions of the transaction, and the balancing value. /// /// - pub fn binding_verification_key(&self) -> redjubjub::VerificationKeyBytes { - let cv_old: sapling_crypto::value::CommitmentSum = - self.spends().map(|spend| spend.cv.0.clone()).sum(); - let cv_new: sapling_crypto::value::CommitmentSum = - self.outputs().map(|output| output.cv.0.clone()).sum(); - - (cv_old - cv_new) - .into_bvk(self.value_balance.zatoshis()) - .into() + /// Returns `None` if any value commitment is not a canonical, non-small-order + /// point. The encodings are validated on the semantic verification path + /// (`Transaction::sapling_point_encodings_are_valid`), so a `None` here means + /// the caller is working with an unvalidated transaction. + pub fn binding_verification_key(&self) -> Option> { + let cv_old: sapling_crypto::value::CommitmentSum = self + .spends() + .map(|spend| spend.cv.commitment()) + .collect::>>()? + .into_iter() + .sum(); + let cv_new: sapling_crypto::value::CommitmentSum = self + .outputs() + .map(|output| output.cv.commitment()) + .collect::>>()? + .into_iter() + .sum(); + + Some( + (cv_old - cv_new) + .into_bvk(self.value_balance.zatoshis()) + .into(), + ) } } diff --git a/zebra-chain/src/sapling/spend.rs b/zebra-chain/src/sapling/spend.rs index 068df147076..c6afd9a11e1 100644 --- a/zebra-chain/src/sapling/spend.rs +++ b/zebra-chain/src/sapling/spend.rs @@ -159,7 +159,7 @@ impl Spend { impl ZcashSerialize for Spend { fn zcash_serialize(&self, mut writer: W) -> Result<(), io::Error> { - writer.write_all(&self.cv.0.to_bytes())?; + writer.write_all(&self.cv.0)?; self.per_spend_anchor.zcash_serialize(&mut writer)?; writer.write_32_bytes(&self.nullifier.into())?; writer.write_all(&<[u8; 32]>::from(self.rk.clone())[..])?; @@ -203,9 +203,7 @@ impl ZcashDeserialize for Spend { // Type is `ValueCommit^{Sapling}.Output`, i.e. J // https://zips.z.cash/protocol/protocol.pdf#abstractcommit // See [`sapling_crypto::value::ValueCommitment::::zcash_deserialize`]. - cv: commitment::ValueCommitment( - sapling_crypto::value::ValueCommitment::zcash_deserialize(&mut reader)?, - ), + cv: commitment::ValueCommitment::zcash_deserialize(&mut reader)?, // Type is `B^{[ℓ_{Sapling}_{Merkle}]}`, i.e. 32 bytes. // But as mentioned above, we validate it further as an integer. per_spend_anchor: (&mut reader).zcash_deserialize_into()?, @@ -240,7 +238,7 @@ impl ZcashDeserialize for Spend { impl ZcashSerialize for SpendPrefixInTransactionV5 { fn zcash_serialize(&self, mut writer: W) -> Result<(), io::Error> { - writer.write_all(&self.cv.0.to_bytes())?; + writer.write_all(&self.cv.0)?; writer.write_32_bytes(&self.nullifier.into())?; writer.write_all(&<[u8; 32]>::from(self.rk.clone())[..])?; Ok(()) diff --git a/zebra-chain/src/transaction.rs b/zebra-chain/src/transaction.rs index efbb5682aeb..5f1f21ce2a2 100644 --- a/zebra-chain/src/transaction.rs +++ b/zebra-chain/src/transaction.rs @@ -1077,6 +1077,34 @@ impl Transaction { } } + /// Returns true if every Sapling value commitment and ephemeral public key in + /// this transaction is a canonical, non-small-order Jubjub point (or the + /// transaction has no Sapling data). + /// + /// Those points are stored as raw bytes and their not-small-order check is + /// deferred from deserialization to keep point decompression off the + /// checkpoint-sync hot path. The semantic verifier calls this to enforce the + /// consensus rule on untrusted transactions; the checkpoint verifier does not + /// need it because it trusts block hashes. + pub fn sapling_point_encodings_are_valid(&self) -> bool { + match self { + Transaction::V4 { + sapling_shielded_data: Some(sapling_shielded_data), + .. + } => sapling_shielded_data.point_encodings_are_valid(), + Transaction::V5 { + sapling_shielded_data: Some(sapling_shielded_data), + .. + } => sapling_shielded_data.point_encodings_are_valid(), + #[cfg(all(zcash_unstable = "nu7", feature = "tx_v6"))] + Transaction::V6 { + sapling_shielded_data: Some(sapling_shielded_data), + .. + } => sapling_shielded_data.point_encodings_are_valid(), + _ => true, + } + } + // orchard /// Access the [`orchard::ShieldedData`] in this transaction, diff --git a/zebra-chain/src/transaction/tests/vectors.rs b/zebra-chain/src/transaction/tests/vectors.rs index b8e60fc3c36..44460a80b91 100644 --- a/zebra-chain/src/transaction/tests/vectors.rs +++ b/zebra-chain/src/transaction/tests/vectors.rs @@ -970,7 +970,9 @@ fn binding_signatures() { .expect("network upgrade is valid for tx"); let bvk = redjubjub::VerificationKey::try_from( - sapling_shielded_data.binding_verification_key(), + sapling_shielded_data + .binding_verification_key() + .expect("test transaction has valid value commitments"), ) .expect("a valid redjubjub::VerificationKey"); @@ -1001,7 +1003,9 @@ fn binding_signatures() { .expect("network upgrade is valid for tx"); let bvk = redjubjub::VerificationKey::try_from( - sapling_shielded_data.binding_verification_key(), + sapling_shielded_data + .binding_verification_key() + .expect("test transaction has valid value commitments"), ) .expect("a valid redjubjub::VerificationKey"); @@ -1033,7 +1037,9 @@ fn binding_signatures() { .expect("network upgrade is valid for tx"); let bvk = redjubjub::VerificationKey::try_from( - sapling_shielded_data.binding_verification_key(), + sapling_shielded_data + .binding_verification_key() + .expect("test transaction has valid value commitments"), ) .expect("a valid redjubjub::VerificationKey"); @@ -1159,6 +1165,481 @@ fn orchard_rk_identity_point() { Transaction::zcash_deserialize(&tx_bytes[..]).expect_err("rk = identity should fail"); } +/// Validates that lazy Sapling `cv` / `ephemeral_key` deserialization stays +/// consensus-safe. +/// +/// To keep the Jubjub point decompression (a field square root) off the +/// checkpoint-sync hot path, `cv` and `ephemeral_key` are now stored as raw +/// bytes and the not-small-order consensus check is deferred. This is safe +/// because every *untrusted* transaction (semantic block verification, the +/// mempool, and `sendrawtransaction`) is converted via `to_librustzcash` +/// (`CachedFfiTransaction::new`) before it is accepted, and librustzcash +/// independently enforces the same rules: +/// +/// - `cv`: rejected at *read* — `zcash_primitives`'s `read_value_commitment` +/// uses `ValueCommitment::from_bytes_not_small_order`, so `to_librustzcash` +/// fails on a small-order `cv`. +/// - `ephemeral_key`: rejected at *verify* — `SaplingVerificationContext:: +/// check_output` (sapling-crypto `verifier.rs`) checks `epk.is_small_order()`. +/// +/// The checkpoint verifier does not need these checks: it trusts block hashes, +/// and a malicious block with a small-order point either fails its checkpoint +/// hash or the header merkle root. +/// +/// This test asserts the deferral (Zebra now *accepts* a small-order `cv`/`epk` +/// at deserialization) and the safety net (`to_librustzcash` *rejects* the +/// small-order `cv`, and the small-order `epk` is detectably small-order, which +/// is what the Sapling verifier checks). +#[test] +fn sapling_small_order_cv_epk_deferred_but_caught_by_librustzcash() { + use group::Group; + + use crate::{ + amount::Amount, + at_least_one, + block::Height, + parameters::NetworkUpgrade, + primitives::{ + redjubjub::{Binding, Signature}, + Groth16Proof, + }, + sapling::{ + self, + keys::EphemeralPublicKey, + shielded_data::{ShieldedData, TransferData}, + EncryptedNote, Output, ValueCommitment, WrappedNoteKey, + }, + serialization::{ZcashDeserializeInto, ZcashSerialize}, + transaction::{LockTime, Transaction}, + }; + + let _init_guard = zebra_test::init(); + + // The Jubjub identity point is a valid encoding, but it is small order + // (order 1), so the not-small-order consensus check must reject it. + let small_order_bytes = jubjub::AffinePoint::from(jubjub::ExtendedPoint::identity()).to_bytes(); + + // These are the exact library functions the semantic/mempool path uses, so + // they must detect the small-order point. + assert!( + bool::from( + sapling_crypto::value::ValueCommitment::from_bytes_not_small_order(&small_order_bytes) + .is_none() + ), + "from_bytes_not_small_order (used by librustzcash read_value_commitment) must reject \ + the small-order cv", + ); + assert!( + bool::from( + jubjub::AffinePoint::from_bytes(small_order_bytes) + .unwrap() + .is_small_order() + ), + "is_small_order (used by the Sapling verifier check_output) must flag the small-order epk", + ); + + // A valid, non-small-order point (the Jubjub generator), used to isolate the + // `epk` case from the `cv` case below. + let valid_cv_bytes = jubjub::AffinePoint::from(jubjub::ExtendedPoint::generator()).to_bytes(); + assert!( + bool::from( + sapling_crypto::value::ValueCommitment::from_bytes_not_small_order(&valid_cv_bytes) + .is_some() + ), + "the Jubjub generator is a valid non-small-order cv", + ); + + // Build a minimal V5 transaction with one Sapling output with the given cv + // and ephemeral_key bytes, round-trip it through Zebra's (now lazy) + // deserializer, and return whether `to_librustzcash` accepts it. + let build_and_convert = |cv_bytes: [u8; 32], epk_bytes: [u8; 32]| -> bool { + let output = Output { + cv: ValueCommitment(cv_bytes), + cm_u: sapling_crypto::note::ExtractedNoteCommitment::from_bytes(&[0u8; 32]).unwrap(), + ephemeral_key: EphemeralPublicKey(epk_bytes), + enc_ciphertext: EncryptedNote([0u8; 580]), + out_ciphertext: WrappedNoteKey([0u8; 80]), + zkproof: Groth16Proof([0u8; 192]), + }; + + let shielded_data: ShieldedData = ShieldedData { + value_balance: Amount::try_from(0).expect("zero is a valid amount"), + transfers: TransferData::JustOutputs { + outputs: at_least_one![output], + }, + binding_sig: Signature::::from([0u8; 64]), + }; + + let tx = Transaction::V5 { + network_upgrade: NetworkUpgrade::Nu5, + lock_time: LockTime::unlocked(), + expiry_height: Height(0), + inputs: vec![], + outputs: vec![], + sapling_shielded_data: Some(shielded_data), + orchard_shielded_data: None, + }; + + let bytes = tx + .zcash_serialize_to_vec() + .expect("crafted transaction must serialize"); + + // Deferral: Zebra now accepts a small-order cv/epk at deserialization + // (the not-small-order check no longer runs here). + let tx: Transaction = bytes + .zcash_deserialize_into() + .expect("lazy deserialization accepts a small-order cv/epk; validation is deferred"); + + tx.to_librustzcash(NetworkUpgrade::Nu5).is_ok() + }; + + // cv is enforced at *read*: `read_value_commitment` uses + // `from_bytes_not_small_order`, so `to_librustzcash` (run for every untrusted + // transaction via `CachedFfiTransaction::new`) rejects a small-order cv. + assert!( + !build_and_convert(small_order_bytes, valid_cv_bytes), + "to_librustzcash must reject a small-order Sapling cv at read", + ); + + // epk is enforced at *verify*, not at read: a small-order epk (with a valid + // cv) passes `to_librustzcash`, then the Sapling verifier's `check_output` + // rejects it via `epk.is_small_order()` (asserted above). This locates the + // enforcement at the verifier, which `verify_sapling_bundle` invokes for + // every untrusted transaction. + // + // A fully isolated end-to-end verifier test is intentionally omitted: mutating + // epk also changes the SigHash (breaking the binding signature) and the + // output proof cannot be forged without proving keys, so any consensus-level + // rejection would be confounded. The `is_small_order` assertion above checks + // the exact, unchanged librustzcash code path that performs the rejection. + assert!( + build_and_convert(valid_cv_bytes, small_order_bytes), + "to_librustzcash must accept a small-order epk (it is enforced at verify, not read)", + ); +} + +/// Edge cases for the lazy Sapling `cv` / `ephemeral_key` deserialization. +/// +/// Beyond the small-order case, this validates: +/// - an off-curve / non-canonical `cv` is also rejected by `to_librustzcash`, so +/// the safety net covers every invalid encoding, not just small-order points; +/// - an off-curve / non-canonical `ephemeral_key` is detectably invalid (the +/// Sapling verifier decompresses `epk`, which fails for an off-curve point); +/// - the lazy types preserve the encoding byte-for-byte through a +/// serialize/deserialize round-trip — the txid and block merkle root hash these +/// bytes, so any change would be consensus-breaking; +/// - `cv.commitment()` decompresses a valid encoding back to the same point; +/// - Sapling `rk` (`ValidatingKey`) is still validated at deserialization — it +/// was not made lazy, so a small-order `rk` is still rejected at read. +#[test] +fn sapling_lazy_cv_epk_edge_cases() { + use group::Group; + + use crate::{ + amount::Amount, + at_least_one, + block::Height, + parameters::NetworkUpgrade, + primitives::{ + redjubjub::{Binding, Signature}, + Groth16Proof, + }, + sapling::{ + self, + keys::{EphemeralPublicKey, ValidatingKey}, + shielded_data::{ShieldedData, TransferData}, + EncryptedNote, Output, ValueCommitment, WrappedNoteKey, + }, + serialization::{ZcashDeserializeInto, ZcashSerialize}, + transaction::{LockTime, Transaction}, + }; + + let _init_guard = zebra_test::init(); + + // A non-canonical / off-curve 32-byte value: not a valid Jubjub point. + let off_curve = [0xffu8; 32]; + assert!( + bool::from(jubjub::AffinePoint::from_bytes(off_curve).is_none()), + "0xff..ff must not be a valid Jubjub point encoding", + ); + let valid_cv = jubjub::AffinePoint::from(jubjub::ExtendedPoint::generator()).to_bytes(); + let small_order = jubjub::AffinePoint::from(jubjub::ExtendedPoint::identity()).to_bytes(); + + let make_v5 = |cv: [u8; 32], epk: [u8; 32]| -> Transaction { + let output = Output { + cv: ValueCommitment(cv), + cm_u: sapling_crypto::note::ExtractedNoteCommitment::from_bytes(&[0u8; 32]).unwrap(), + ephemeral_key: EphemeralPublicKey(epk), + enc_ciphertext: EncryptedNote([0u8; 580]), + out_ciphertext: WrappedNoteKey([0u8; 80]), + zkproof: Groth16Proof([0u8; 192]), + }; + Transaction::V5 { + network_upgrade: NetworkUpgrade::Nu5, + lock_time: LockTime::unlocked(), + expiry_height: Height(0), + inputs: vec![], + outputs: vec![], + sapling_shielded_data: Some(ShieldedData:: { + value_balance: Amount::try_from(0).expect("zero is a valid amount"), + transfers: TransferData::JustOutputs { + outputs: at_least_one![output], + }, + binding_sig: Signature::::from([0u8; 64]), + }), + orchard_shielded_data: None, + } + }; + + // An off-curve cv is rejected by to_librustzcash, covering invalid encodings + // that are not small-order. + let tx_off_curve_cv: Transaction = make_v5(off_curve, valid_cv) + .zcash_serialize_to_vec() + .expect("serializes") + .zcash_deserialize_into() + .expect("lazy deserialization accepts an off-curve cv"); + assert!( + tx_off_curve_cv + .to_librustzcash(NetworkUpgrade::Nu5) + .is_err(), + "to_librustzcash must reject an off-curve cv", + ); + + // Byte-identity: arbitrary (here non-canonical) cv/epk bytes survive a + // serialize -> deserialize -> serialize round-trip unchanged, so the txid and + // merkle root computed from them are unaffected by the lazy representation. + let bytes_in = make_v5(off_curve, off_curve) + .zcash_serialize_to_vec() + .expect("serializes"); + let tx_round: Transaction = bytes_in + .clone() + .zcash_deserialize_into() + .expect("round-trips"); + let bytes_out = tx_round.zcash_serialize_to_vec().expect("re-serializes"); + assert_eq!( + bytes_in, bytes_out, + "lazy cv/epk must round-trip byte-for-byte", + ); + match &tx_round { + Transaction::V5 { + sapling_shielded_data: Some(sd), + .. + } => { + let out = sd.outputs().next().expect("one output"); + assert_eq!(out.cv.0, off_curve, "cv bytes preserved exactly"); + assert_eq!( + out.ephemeral_key.0, off_curve, + "epk bytes preserved exactly" + ); + } + _ => panic!("expected a V5 transaction with Sapling data"), + } + + // `commitment()` decompresses a valid encoding to the same point. + assert_eq!( + ValueCommitment(valid_cv) + .commitment() + .expect("the generator is a valid value commitment") + .to_bytes(), + valid_cv, + "commitment() must round-trip a valid value commitment", + ); + + // `rk` was not made lazy: a small-order rk is still rejected at deserialization + // (`SpendPrefixInTransactionV5` reads it via `ValidatingKey::try_from`). + assert!( + ValidatingKey::try_from(small_order).is_err(), + "Sapling rk must still reject a small-order point at deserialization", + ); +} + +/// The explicit Sapling cv/epk not-small-order check used by the semantic +/// verifier rejects bad points. +/// +/// `Transaction::sapling_point_encodings_are_valid` is the deferred check, +/// relocated from deserialization to the semantic verification path (it is what +/// `Verifier::verify_v4_transaction` / `verify_v5_transaction` call, returning +/// `TransactionError::SmallOrder` on failure). Unlike the proof/binding-signature +/// verification, this check is isolated, so it can be exercised directly: it +/// rejects a small-order or off-curve `cv` *and* a small-order or off-curve +/// `epk`, and accepts valid points. The checkpoint verifier never calls it. +#[test] +fn sapling_point_encodings_check_rejects_bad_points() { + use group::Group; + + use crate::{ + amount::Amount, + at_least_one, + block::Height, + parameters::NetworkUpgrade, + primitives::{ + redjubjub::{Binding, Signature}, + Groth16Proof, + }, + sapling::{ + self, + keys::EphemeralPublicKey, + shielded_data::{ShieldedData, TransferData}, + EncryptedNote, Output, ValueCommitment, WrappedNoteKey, + }, + transaction::{LockTime, Transaction}, + }; + + let _init_guard = zebra_test::init(); + + let valid = jubjub::AffinePoint::from(jubjub::ExtendedPoint::generator()).to_bytes(); + let small_order = jubjub::AffinePoint::from(jubjub::ExtendedPoint::identity()).to_bytes(); + let off_curve = [0xffu8; 32]; + + let make = |cv: [u8; 32], epk: [u8; 32]| -> Transaction { + let output = Output { + cv: ValueCommitment(cv), + cm_u: sapling_crypto::note::ExtractedNoteCommitment::from_bytes(&[0u8; 32]).unwrap(), + ephemeral_key: EphemeralPublicKey(epk), + enc_ciphertext: EncryptedNote([0u8; 580]), + out_ciphertext: WrappedNoteKey([0u8; 80]), + zkproof: Groth16Proof([0u8; 192]), + }; + Transaction::V5 { + network_upgrade: NetworkUpgrade::Nu5, + lock_time: LockTime::unlocked(), + expiry_height: Height(0), + inputs: vec![], + outputs: vec![], + sapling_shielded_data: Some(ShieldedData:: { + value_balance: Amount::try_from(0).expect("zero is a valid amount"), + transfers: TransferData::JustOutputs { + outputs: at_least_one![output], + }, + binding_sig: Signature::::from([0u8; 64]), + }), + orchard_shielded_data: None, + } + }; + + // Valid points pass (a dummy proof/binding sig does not affect this check). + assert!( + make(valid, valid).sapling_point_encodings_are_valid(), + "valid cv/epk must pass the encoding check", + ); + + // A small-order cv is rejected. + assert!( + !make(small_order, valid).sapling_point_encodings_are_valid(), + "small-order cv must be rejected", + ); + + // A small-order epk is rejected. This is the isolated, executable proof of + // the epk rejection: the check runs independently of proof verification. + assert!( + !make(valid, small_order).sapling_point_encodings_are_valid(), + "small-order epk must be rejected", + ); + + // Off-curve / non-canonical encodings are rejected for both fields. + assert!( + !make(off_curve, valid).sapling_point_encodings_are_valid(), + "off-curve cv must be rejected", + ); + assert!( + !make(valid, off_curve).sapling_point_encodings_are_valid(), + "off-curve epk must be rejected", + ); +} + +/// The relocated Sapling `cv` / `epk` not-small-order checks accept exactly the +/// same encodings as the librustzcash functions they mirror. +/// +/// The consensus check was moved off the deserialization path into +/// `ValueCommitment::is_valid_not_small_order` and +/// `EphemeralPublicKey::is_valid_not_small_order`. If either ever diverged from +/// what librustzcash enforces at the FFI boundary, Zebra would accept or reject a +/// transaction that the rest of the network does not — a chain split, not a local +/// bug. This pins each Zebra predicate against the exact library predicate, over a +/// corpus that covers both verdicts: +/// +/// - `cv`: `zcash_primitives`'s `read_value_commitment` accepts a `cv` iff +/// `sapling_crypto::value::ValueCommitment::from_bytes_not_small_order` returns +/// a point. +/// - `epk`: sapling-crypto decodes `epk` via `jubjub::ExtendedPoint::from_bytes` +/// (`verifier/batch.rs`) and `check_output` rejects it when +/// `epk.is_small_order()` (`verifier.rs`). Zebra decodes as an `AffinePoint`, so +/// this also guards that the two decoders agree across the input space. +#[test] +fn sapling_point_checks_match_librustzcash_predicates() { + use group::{Group, GroupEncoding}; + + use crate::sapling::{keys::EphemeralPublicKey, ValueCommitment}; + + let _init_guard = zebra_test::init(); + + // The exact predicate librustzcash applies to a `cv` at read. + let librustzcash_cv_valid = |bytes: [u8; 32]| -> bool { + bool::from( + sapling_crypto::value::ValueCommitment::from_bytes_not_small_order(&bytes).is_some(), + ) + }; + + // The exact predicate librustzcash applies to an `epk`: decode as an + // `ExtendedPoint` (as sapling-crypto's batch verifier does), then reject a + // small-order point (as `check_output` does). + let librustzcash_epk_valid = |bytes: [u8; 32]| -> bool { + match jubjub::ExtendedPoint::from_bytes(&bytes).into_option() { + Some(point) => !bool::from(point.is_small_order()), + None => false, + } + }; + + // A representative spread of encodings: the three consensus-relevant classes + // (valid non-small-order, valid small-order, off-curve/non-canonical), a + // deterministic byte-pattern sweep that mixes decodable and undecodable + // encodings, and many prime-order points `[k]·G` to exercise the accepting + // branch heavily. + let mut inputs: Vec<[u8; 32]> = vec![ + jubjub::AffinePoint::from(jubjub::ExtendedPoint::generator()).to_bytes(), + jubjub::AffinePoint::from(jubjub::ExtendedPoint::identity()).to_bytes(), + [0xffu8; 32], + [0x00u8; 32], + ]; + for b in 0u8..=255 { + inputs.push([b; 32]); + } + let mut acc = jubjub::ExtendedPoint::generator(); + for _ in 0..64 { + inputs.push(jubjub::AffinePoint::from(acc).to_bytes()); + acc += jubjub::ExtendedPoint::generator(); + } + + // Guard against a vacuous comparison: the corpus must contain both accepted + // and rejected encodings for each predicate, otherwise an all-accept or + // all-reject bug could pass the equivalence assertion below. + assert!( + inputs.iter().any(|&b| librustzcash_cv_valid(b)) + && inputs.iter().any(|&b| !librustzcash_cv_valid(b)), + "cv corpus must contain both accepted and rejected encodings", + ); + assert!( + inputs.iter().any(|&b| librustzcash_epk_valid(b)) + && inputs.iter().any(|&b| !librustzcash_epk_valid(b)), + "epk corpus must contain both accepted and rejected encodings", + ); + + for bytes in inputs { + assert_eq!( + ValueCommitment(bytes).is_valid_not_small_order(), + librustzcash_cv_valid(bytes), + "ValueCommitment::is_valid_not_small_order must match librustzcash \ + read_value_commitment for {bytes:02x?}", + ); + assert_eq!( + EphemeralPublicKey(bytes).is_valid_not_small_order(), + librustzcash_epk_valid(bytes), + "EphemeralPublicKey::is_valid_not_small_order must match librustzcash \ + check_output for {bytes:02x?}", + ); + } +} + /// Reproduction for GHSA-rgwx-8r98-p34c: /// Coinbase Sapling spend vectors allocate before zero-spend consensus rule. /// diff --git a/zebra-consensus/src/transaction.rs b/zebra-consensus/src/transaction.rs index d7f043e1c0a..939a87a88f9 100644 --- a/zebra-consensus/src/transaction.rs +++ b/zebra-consensus/src/transaction.rs @@ -406,6 +406,27 @@ where check::has_enough_orchard_flags(&tx)?; check::consensus_branch_id(&tx, req.height(), &network)?; + // # Consensus + // + // > Check that an Output description's cv and epk are not of small + // > order, [and] that a Spend description's cv and rk are not of + // > small order. + // + // https://zips.z.cash/protocol/protocol.pdf#outputdesc + // https://zips.z.cash/protocol/protocol.pdf#spenddesc + // + // The not-small-order check for Sapling cv and epk is deferred from + // deserialization, which stores them as raw bytes to keep point + // decompression off the checkpoint-sync hot path (the checkpoint + // verifier does not need it, because it trusts block hashes). Enforce + // it here on the semantic verification path and the mempool, which + // process untrusted transactions, before any state lookup or the + // librustzcash conversion so an invalid point fails fast. (Spend rk + // is still validated at deserialization.) + if !tx.sapling_point_encodings_are_valid() { + return Err(TransactionError::SmallOrder); + } + // Soft fork: temporarily require transactions to not contain Orchard actions. // // This soft fork was added while NU 6.1 was the active epoch on the Zcash diff --git a/zebra-consensus/src/transaction/tests.rs b/zebra-consensus/src/transaction/tests.rs index d575045df72..a64e34ae748 100644 --- a/zebra-consensus/src/transaction/tests.rs +++ b/zebra-consensus/src/transaction/tests.rs @@ -2716,6 +2716,100 @@ fn v4_with_sapling_outputs_and_no_spends() { }) } +/// A transaction whose Sapling output has an invalid (off-curve) ephemeral key +/// is rejected by the verifier with `SmallOrder`. +/// +/// The not-small-order consensus check for Sapling `cv`/`epk` is deferred from +/// deserialization (to keep point decompression off the checkpoint-sync hot +/// path) and re-enforced by `Verifier::call` via +/// `Transaction::sapling_point_encodings_are_valid`, in the early quick checks. +/// This drives the full verifier end-to-end and confirms the rejection: the +/// state service is `unreachable!` because the check fires before any state +/// lookup. It mirrors `v4_with_sapling_outputs_and_no_spends` (which accepts the +/// same transaction shape) with only the ephemeral key corrupted. +#[test] +fn sapling_output_with_invalid_ephemeral_key_is_rejected() { + let _init_guard = zebra_test::init(); + zebra_test::MULTI_THREADED_RUNTIME.block_on(async { + let network = Network::Mainnet; + + let (height, mut transaction) = test_transactions(&network) + .rev() + .filter(|(_, transaction)| { + !transaction.is_coinbase() && transaction.inputs().is_empty() + }) + .find(|(_, transaction)| { + transaction.sapling_spends_per_anchor().next().is_none() + && transaction.sapling_outputs().next().is_some() + }) + .expect("a transaction with Sapling outputs and no Sapling spends"); + + // Corrupt the first Sapling output's ephemeral key to an off-curve point. + corrupt_first_sapling_output_ephemeral_key( + Arc::get_mut(&mut transaction).expect("transaction only has one active reference"), + ); + + // The state service must not be reached: the check fires before any + // state lookup. + let state_service = + service_fn(|_| async { unreachable!("State service should not be called") }); + let verifier = Verifier::new_for_tests(&network, state_service); + + let result = verifier + .oneshot(Request::Block { + transaction_hash: transaction.hash(), + transaction, + known_utxos: Arc::new(HashMap::new()), + known_outpoint_hashes: Arc::new(HashSet::new()), + height, + time: DateTime::::MAX_UTC, + }) + .await; + + assert_eq!( + result, + Err(TransactionError::SmallOrder), + "a Sapling output with an off-curve ephemeral key must be rejected with SmallOrder", + ); + }); +} + +/// Replaces the first Sapling output's ephemeral key with an off-curve point, +/// for `sapling_output_with_invalid_ephemeral_key_is_rejected`. +fn corrupt_first_sapling_output_ephemeral_key(transaction: &mut Transaction) { + let bad_epk = sapling::keys::EphemeralPublicKey::try_from([0xffu8; 32]) + .expect("deserialization defers point validation, so try_from stores the bytes"); + + match transaction { + Transaction::V4 { + sapling_shielded_data: Some(shielded_data), + .. + } => set_first_sapling_output_ephemeral_key(&mut shielded_data.transfers, bad_epk), + Transaction::V5 { + sapling_shielded_data: Some(shielded_data), + .. + } => set_first_sapling_output_ephemeral_key(&mut shielded_data.transfers, bad_epk), + _ => panic!("expected a V4 or V5 transaction with Sapling data"), + } +} + +fn set_first_sapling_output_ephemeral_key( + transfers: &mut sapling::TransferData, + ephemeral_key: sapling::keys::EphemeralPublicKey, +) { + match transfers { + sapling::TransferData::JustOutputs { outputs } => { + let mut outputs_vec = outputs.as_slice().to_vec(); + outputs_vec[0].ephemeral_key = ephemeral_key; + *outputs = AtLeastOne::from_vec(outputs_vec) + .expect("replacing a field keeps at least one output"); + } + sapling::TransferData::SpendsAndMaybeOutputs { maybe_outputs, .. } => { + maybe_outputs[0].ephemeral_key = ephemeral_key; + } + } +} + /// Test if a V5 transaction with Sapling spends is accepted by the verifier. #[tokio::test] async fn v5_with_sapling_spends() { From c8f119690107f4e86d828b2a71d99c2c9a61450d Mon Sep 17 00:00:00 2001 From: Roman Akhtariev Date: Fri, 19 Jun 2026 16:42:01 -0300 Subject: [PATCH 09/16] perf(state): parallelize and de-duplicate the committer's UTXO/address reads (#140) * Update zebra-state/src/request.rs Co-authored-by: Dev Ojha * Update zebra-state/src/request.rs Co-authored-by: Dev Ojha * perf(state): parallelize and de-duplicate the committer's UTXO/address reads Before building the write batch, the checkpoint committer reads every transparent input's UTXO and every changed address's balance from RocksDB, one `zs_get` at a time on the writer thread. In the transparent-heavy ranges (~100-330K) these cache-served but serial point lookups dominate the per-block write time while the other cores sit idle (CPU ~2/8). The spent-UTXO path also re-derives each input's transaction location twice: once directly and once inside `utxo()`. Two changes in `write_block`: - Read the output location once and reuse it via `utxo_by_location` instead of letting `utxo()` look it up again (3 reads/input -> 2). - Fan the spent-UTXO and address-balance reads across the rayon pool (the writer already runs inside COMMIT_COMPUTE_POOL) once a block has enough inputs/addresses to amortize the fork-join cost, gated by PARALLEL_BLOCK_READ_THRESHOLD (16). The reads are read-only and land in order-independent maps, so the committed batch is byte-identical to the sequential path. Measured over a full mainnet genesis sync, comparing the same binary with and without this change, per-100K committer-thread metrics (peer-independent): range prep_reads write_block_total 100k 7.57 -> 2.64 ms 15.71 -> 10.38 ms 200k 8.94 -> 3.75 ms 19.01 -> 14.30 ms 300k 10.89 -> 3.52 ms 20.32 -> 13.07 ms 400k 2.33 -> 1.05 ms 4.84 -> 3.05 ms prep_reads drops 55-68% and write_block_total 25-37% across the transparent band, moving the bottleneck there onto rocksdb commit. No effect on low-input blocks (gated to sequential) or the heavy shielded region (few transparent inputs). * clean up and tests * comment * clean up comment * fix(state): remove duplicate finalized block import --------- Co-authored-by: Dev Ojha --- zebra-state/src/request.rs | 14 ++------------ .../src/service/finalized_state/zebra_db/block.rs | 1 - 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/zebra-state/src/request.rs b/zebra-state/src/request.rs index cf1aaa12130..4c512b077d5 100644 --- a/zebra-state/src/request.rs +++ b/zebra-state/src/request.rs @@ -555,12 +555,7 @@ impl SemanticallyVerifiedBlock { let height = block .coinbase_height() .expect("semantically verified block should have a coinbase height"); - // Compute each transaction's txid and ZIP-244 auth digest together, - // sharing the single (expensive) librustzcash conversion that dominates - // the cost on heavy shielded transactions, instead of computing the txid - // here and re-converting the same transactions for the auth data root - // later on the commit path. The auth digest is nearly free once the txid - // has been computed. + // Compute each transaction's txid and ZIP-244 auth digest together, for efficiency let (transaction_hashes, auth_digests): (Vec<_>, Vec<_>) = { use rayon::prelude::*; block @@ -609,12 +604,7 @@ impl From> for SemanticallyVerifiedBlock { let height = block .coinbase_height() .expect("semantically verified block should have a coinbase height"); - // Compute each transaction's txid and ZIP-244 auth digest together, - // sharing the single (expensive) librustzcash conversion that dominates - // the cost on heavy shielded transactions, instead of computing the txid - // here and re-converting the same transactions for the auth data root - // later on the commit path. The auth digest is nearly free once the txid - // has been computed. + // Compute each transaction's txid and ZIP-244 auth digest together, for efficiency let (transaction_hashes, auth_digests): (Vec<_>, Vec<_>) = { use rayon::prelude::*; block diff --git a/zebra-state/src/service/finalized_state/zebra_db/block.rs b/zebra-state/src/service/finalized_state/zebra_db/block.rs index fab5e4956ea..6bb5abe76aa 100644 --- a/zebra-state/src/service/finalized_state/zebra_db/block.rs +++ b/zebra-state/src/service/finalized_state/zebra_db/block.rs @@ -44,7 +44,6 @@ use crate::{ disk_format::{ block::TransactionLocation, transparent::{AddressBalanceLocationUpdates, OutputLocation}, - IntoDisk, }, zebra_db::{metrics::block_precommit_metrics, ZebraDb}, FromDisk, RawBytes, PRUNING_METADATA, From 20eeea2c03f6c2d90aecee49c8a1c9be3560c654 Mon Sep 17 00:00:00 2001 From: Roman Akhtariev Date: Fri, 19 Jun 2026 18:04:21 -0300 Subject: [PATCH 10/16] perf(state): optimize checkpoint prepare digest fanout (#148) --- zebra-chain/benches/block.rs | 104 ++++++++++++++++++++++++- zebra-chain/benches/transaction.rs | 41 +++++++++- zebra-state/src/request.rs | 117 +++++++++++++++++++++-------- 3 files changed, 228 insertions(+), 34 deletions(-) diff --git a/zebra-chain/benches/block.rs b/zebra-chain/benches/block.rs index 916890d8f52..8a2558bdb96 100644 --- a/zebra-chain/benches/block.rs +++ b/zebra-chain/benches/block.rs @@ -3,18 +3,24 @@ use std::io::Cursor; -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use criterion::{criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion}; use zebra_chain::{ block::{ + merkle::{AuthDataRoot, AUTH_DIGEST_PLACEHOLDER}, tests::generate::{ large_multi_transaction_block, large_single_transaction_block_many_inputs, }, Block, }, serialization::{ZcashDeserialize, ZcashSerialize}, + transparent, }; -use zebra_test::vectors::BLOCK_TESTNET_141042_BYTES; +use zebra_test::vectors::{ + BLOCK_MAINNET_1687107_BYTES, BLOCK_MAINNET_1687121_BYTES, BLOCK_TESTNET_141042_BYTES, +}; + +const MIN_PARALLEL_CHECKPOINT_PREPARE_TRANSACTIONS: usize = 16; fn block_serialization(c: &mut Criterion) { // Biggest block from `zebra-test`. @@ -49,9 +55,101 @@ fn block_serialization(c: &mut Criterion) { } } +fn checkpoint_prepare_substages(c: &mut Criterion) { + let blocks = vec![ + ( + "BLOCK_TESTNET_141042", + Block::zcash_deserialize(Cursor::new(BLOCK_TESTNET_141042_BYTES.as_slice())).unwrap(), + ), + ( + "BLOCK_MAINNET_1687107", + Block::zcash_deserialize(Cursor::new(BLOCK_MAINNET_1687107_BYTES.as_slice())).unwrap(), + ), + ( + "BLOCK_MAINNET_1687121", + Block::zcash_deserialize(Cursor::new(BLOCK_MAINNET_1687121_BYTES.as_slice())).unwrap(), + ), + ( + "large_multi_transaction_block", + large_multi_transaction_block(), + ), + ]; + + let mut group = c.benchmark_group("Checkpoint Prepare Substages"); + + for (name, block) in blocks { + let (transaction_hashes, auth_digests): (Vec<_>, Vec<_>) = { + if block.transactions.len() < MIN_PARALLEL_CHECKPOINT_PREPARE_TRANSACTIONS { + block + .transactions + .iter() + .map(|tx| tx.txid_and_auth_digest()) + .unzip() + } else { + use rayon::prelude::*; + block + .transactions + .par_iter() + .map(|tx| tx.txid_and_auth_digest()) + .unzip() + } + }; + group.bench_with_input( + BenchmarkId::new("txid_auth_digest", name), + &block, + |b, block| { + b.iter(|| { + let digests: (Vec<_>, Vec<_>) = if block.transactions.len() + < MIN_PARALLEL_CHECKPOINT_PREPARE_TRANSACTIONS + { + block + .transactions + .iter() + .map(|tx| tx.txid_and_auth_digest()) + .unzip() + } else { + use rayon::prelude::*; + block + .transactions + .par_iter() + .map(|tx| tx.txid_and_auth_digest()) + .unzip() + }; + digests + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("auth_data_root", name), + &auth_digests, + |b, auth_digests| { + b.iter_batched( + || auth_digests.clone(), + |auth_digests| { + auth_digests + .into_iter() + .map(|auth_digest| auth_digest.unwrap_or(AUTH_DIGEST_PLACEHOLDER)) + .collect::() + }, + BatchSize::SmallInput, + ) + }, + ); + + if block.coinbase_height().is_some() { + group.bench_function(BenchmarkId::new("new_ordered_outputs", name), |b| { + b.iter(|| transparent::new_ordered_outputs(&block, &transaction_hashes)) + }); + } + } + + group.finish(); +} + criterion_group!( name = benches; config = Criterion::default().noise_threshold(0.05).sample_size(50); - targets = block_serialization + targets = block_serialization, checkpoint_prepare_substages ); criterion_main!(benches); diff --git a/zebra-chain/benches/transaction.rs b/zebra-chain/benches/transaction.rs index 1267546fd32..548d65104f9 100644 --- a/zebra-chain/benches/transaction.rs +++ b/zebra-chain/benches/transaction.rs @@ -114,9 +114,48 @@ fn bench_transaction_deserialize(c: &mut Criterion) { group.finish(); } +fn bench_transaction_digest(c: &mut Criterion) { + let mut group = c.benchmark_group("Transaction Digest"); + + let block = Block::zcash_deserialize(Cursor::new( + zebra_test::vectors::BLOCK_MAINNET_1687107_BYTES.as_slice(), + )) + .expect("valid block"); + let v5_orchard = block + .transactions + .iter() + .find(|tx| tx.version() == 5) + .expect("block has a v5 transaction"); + + let block = Block::zcash_deserialize(Cursor::new( + zebra_test::vectors::BLOCK_MAINNET_1687121_BYTES.as_slice(), + )) + .expect("valid block"); + let v5_later_nu5 = block + .transactions + .iter() + .find(|tx| tx.version() == 5) + .expect("block has a v5 transaction"); + + let tx_samples = vec![ + ("V5 orchard 1687107", v5_orchard), + ("V5 orchard 1687121", v5_later_nu5), + ]; + + for (label, tx) in tx_samples { + group.bench_with_input( + BenchmarkId::new("txid_and_auth_digest", label), + tx, + |b, tx| b.iter(|| tx.txid_and_auth_digest()), + ); + } + + group.finish(); +} + criterion_group! { name = benches; config = Criterion::default().noise_threshold(0.1).sample_size(50); - targets = bench_transaction_deserialize + targets = bench_transaction_deserialize, bench_transaction_digest } criterion_main!(benches); diff --git a/zebra-state/src/request.rs b/zebra-state/src/request.rs index 4c512b077d5..cbf8d25e79d 100644 --- a/zebra-state/src/request.rs +++ b/zebra-state/src/request.rs @@ -40,6 +40,27 @@ use crate::{ CommitSemanticallyVerifiedError, }; +/// Times `$body` and records its duration to the named histogram when the +/// `commit-metrics` feature is enabled; otherwise just evaluates `$body` with +/// zero overhead. Used to profile checkpoint prepare phases. +macro_rules! timed_prepare_phase { + ($name:expr, $body:expr) => {{ + #[cfg(feature = "commit-metrics")] + let _start = std::time::Instant::now(); + let result = $body; + #[cfg(feature = "commit-metrics")] + metrics::histogram!($name).record(_start.elapsed().as_secs_f64()); + result + }}; +} + +/// Minimum transaction count before checkpoint prepare uses Rayon for +/// per-transaction digest fanout. +/// +/// Small blocks are faster serially because Rayon scheduling costs dominate the +/// native ZIP-244 digest work. +const MIN_PARALLEL_CHECKPOINT_PREPARE_TRANSACTIONS: usize = 16; + /// Identify a spend by a transparent outpoint or revealed nullifier. /// /// This enum implements `From` for [`transparent::OutPoint`], [`sprout::Nullifier`], @@ -549,27 +570,77 @@ impl CheckpointVerifiedBlock { } } +fn prepare_block_data( + block: &Block, +) -> ( + Arc<[transaction::Hash]>, + AuthDataRoot, + HashMap, +) { + #[cfg(feature = "commit-metrics")] + { + let transaction_count = block.transactions.len(); + let output_count: usize = block + .transactions + .iter() + .map(|transaction| transaction.outputs().len()) + .sum(); + let v5_transaction_count = block + .transactions + .iter() + .filter(|transaction| transaction.version() == 5) + .count(); + + if let Some(height) = block.coinbase_height() { + metrics::gauge!("zebra.state.prepare.block.height").set(height.0 as f64); + } + metrics::histogram!("zebra.state.prepare.block_tx_count").record(transaction_count as f64); + metrics::histogram!("zebra.state.prepare.block_output_count").record(output_count as f64); + metrics::histogram!("zebra.state.prepare.block_v5_tx_count") + .record(v5_transaction_count as f64); + } + + // Compute each transaction's txid and ZIP-244 auth digest together, for efficiency. + let (transaction_hashes, auth_digests): (Vec<_>, Vec<_>) = + timed_prepare_phase!("zebra.state.prepare.txid_auth_digest.duration_seconds", { + if block.transactions.len() < MIN_PARALLEL_CHECKPOINT_PREPARE_TRANSACTIONS { + block + .transactions + .iter() + .map(|tx| tx.txid_and_auth_digest()) + .unzip() + } else { + use rayon::prelude::*; + block + .transactions + .par_iter() + .map(|tx| tx.txid_and_auth_digest()) + .unzip() + } + }); + let transaction_hashes: Arc<[_]> = transaction_hashes.into(); + let auth_data_root = timed_prepare_phase!( + "zebra.state.prepare.auth_data_root.duration_seconds", + auth_digests + .into_iter() + .map(|auth_digest| auth_digest.unwrap_or(AUTH_DIGEST_PLACEHOLDER)) + .collect::() + ); + let new_outputs = timed_prepare_phase!( + "zebra.state.prepare.new_ordered_outputs.duration_seconds", + transparent::new_ordered_outputs(block, &transaction_hashes) + ); + + (transaction_hashes, auth_data_root, new_outputs) +} + impl SemanticallyVerifiedBlock { /// Creates [`SemanticallyVerifiedBlock`] from [`Block`] and [`block::Hash`]. pub fn with_hash(block: Arc, hash: block::Hash) -> Self { let height = block .coinbase_height() .expect("semantically verified block should have a coinbase height"); - // Compute each transaction's txid and ZIP-244 auth digest together, for efficiency - let (transaction_hashes, auth_digests): (Vec<_>, Vec<_>) = { - use rayon::prelude::*; - block - .transactions - .par_iter() - .map(|tx| tx.txid_and_auth_digest()) - .unzip() - }; - let transaction_hashes: Arc<[_]> = transaction_hashes.into(); - let auth_data_root = auth_digests - .into_iter() - .map(|auth_digest| auth_digest.unwrap_or(AUTH_DIGEST_PLACEHOLDER)) - .collect::(); - let new_outputs = transparent::new_ordered_outputs(&block, &transaction_hashes); + let (transaction_hashes, auth_data_root, new_outputs) = prepare_block_data(&block); Self { block, @@ -604,21 +675,7 @@ impl From> for SemanticallyVerifiedBlock { let height = block .coinbase_height() .expect("semantically verified block should have a coinbase height"); - // Compute each transaction's txid and ZIP-244 auth digest together, for efficiency - let (transaction_hashes, auth_digests): (Vec<_>, Vec<_>) = { - use rayon::prelude::*; - block - .transactions - .par_iter() - .map(|tx| tx.txid_and_auth_digest()) - .unzip() - }; - let transaction_hashes: Arc<[_]> = transaction_hashes.into(); - let auth_data_root = auth_digests - .into_iter() - .map(|auth_digest| auth_digest.unwrap_or(AUTH_DIGEST_PLACEHOLDER)) - .collect::(); - let new_outputs = transparent::new_ordered_outputs(&block, &transaction_hashes); + let (transaction_hashes, auth_data_root, new_outputs) = prepare_block_data(&block); Self { block, From 7fb4ffed3f8bc50d1db7418a14d98c54aa21c13c Mon Sep 17 00:00:00 2001 From: Roman Akhtariev Date: Sat, 20 Jun 2026 01:15:39 -0300 Subject: [PATCH 11/16] perf(state): precompute note-commitment tree hashing off the committer (#144) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * perf(state): precompute note-commitment tree hashing off the committer [prototype] Move the dominant per-block committer cost — the Sapling/Orchard note-commitment tree update — off the single serial committer thread. `parallel_append` is split into `precompute_subtree_roots` (the per-leaf Merkle hashing, position-independent: it needs only the starting note count, not the frontier's hashes) and `graft` (the cheap O(log N) merge on the committer). The finalized write loop runs a 1-block look-ahead: before committing block N it spawns block N+1's hashing on the commit-compute pool, so the hashing overlaps N's commit on otherwise-idle cores; the committer then only grafts. A size-match guard makes a stale precompute fall back to inline hashing, so this can only affect speed, never correctness. Byte-identical to the inline append (differential proptests over the split and the tracked-subtree boundary). `NOTE_PRECOMPUTE_DISABLE` env var forces the inline path for single-binary A/B benchmarking. A/B over the sandblast region (1.71M-1.735M): committer update_trees -54% (12.5 -> 5.7 ms/block). Throughput was flat there because that window is feed- bound (downloads buffered, CPU ~3.5/8), not committer-bound; the win applies where the committer is the gate. Prototype pending the feed instrumentation. * tests, clean up, changelog * fix(chain): use checked arithmetic for precompute capacity check The precompute batch path takes a caller-supplied start_size, so start_size + nodes.len() could wrap past the MAX_LEAVES capacity check (and panic on overflow in debug builds) for values near u64::MAX, building an inconsistent precompute that could later panic in graft. Use checked_add and reject over-capacity sizes with a clean MaxDepthExceeded error. Adds a regression test. * fix(chain): return recoverable errors from precompute helpers instead of panicking The precompute and graft helpers enforced caller-controlled preconditions with panicking assertions: precompute_subtree_roots / precompute_append_ batch_with_subtree on an empty batch, and graft on a frontier size that did not match the precompute's start position. Reachable via the public BlockNotePrecompute path, these turned invalid input into a process panic. Replace the assertions with recoverable BatchFrontierError variants (EmptyBatch, PrecomputeStartMismatch) and map them through a new NoteCommitmentTreeError::InvalidPrecompute in the Sapling/Orchard wrappers. The in-node path is unaffected (it guards empty note sets and size-matches before applying). Adds tests. * perf(chain): run Sapling and Orchard precompute concurrently BlockNotePrecompute::compute hashed the two pools sequentially. Although each pool's append is internally parallel, the two no longer overlapped the way update_trees_parallel's per-pool spawn_fifo tasks did. Restore the cross-pool overlap with rayon::join. * perf(chain): gate note-commitment precompute parallelism on batch size Below PARALLEL_HASH_THRESHOLD (16) note commitments, the per-leaf Merkle hashing now runs entirely serially: benchmarks show that for small batches the rayon join/par_iter overhead matches or exceeds the hashing it parallelizes (crossover ~16 for both Sapling Pedersen and Orchard Sinsemilla), and most blocks outside the sandblast region are small. The gate is on the whole-batch decision only; above the threshold each chunk still splits down to the leaves, so medium batches keep their internal parallelism. BlockNotePrecompute::compute likewise only spawns the cross-pool rayon::join when a pool is large enough to repay it. Adds the precompute_threshold benchmark (and bench-only precompute_then_ graft_root shims) used to find the crossover. Correctness is unchanged and covered by the existing differential proptests. * fix(state): make the look-ahead note precompute cancellable The finalized write loop starts the next block's note-commitment precompute before the current block has committed, to overlap the hashing with the commit. A current block that fails to commit (e.g. a checkpoint-range block whose authorizing-data commitment is only rejected at finalized-state commit) leaves that speculative work unwanted, and the spawned task previously had no cancellation path: it hashed the discarded child in full before noticing the receiver was dropped. Thread an Arc cancellation flag through spawn_note_precompute into BlockNotePrecompute::compute. The two pools are now hashed sequentially (each still internally parallel) so the flag is checked between them; the writer trips it whenever it drops a pending precompute (commit failure, parent-failure skip, height mismatch, or hash mismatch), bounding the wasted work for a discarded child to at most one pool. Correctness is unaffected (the committer still size-checks before applying). Adds a cancellation test. Also normalizes the prior 'graft' terminology to 'apply_precompute'. * perf(chain): keep the cross-pool join in the cancellable precompute Restore the rayon::join (and small-block sequential gating) for the two pools in BlockNotePrecompute::compute, which was dropped when compute was made cancellable. Cancellation is now done by checking the flag up front and at the start of each pool's hashing rather than strictly between the pools, so the cross-pool overlap is preserved while a cancel that lands before a pool starts still skips its work. * fix(chain): bind note precompute to its block, not just the tree size A BlockNotePrecompute was selected solely by start_size == tree.count(), and in that branch the block's own note-commitment arguments were ignored in favor of the precompute's leaves. A precompute accidentally paired with a different block of the same starting tree size would therefore be grafted, silently producing a wrong note-commitment root. The node avoided this by pairing each precompute with the exact block hash in the write loop, but that invariant lived outside zebra-chain's API. Record the block hash in BlockNotePrecompute::compute and have update_trees_parallel_with apply the precompute only when its block_hash matches the block being committed; a mismatch falls back to inline hashing (correct, just slower). Adds a test that a precompute for a different block at the same starting size is rejected. --- CHANGELOG.md | 11 + zebra-chain/Cargo.toml | 5 + zebra-chain/benches/precompute_threshold.rs | 96 ++++ zebra-chain/src/orchard/tree.rs | 98 ++++ zebra-chain/src/parallel/batch_frontier.rs | 424 +++++++++++++++++- zebra-chain/src/parallel/tree.rs | 307 ++++++++++++- zebra-chain/src/sapling/tree.rs | 264 +++++++++++ .../src/service/check/tests/nullifier.rs | 18 +- zebra-state/src/service/check/tests/utxo.rs | 8 +- zebra-state/src/service/finalized_state.rs | 55 ++- .../disk_format/tests/snapshot.rs | 2 +- .../src/service/finalized_state/tests/prop.rs | 3 + .../service/finalized_state/tests/rollback.rs | 2 +- .../zebra_db/block/tests/prune.rs | 20 +- .../zebra_db/block/tests/snapshot.rs | 2 +- .../service/finalized_state/zebra_db/prune.rs | 2 +- zebra-state/src/service/write.rs | 121 ++++- zebra-state/src/tests/setup.rs | 2 +- 18 files changed, 1389 insertions(+), 51 deletions(-) create mode 100644 zebra-chain/benches/precompute_threshold.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 460fb785503..7c4738fff1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,17 @@ and this project adheres to [Semantic Versioning](https://semver.org). hosts (~20 → ~42 blk/s on an 8-core machine at 1.7M height). A new default-off `commit-metrics` feature emits per-block timing histograms (`zebra.state.write.*`) for future profiling. +- Precompute note-commitment tree hashing ahead of the finalized committer. The + per-leaf Merkle hashing for a block (the dominant committer cost on shielded + blocks) depends only on the starting note count, not the frontier's hashes, so + the finalized write loop now does a one-block look-ahead and runs the next + block's Sapling/Orchard hashing on idle cores while the current block commits; + the committer then only applies the precomputed subtree roots onto the frontier + (`update_trees_parallel_with` in `zebra-chain`). The precompute is applied only + if its starting tree size still matches at commit time and otherwise falls back + to inline hashing, so it affects only speed, never the resulting tree. This cuts + the committer's tree-update cost by ~54% (12.5 → 5.7 ms/block) where the + committer is the bottleneck. ### Changed diff --git a/zebra-chain/Cargo.toml b/zebra-chain/Cargo.toml index 2d0bc55361d..96c78903b6e 100644 --- a/zebra-chain/Cargo.toml +++ b/zebra-chain/Cargo.toml @@ -173,5 +173,10 @@ harness = false name = "note_commitment_hash" harness = false +[[bench]] +name = "precompute_threshold" +harness = false +required-features = ["bench"] + [lints] workspace = true diff --git a/zebra-chain/benches/precompute_threshold.rs b/zebra-chain/benches/precompute_threshold.rs new file mode 100644 index 00000000000..fd051f84539 --- /dev/null +++ b/zebra-chain/benches/precompute_threshold.rs @@ -0,0 +1,96 @@ +//! Benchmarks to find where the precompute's rayon parallelism stops paying off. +//! +//! For a range of per-block note counts, this compares: +//! - `serial`: appending the notes one at a time to a fresh tree (no rayon), the +//! cost the committer pays inline today; and +//! - `parallel`: `NoteCommitmentTree::precompute_append` (rayon `into_par_iter` + +//! `rayon::join`), the off-committer precompute. +//! +//! The crossover — the smallest count where `parallel` beats `serial` — is the +//! point below which gating off rayon (hashing serially) avoids paying overhead +//! that does not buy anything. Orchard's Sinsemilla `combine` dominates, so it is +//! the meaningful pool to measure; Sapling is shown as a control. + +// Disabled due to warnings in criterion macros +#![allow(missing_docs)] + +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use halo2::pasta::pallas; + +use zebra_chain::orchard::tree::NoteCommitmentTree as OrchardTree; +use zebra_chain::sapling::tree::NoteCommitmentTree as SaplingTree; + +/// Note counts spanning the small-batch region where rayon overhead is expected +/// to dominate, up to sizes where parallelism clearly wins. +const NOTE_COUNTS: &[usize] = &[1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]; + +fn orchard_notes(count: usize) -> Vec { + // Small integers are canonical Pallas field elements. + (0..count as u64).map(pallas::Base::from).collect() +} + +fn sapling_notes(count: usize) -> Vec { + (0..count as u64) + .map(|value| { + let mut bytes = [0u8; 32]; + bytes[..8].copy_from_slice(&value.to_le_bytes()); + Option::from(sapling_crypto::note::ExtractedNoteCommitment::from_bytes( + &bytes, + )) + .expect("small little-endian integer is a canonical Jubjub base") + }) + .collect() +} + +fn bench_orchard(c: &mut Criterion) { + let mut group = c.benchmark_group("orchard_precompute_threshold"); + + for &count in NOTE_COUNTS { + let notes = orchard_notes(count); + group.throughput(Throughput::Elements(count as u64)); + + group.bench_with_input(BenchmarkId::new("serial", count), ¬es, |b, notes| { + b.iter(|| { + let mut tree = OrchardTree::default(); + for note in notes { + tree.append(*black_box(note)).expect("tree is not full"); + } + black_box(tree.root()); + }) + }); + + group.bench_with_input(BenchmarkId::new("parallel", count), ¬es, |b, notes| { + b.iter(|| black_box(OrchardTree::precompute_then_apply_root(black_box(notes)))) + }); + } + + group.finish(); +} + +fn bench_sapling(c: &mut Criterion) { + let mut group = c.benchmark_group("sapling_precompute_threshold"); + + for &count in NOTE_COUNTS { + let notes = sapling_notes(count); + group.throughput(Throughput::Elements(count as u64)); + + group.bench_with_input(BenchmarkId::new("serial", count), ¬es, |b, notes| { + b.iter(|| { + let mut tree = SaplingTree::default(); + for note in notes { + tree.append(*black_box(note)).expect("tree is not full"); + } + black_box(tree.root()); + }) + }); + + group.bench_with_input(BenchmarkId::new("parallel", count), ¬es, |b, notes| { + b.iter(|| black_box(SaplingTree::precompute_then_apply_root(black_box(notes)))) + }); + } + + group.finish(); +} + +criterion_group!(benches, bench_orchard, bench_sapling); +criterion_main!(benches); diff --git a/zebra-chain/src/orchard/tree.rs b/zebra-chain/src/orchard/tree.rs index 099b8f21905..8da1f13e1ca 100644 --- a/zebra-chain/src/orchard/tree.rs +++ b/zebra-chain/src/orchard/tree.rs @@ -28,12 +28,30 @@ use zcash_primitives::merkle_tree::HashSer; use sinsemilla::HashDomain; use crate::{ + parallel::batch_frontier::{ + apply_append_batch_with_subtree, precompute_append_batch_with_subtree, BatchFrontierError, + PrecomputedSubtreeAppend, + }, serialization::{ serde_helpers, ReadZcashExt, SerializationError, ZcashDeserialize, ZcashSerialize, }, subtree::{NoteCommitmentSubtreeIndex, TRACKED_SUBTREE_HEIGHT}, }; +/// The precomputed parallel-append work for one block's Orchard note commitments, +/// produced off the committer by [`NoteCommitmentTree::precompute_append`] and +/// applied with [`NoteCommitmentTree::apply_precomputed_append`]. +#[derive(Clone, Debug)] +pub(crate) struct PrecomputedAppendBatch(PrecomputedSubtreeAppend); + +impl PrecomputedAppendBatch { + /// The tree size (leaf [`count`](NoteCommitmentTree::count)) this precompute + /// must be applied to. + pub(crate) fn start_size(&self) -> u64 { + self.0.start_size() + } +} + pub mod legacy; use legacy::LegacyNoteCommitmentTree; @@ -344,6 +362,25 @@ impl<'de> serde::Deserialize<'de> for Node { pub enum NoteCommitmentTreeError { #[error("The note commitment tree is full")] FullTree, + + #[error("Invalid precompute: empty batch, stale start size, or multi-subtree batch")] + InvalidPrecompute, +} + +impl From for NoteCommitmentTreeError { + fn from(error: BatchFrontierError) -> Self { + match error { + // A capacity overflow is the tree being full. + BatchFrontierError::Frontier(_) => NoteCommitmentTreeError::FullTree, + // The remaining variants are caller-supplied precompute misuse, which + // is reported as a recoverable error rather than panicking. + BatchFrontierError::BatchSpansMultipleSubtrees + | BatchFrontierError::EmptyBatch + | BatchFrontierError::PrecomputeStartMismatch { .. } => { + NoteCommitmentTreeError::InvalidPrecompute + } + } + } } /// Orchard Incremental Note Commitment Tree @@ -458,6 +495,67 @@ impl NoteCommitmentTree { })) } + /// Precomputes the parallel-append work for `note_commitments` against a tree + /// of size `start_size`, off the committer. See the Sapling equivalent. Returns + /// [`NoteCommitmentTreeError::InvalidPrecompute`] for an empty `note_commitments`, + /// rather than panicking. + pub(crate) fn precompute_append( + start_size: u64, + note_commitments: &[NoteCommitmentUpdate], + ) -> Result { + let nodes: Vec = note_commitments + .iter() + .map(|commitment_x| (*commitment_x).into()) + .collect(); + + let inner = precompute_append_batch_with_subtree::<_, MERKLE_DEPTH>(start_size, &nodes)?; + + Ok(PrecomputedAppendBatch(inner)) + } + + /// Applies a [`PrecomputedAppendBatch`] from [`Self::precompute_append`], + /// returning any completed [`TRACKED_SUBTREE_HEIGHT`] subtree, exactly like + /// [`Self::append_batch`]. `precomputed.start_size()` must equal this tree's + /// [`count`](Self::count); a stale precompute returns + /// [`NoteCommitmentTreeError::InvalidPrecompute`] (rather than panicking) so + /// callers can fall back to [`Self::append_batch`]. + #[allow(clippy::unwrap_in_result)] + pub(crate) fn apply_precomputed_append( + &mut self, + precomputed: PrecomputedAppendBatch, + ) -> Result, NoteCommitmentTreeError> { + let (frontier, completed) = + apply_append_batch_with_subtree(self.inner.clone(), precomputed.0)?; + + self.inner = frontier; + *self + .cached_root + .get_mut() + .expect("a thread that previously held exclusive lock access panicked") = None; + + Ok(completed.map(|(index_value, root)| { + let index = NoteCommitmentSubtreeIndex( + index_value.try_into().expect("subtree index fits in u16"), + ); + (index, root) + })) + } + + /// Benchmark-only: precompute the parallel append for `note_commitments` + /// (rayon hashing), apply the precomputed subtree roots onto a fresh tree, and return the resulting root. + /// Mirrors the committer's precompute path end-to-end so the + /// `precompute_threshold` benchmark can compare it against a serial append. + #[cfg(feature = "bench")] + #[doc(hidden)] + pub fn precompute_then_apply_root(note_commitments: &[NoteCommitmentUpdate]) -> [u8; 32] { + let mut tree = NoteCommitmentTree::default(); + let precomputed = + Self::precompute_append(0, note_commitments).expect("non-empty batch in benchmark"); + tree.apply_precomputed_append(precomputed) + .expect("fresh tree matches start size 0"); + tree.root().into() + } + /// Returns frontier of non-empty tree, or `None` if the tree is empty. fn frontier(&self) -> Option<&NonEmptyFrontier> { self.inner.value() diff --git a/zebra-chain/src/parallel/batch_frontier.rs b/zebra-chain/src/parallel/batch_frontier.rs index 12ae93fe7f7..621ca8526f7 100644 --- a/zebra-chain/src/parallel/batch_frontier.rs +++ b/zebra-chain/src/parallel/batch_frontier.rs @@ -46,6 +46,18 @@ pub enum BatchFrontierError { /// The batch would complete more than one tracked subtree. BatchSpansMultipleSubtrees, + + /// A precompute was requested for, or applied to, an empty batch of leaves. + EmptyBatch, + + /// A precompute was applied to a frontier whose size does not match the size + /// the precompute was computed against (a stale look-ahead). + PrecomputeStartMismatch { + /// The tree size the precompute was computed against. + expected: u64, + /// The actual size of the frontier the precompute was applied to. + found: u64, + }, } impl fmt::Display for BatchFrontierError { @@ -57,6 +69,15 @@ impl fmt::Display for BatchFrontierError { BatchFrontierError::BatchSpansMultipleSubtrees => { write!(f, "batch spans more than one tracked subtree boundary") } + BatchFrontierError::EmptyBatch => { + write!(f, "precompute requested for an empty batch of leaves") + } + BatchFrontierError::PrecomputeStartMismatch { expected, found } => { + write!( + f, + "precompute computed for tree size {expected} applied to a frontier of size {found}" + ) + } } } } @@ -104,9 +125,24 @@ fn merge_complete_subtree( } } +/// Below this many leaves in a batch, the per-leaf Merkle hashing is done entirely +/// serially (no rayon at all). Benchmarks (`precompute_threshold`) show that for +/// small batches the rayon `join`/`par_iter` overhead matches or exceeds the +/// hashing it parallelizes — the crossover is ~16 note commitments for both +/// Sapling Pedersen and Orchard Sinsemilla — so gating below it avoids paying for +/// parallelism that does not buy anything on the common small/empty blocks. +/// +/// This gates the *whole-batch* decision only. Above it, the per-chunk reduction +/// still splits all the way down (see [`perfect_subtree_root`]): the largest chunk +/// of a medium batch benefits from internal parallelism, so capping the split +/// granularity here would instead *serialize* that chunk and regress medium +/// batches. +pub(crate) const PARALLEL_HASH_THRESHOLD: usize = 16; + /// Computes the root of a perfect subtree of exactly `2^k` `leaves`, using a -/// parallel divide-and-conquer reduction. The combine hashes within and across -/// the two halves are independent, so this scales across the rayon pool. +/// parallel divide-and-conquer reduction across the rayon pool. The combine hashes +/// within and across the two halves are independent, so this scales across cores. +/// Used for large batches; small batches use [`perfect_subtree_root_serial`]. fn perfect_subtree_root(leaves: &[H]) -> H { debug_assert!(leaves.len().is_power_of_two()); if leaves.len() == 1 { @@ -123,6 +159,23 @@ fn perfect_subtree_root(leaves: &[H]) -> H { H::combine(child_level, &l, &r) } +/// Serial reduction of a perfect subtree of exactly `2^k` `leaves`, with no rayon +/// overhead. Used for small batches (see [`PARALLEL_HASH_THRESHOLD`]). +fn perfect_subtree_root_serial(leaves: &[H]) -> H { + debug_assert!(leaves.len().is_power_of_two()); + if leaves.len() == 1 { + return leaves[0].clone(); + } + let half = leaves.len() / 2; + let child_level = Level::from(half.trailing_zeros() as u8); + let (left, right) = leaves.split_at(half); + H::combine( + child_level, + &perfect_subtree_root_serial(left), + &perfect_subtree_root_serial(right), + ) +} + /// Returns true if the leaves before the frontier tip include a complete /// `2^level` subtree. /// @@ -366,6 +419,239 @@ where } } +// --- Off-committer precompute / apply_precompute split --------------------------------- +// +// [`parallel_append`] does two things: it hashes the new leaves into complete +// subtree roots (the dominant cost on heavy shielded blocks), and it merges +// those roots onto the existing frontier. The hashing depends only on the +// starting leaf *position*, not on the frontier's hashes, so it can run ahead of +// the committer, concurrently across many blocks. [`precompute_subtree_roots`] +// does that hashing; [`apply_precompute`] does the cheap merge on the committer. Their +// composition is byte-identical to [`parallel_append`] (differential proptests). + +/// The position-independent result of appending a run of `num_leaves` leaves +/// starting at tree size [`start_position`](Self::start_position): the +/// parallel-hashed complete subtree roots, plus the last (raw tip) leaf. +#[derive(Clone, Debug)] +pub(crate) struct PrecomputedAppend { + /// Tree size (next leaf position) this was hashed against. [`apply_precompute`] must be + /// applied to a frontier of exactly this size. + start_position: u64, + /// Number of leaves in the run (>= 1). + num_leaves: usize, + /// `(level, root)` for each complete subtree chunk of the first + /// `num_leaves - 1` leaves, in ascending position order. + chunk_roots: Vec<(usize, H)>, + /// The last leaf, which becomes the applied frontier's raw tip. + tip_leaf: H, +} + +/// Hashes the complete subtree roots for appending `new_leaves` to a tree of size +/// `start_position`, in parallel. The expensive, position-independent half of +/// [`parallel_append`]; pair with [`apply_precompute`]. +/// +/// Returns [`BatchFrontierError::EmptyBatch`] if `new_leaves` is empty: the +/// precompute represents a non-empty append (its tip is the last leaf), so an +/// empty batch is reported as a recoverable error rather than panicking. +pub(crate) fn precompute_subtree_roots( + start_position: u64, + new_leaves: &[H], +) -> Result, BatchFrontierError> +where + H: Hashable + Clone + Send + Sync, +{ + let num_leaves = new_leaves.len(); + let (tip_leaf, leaves_to_merge) = new_leaves + .split_last() + .ok_or(BatchFrontierError::EmptyBatch)?; + let tip_leaf = tip_leaf.clone(); + + let chunks = complete_subtree_chunks(start_position, leaves_to_merge); + // Small batches hash entirely serially (no rayon); larger batches fan the chunks + // out across the pool and split each chunk down to the leaves. See + // [`PARALLEL_HASH_THRESHOLD`]. + let chunk_roots: Vec<(usize, H)> = if leaves_to_merge.len() <= PARALLEL_HASH_THRESHOLD { + chunks + .into_iter() + .map(|(level, leaves)| (level, perfect_subtree_root_serial(leaves))) + .collect() + } else { + chunks + .into_par_iter() + .map(|(level, leaves)| (level, perfect_subtree_root(leaves))) + .collect() + }; + + Ok(PrecomputedAppend { + start_position, + num_leaves, + chunk_roots, + tip_leaf, + }) +} + +/// Merges a [`PrecomputedAppend`] onto `frontier`, returning the updated frontier. +/// The cheap, committer-side half of [`parallel_append`] (O(log N) merges). +/// +/// The frontier's size MUST equal the precompute's `start_position`. Callers +/// compare and recompute via [`parallel_append`] on mismatch, so a mismatch here +/// is reported as a recoverable [`BatchFrontierError::PrecomputeStartMismatch`] +/// (a stale precompute must not panic the process). +pub(crate) fn apply_precompute( + frontier: Frontier, + precomputed: PrecomputedAppend, +) -> Result, BatchFrontierError> +where + H: Hashable + Clone + Send + Sync, +{ + let (mut complete_subtree_roots, next_leaf_position) = + frontier_complete_subtree_roots(&frontier); + + if next_leaf_position != precomputed.start_position { + return Err(BatchFrontierError::PrecomputeStartMismatch { + expected: precomputed.start_position, + found: next_leaf_position, + }); + } + + for (level, root) in precomputed.chunk_roots { + merge_complete_subtree(&mut complete_subtree_roots, level, root); + } + + let new_tip_position = next_leaf_position + (precomputed.num_leaves as u64 - 1); + let complete_subtree_roots = complete_subtree_roots.into_iter().flatten().collect(); + + Ok(Frontier::from_parts( + Position::from(new_tip_position), + precomputed.tip_leaf, + complete_subtree_roots, + )?) +} + +/// The precomputed form of [`append_batch_with_subtree`]: the parallel hashing for +/// one block's nodes, split at the tracked-subtree boundary if it crosses one. +/// Produced by [`precompute_append_batch_with_subtree`] off the committer and +/// applied with [`apply_append_batch_with_subtree`]. +#[derive(Clone, Debug)] +pub(crate) struct PrecomputedSubtreeAppend { + /// Tree size this was hashed against; the frontier it is applied to must match. + start_size: u64, + inner: PrecomputedSubtreeKind, +} + +#[derive(Clone, Debug)] +enum PrecomputedSubtreeKind { + /// The batch fits within one tracked-subtree window. + Single(PrecomputedAppend), + /// The batch crosses one tracked-subtree boundary, completing the subtree at + /// `index_value`. `head` ends the subtree; `tail` continues after it (`None` + /// if the batch ends exactly on the boundary). + Boundary { + head: PrecomputedAppend, + tail: Option>, + index_value: u64, + }, +} + +impl PrecomputedSubtreeAppend { + /// The tree size this precompute assumes — the frontier `tree_size` it must + /// be applied to. + pub(crate) fn start_size(&self) -> u64 { + self.start_size + } +} + +/// Precomputes the parallel hashing for appending `nodes` to a tree of size +/// `start_size`, off the committer. Mirrors [`append_batch_with_subtree`]'s +/// boundary handling. `nodes` must be non-empty. +pub(crate) fn precompute_append_batch_with_subtree( + start_size: u64, + nodes: &[H], +) -> Result, BatchFrontierError> +where + H: Hashable + Clone + Send + Sync, +{ + use crate::subtree::TRACKED_SUBTREE_HEIGHT; + + if nodes.is_empty() { + return Err(BatchFrontierError::EmptyBatch); + } + + let new_size = start_size + .checked_add(nodes.len() as u64) + .filter(|&new_size| new_size <= TreeCapacity::::MAX_LEAVES) + .ok_or(BatchFrontierError::Frontier( + FrontierError::MaxDepthExceeded { + depth: DEPTH.saturating_add(1), + }, + ))?; + + let subtree_size = 1u64 << TRACKED_SUBTREE_HEIGHT; + let boundary = (start_size / subtree_size) + .checked_add(1) + .and_then(|n| n.checked_mul(subtree_size)); + if boundary + .and_then(|b| b.checked_add(subtree_size)) + .is_some_and(|second_boundary| second_boundary <= new_size) + { + return Err(BatchFrontierError::BatchSpansMultipleSubtrees); + } + + let inner = if boundary.is_some_and(|b| b <= new_size) { + let boundary = boundary.expect("checked above"); + let head_len = (boundary - start_size) as usize; + let (head, tail) = nodes.split_at(head_len); + let index_value = (boundary >> TRACKED_SUBTREE_HEIGHT) - 1; + PrecomputedSubtreeKind::Boundary { + head: precompute_subtree_roots(start_size, head)?, + tail: (!tail.is_empty()) + .then(|| precompute_subtree_roots(boundary, tail)) + .transpose()?, + index_value, + } + } else { + PrecomputedSubtreeKind::Single(precompute_subtree_roots(start_size, nodes)?) + }; + + Ok(PrecomputedSubtreeAppend { start_size, inner }) +} + +/// Applies a [`PrecomputedSubtreeAppend`] onto `frontier`, returning the completed +/// tracked subtree's `(index_value, root)` if the batch crossed a boundary. The +/// counterpart to [`precompute_append_batch_with_subtree`]; byte-identical to +/// [`append_batch_with_subtree`]. +pub(crate) fn apply_append_batch_with_subtree( + frontier: Frontier, + precomputed: PrecomputedSubtreeAppend, +) -> Result<(Frontier, Option<(u64, H)>), BatchFrontierError> +where + H: Hashable + Clone + Send + Sync, +{ + use crate::subtree::TRACKED_SUBTREE_HEIGHT; + + match precomputed.inner { + PrecomputedSubtreeKind::Single(pre) => Ok((apply_precompute(frontier, pre)?, None)), + PrecomputedSubtreeKind::Boundary { + head, + tail, + index_value, + } => { + let f1 = apply_precompute(frontier, head)?; + // The boundary subtree root needs the applied head, so it is computed + // here on the committer (rare: once per 2^16 leaves). + let root = f1 + .value() + .expect("just appended at least one leaf") + .root(Some(Level::from(TRACKED_SUBTREE_HEIGHT))); + let f2 = match tail { + Some(tail) => apply_precompute(f1, tail)?, + None => f1, + }; + Ok((f2, Some((index_value, root)))) + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -559,6 +845,66 @@ mod tests { "frontier parts mismatch" ); } + + /// The off-committer split: precompute the subtree roots keyed only on the + /// starting leaf *count* (no frontier hashes), then apply the precomputed subtree roots onto the real + /// frontier. Must be byte-identical to the sequential append, proving the + /// precompute can run ahead of the committer using just the note position. + #[test] + fn precompute_then_apply_precompute_matches_sequential( + prefix_len in 0usize..300, + batch in proptest::collection::vec(any::().prop_map(TestNode), 1..300), + ) { + let prefix: Vec = (0..prefix_len as u64).map(TestNode).collect(); + let start = build_frontier::(&prefix); + + // Precompute is given only the count (prefix_len), not `start`. + let precomputed = precompute_subtree_roots(prefix_len as u64, &batch) + .expect("non-empty batch in tests"); + prop_assert_eq!(precomputed.start_position, prefix_len as u64); + + let seq = sequential_append::(start.clone(), &batch); + let applied = apply_precompute(start, precomputed).expect("no overflow in tests"); + + prop_assert_eq!(seq.root(), applied.root(), "root mismatch"); + prop_assert_eq!( + seq.value().map(|f| f.clone().into_parts()), + applied.value().map(|f| f.clone().into_parts()), + "frontier parts mismatch" + ); + } + + /// The precomputed batch-with-subtree path (off-committer precompute + apply_precompute) + /// must produce the same frontier AND the same completed-subtree result as + /// the inline `append_batch_with_subtree`, across the tracked-subtree boundary. + #[test] + fn precompute_subtree_matches_append_batch_with_subtree( + prefix_len in 0u64..300, + batch_len in 1usize..300, + ) { + // Exercise the boundary by starting just below it, so some batches cross it. + use crate::subtree::TRACKED_SUBTREE_HEIGHT; + let boundary = 1u64 << TRACKED_SUBTREE_HEIGHT; + let start_size = boundary - 1 - prefix_len.min(boundary - 1); + let prefix: Vec = (0..start_size).map(TestNode).collect(); + let start = build_frontier::(&prefix); + let batch: Vec = (1000..1000 + batch_len as u64).map(TestNode).collect(); + + let (inline_frontier, inline_completed) = + append_batch_with_subtree::<_, DEPTH>(start.clone(), batch.clone()) + .expect("no overflow in tests"); + + let precomputed = + precompute_append_batch_with_subtree::<_, DEPTH>(start_size, &batch) + .expect("no overflow in tests"); + prop_assert_eq!(precomputed.start_size(), start_size); + let (pre_frontier, pre_completed) = + apply_append_batch_with_subtree(start, precomputed) + .expect("no overflow in tests"); + + prop_assert_eq!(inline_frontier.root(), pre_frontier.root(), "root mismatch"); + prop_assert_eq!(inline_completed, pre_completed, "completed subtree mismatch"); + } } /// Spot-check small exhaustive sizes for off-by-one boundary bugs. @@ -626,6 +972,80 @@ mod tests { ); } + /// A caller-supplied `start_size` near `u64::MAX` must report a clean capacity + /// error rather than wrapping past the `MAX_LEAVES` check (which would build an + /// inconsistent precompute and panic in `apply_precompute`, or panic on overflow in debug + /// builds). + #[test] + fn precompute_start_size_overflow_is_reported() { + let batch = [TestNode(1), TestNode(2)]; + + let is_capacity_error = |result| { + matches!( + result, + Err(BatchFrontierError::Frontier( + FrontierError::MaxDepthExceeded { .. } + )) + ) + }; + + // `start_size + nodes.len()` overflows u64. + assert!( + is_capacity_error(precompute_append_batch_with_subtree::<_, DEPTH>( + u64::MAX - 1, + &batch + )), + "overflowing start_size must report a capacity error" + ); + + // `start_size` past the tree's capacity without overflowing u64. + assert!( + is_capacity_error(precompute_append_batch_with_subtree::<_, DEPTH>( + TreeCapacity::::MAX_LEAVES, + &batch + )), + "start_size at capacity must report a capacity error" + ); + } + + /// Empty input is a recoverable error, not a panic: the precompute represents a + /// non-empty append (its tip is the last leaf). + #[test] + fn precompute_empty_batch_is_reported() { + let empty: [TestNode; 0] = []; + + assert_eq!( + precompute_subtree_roots(0, &empty).err(), + Some(BatchFrontierError::EmptyBatch), + "precompute_subtree_roots rejects an empty slice" + ); + assert_eq!( + precompute_append_batch_with_subtree::<_, DEPTH>(0, &empty).err(), + Some(BatchFrontierError::EmptyBatch), + "precompute_append_batch_with_subtree rejects an empty slice" + ); + } + + /// Applying a precompute onto a frontier of the wrong size is a recoverable + /// error, not a panic, so a stale look-ahead can never crash the process. + #[test] + fn apply_precompute_size_mismatch_is_reported() { + let batch = [TestNode(1), TestNode(2), TestNode(3)]; + // Precompute is keyed on tree size 5. + let precomputed = precompute_subtree_roots(5, &batch).expect("non-empty batch"); + + // Apply it to a frontier of size 2 (a different starting size). + let frontier = build_frontier::(&[TestNode(10), TestNode(11)]); + assert_eq!( + apply_precompute(frontier, precomputed).err(), + Some(BatchFrontierError::PrecomputeStartMismatch { + expected: 5, + found: 2, + }), + "apply_precompute reports a size mismatch instead of panicking" + ); + } + /// Batches that would complete more than one tracked subtree are rejected, /// because the return type can only report one completed subtree. #[test] diff --git a/zebra-chain/src/parallel/tree.rs b/zebra-chain/src/parallel/tree.rs index da4981d7cb8..9f4efea18c6 100644 --- a/zebra-chain/src/parallel/tree.rs +++ b/zebra-chain/src/parallel/tree.rs @@ -1,12 +1,17 @@ //! Parallel note commitment tree update methods. -use std::sync::Arc; +use std::sync::{ + atomic::{AtomicBool, Ordering}, + Arc, +}; use thiserror::Error; use crate::{ - block::Block, - orchard, sapling, sprout, + block::{self, Block}, + orchard, + parallel::batch_frontier::PARALLEL_HASH_THRESHOLD, + sapling, sprout, subtree::{NoteCommitmentSubtree, NoteCommitmentSubtreeIndex}, }; @@ -57,6 +62,24 @@ impl NoteCommitmentTrees { pub fn update_trees_parallel( &mut self, block: &Arc, + ) -> Result<(), NoteCommitmentTreeError> { + self.update_trees_parallel_with(block, None) + } + + /// Like [`update_trees_parallel`](Self::update_trees_parallel), but applies a + /// [`BlockNotePrecompute`] computed ahead of time off the committer when one is + /// supplied and still matches the current tree sizes. + /// + /// The Sapling/Orchard per-leaf Merkle hashing is the dominant cost of + /// committing a shielded block; precomputing it concurrently (keyed only on the + /// note position) lets the committer do just the cheap apply the precomputed subtree roots. A `None` or + /// size-mismatched precompute transparently falls back to hashing inline, so the + /// result is always identical to the plain update. + #[allow(clippy::unwrap_in_result)] + pub fn update_trees_parallel_with( + &mut self, + block: &Arc, + precompute: Option, ) -> Result<(), NoteCommitmentTreeError> { let block = block.clone(); let height = block @@ -75,6 +98,17 @@ impl NoteCommitmentTrees { let sapling_note_commitments: Vec<_> = block.sapling_note_commitments().cloned().collect(); let orchard_note_commitments: Vec<_> = block.orchard_note_commitments().cloned().collect(); + // Only use the precompute if it was computed for this exact block. A + // precompute is otherwise keyed only by starting tree size, so without this + // check one accidentally paired with a different block of the same starting + // size would apply the wrong leaves and silently produce a wrong root. A + // mismatch (or `None`) falls back to inline hashing, which is correct, just + // slower — so this can only cost speed, never correctness. + let (sapling_precompute, orchard_precompute) = match precompute { + Some(p) if p.block_hash == block.hash() => (p.sapling, p.orchard), + _ => (None, None), + }; + let mut sprout_result = None; let mut sapling_result = None; let mut orchard_result = None; @@ -91,18 +125,20 @@ impl NoteCommitmentTrees { if !sapling_note_commitments.is_empty() { scope.spawn_fifo(|_scope| { - sapling_result = Some(Self::update_sapling_note_commitment_tree( + sapling_result = Some(Self::update_sapling_note_commitment_tree_with( sapling, sapling_note_commitments, + sapling_precompute, )); }); } if !orchard_note_commitments.is_empty() { scope.spawn_fifo(|_scope| { - orchard_result = Some(Self::update_orchard_note_commitment_tree( + orchard_result = Some(Self::update_orchard_note_commitment_tree_with( orchard, orchard_note_commitments, + orchard_precompute, )); }); } @@ -212,4 +248,265 @@ impl NoteCommitmentTrees { Ok((orchard, subtree_root)) } + + /// Like [`update_sapling_note_commitment_tree`](Self::update_sapling_note_commitment_tree), + /// but applies `precompute` (off-committer parallel hashing) when present and its + /// `start_size` still matches the tree; otherwise hashes inline. Identical result. + #[allow(clippy::unwrap_in_result)] + pub(crate) fn update_sapling_note_commitment_tree_with( + mut sapling: Arc, + sapling_note_commitments: Vec, + precompute: Option, + ) -> Result< + ( + Arc, + Option<(NoteCommitmentSubtreeIndex, sapling_crypto::Node)>, + ), + NoteCommitmentTreeError, + > { + let sapling_nct = Arc::make_mut(&mut sapling); + + let subtree_root = match precompute { + Some(pre) if pre.start_size() == sapling_nct.count() => { + sapling_nct.apply_precomputed_append(pre)? + } + _ => sapling_nct.append_batch(&sapling_note_commitments)?, + }; + + // Re-calculate and cache the tree root. + let _ = sapling_nct.root(); + + Ok((sapling, subtree_root)) + } + + /// Like [`update_orchard_note_commitment_tree`](Self::update_orchard_note_commitment_tree), + /// but applies `precompute` when present and size-matched; otherwise inline. Identical result. + #[allow(clippy::unwrap_in_result)] + pub(crate) fn update_orchard_note_commitment_tree_with( + mut orchard: Arc, + orchard_note_commitments: Vec, + precompute: Option, + ) -> Result< + ( + Arc, + Option<(NoteCommitmentSubtreeIndex, orchard::tree::Node)>, + ), + NoteCommitmentTreeError, + > { + let orchard_nct = Arc::make_mut(&mut orchard); + + let subtree_root = match precompute { + Some(pre) if pre.start_size() == orchard_nct.count() => { + orchard_nct.apply_precomputed_append(pre)? + } + _ => orchard_nct.append_batch(&orchard_note_commitments)?, + }; + + // Re-calculate and cache the tree root. + let _ = orchard_nct.root(); + + Ok((orchard, subtree_root)) + } +} + +/// The off-committer precomputed parallel-append work for one block's Sapling and +/// Orchard note commitments, produced by [`BlockNotePrecompute::compute`] and applied +/// via [`NoteCommitmentTrees::update_trees_parallel_with`]. +#[derive(Clone, Debug)] +pub struct BlockNotePrecompute { + /// The hash of the block this precompute was computed for. The committer + /// applies the precompute only to this exact block, so a precompute that was + /// accidentally paired with a different block (even one with the same starting + /// tree size) is rejected instead of applying the wrong leaves. See + /// [`NoteCommitmentTrees::update_trees_parallel_with`]. + pub(crate) block_hash: block::Hash, + /// Precomputed Sapling append, if the block has Sapling outputs. + pub(crate) sapling: Option, + /// Precomputed Orchard append, if the block has Orchard actions. + pub(crate) orchard: Option, +} + +impl BlockNotePrecompute { + /// Precomputes the Sapling and Orchard per-leaf Merkle hashing for `block`, + /// given the tree sizes (cumulative note counts) the block will commit at. + /// + /// Runs off the committer, concurrently across blocks. The committer then only + /// applies the precomputed subtree roots. `sapling_start` / `orchard_start` are the respective tree `count`s + /// immediately before this block; the committer re-checks them and falls back to + /// inline hashing on any mismatch. Pools with no notes (or a precompute error) + /// are left `None`, also falling back to inline. + /// + /// The Sapling and Orchard precomputes run concurrently via [`rayon::join`], + /// mirroring the per-pool parallelism of [`NoteCommitmentTrees::update_trees_parallel`]: + /// each pool's hashing is already internally parallel, and the join lets the two + /// pools overlap. For small blocks (both pools below [`PARALLEL_HASH_THRESHOLD`]) + /// they are computed sequentially, since there is too little hashing to repay the + /// cross-pool join. + /// + /// # Cancellation + /// + /// This is started speculatively for the *next* block while the *current* block + /// is still committing, so a failed or invalid current block leaves the work + /// unwanted (the committer drops the receiver). `cancel` lets the writer abort it: + /// the flag is checked once up front and again at the start of each pool's hashing, + /// so a cancel that lands before a pool starts skips that pool's work. (Once a + /// pool's hashing is under way it runs to completion — the bound is best-effort, + /// not interrupt-in-the-middle.) A cancelled call returns an empty precompute, + /// which the committer treats like any other miss and hashes inline. + pub fn compute( + sapling_start: u64, + orchard_start: u64, + block: &Block, + cancel: &AtomicBool, + ) -> Self { + let block_hash = block.hash(); + + if cancel.load(Ordering::Relaxed) { + return Self { + block_hash, + sapling: None, + orchard: None, + }; + } + + let sapling_notes: Vec<_> = block.sapling_note_commitments().cloned().collect(); + let orchard_notes: Vec<_> = block.orchard_note_commitments().cloned().collect(); + + let sapling_fn = || { + if cancel.load(Ordering::Relaxed) || sapling_notes.is_empty() { + return None; + } + sapling::tree::NoteCommitmentTree::precompute_append(sapling_start, &sapling_notes).ok() + }; + let orchard_fn = || { + if cancel.load(Ordering::Relaxed) || orchard_notes.is_empty() { + return None; + } + orchard::tree::NoteCommitmentTree::precompute_append(orchard_start, &orchard_notes).ok() + }; + + let overlap_pools = sapling_notes.len() >= PARALLEL_HASH_THRESHOLD + || orchard_notes.len() >= PARALLEL_HASH_THRESHOLD; + let (sapling, orchard) = if overlap_pools { + rayon::join(sapling_fn, orchard_fn) + } else { + (sapling_fn(), orchard_fn()) + }; + + Self { + block_hash, + sapling, + orchard, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::serialization::ZcashDeserialize; + + /// A precompute started speculatively for the next block is cancellable: when + /// the writer trips the flag (because the current block's commit failed and the + /// child will be discarded), `compute` returns an empty precompute instead of + /// hashing the block. Uses a real NU5 block with Sapling notes; the flag check + /// is identical for the Orchard pool. + #[test] + fn block_note_precompute_respects_cancellation() { + let _init_guard = zebra_test::init(); + + let block = + Block::zcash_deserialize(zebra_test::vectors::BLOCK_MAINNET_1687106_BYTES.as_slice()) + .expect("hard-coded NU5 block vector deserializes"); + + // Precondition: the block exercises the Sapling pool. + assert!( + block.sapling_note_commitments().next().is_some(), + "test block must have Sapling notes" + ); + + // Not cancelled: the Sapling pool is precomputed. + let live = BlockNotePrecompute::compute(0, 0, &block, &AtomicBool::new(false)); + assert!( + live.sapling.is_some(), + "a live precompute hashes the populated pool" + ); + + // Cancelled before it runs: no hashing, an empty precompute the committer + // treats as a miss (hashing inline instead). + let cancelled = BlockNotePrecompute::compute(0, 0, &block, &AtomicBool::new(true)); + assert!( + cancelled.sapling.is_none() && cancelled.orchard.is_none(), + "a cancelled precompute does no work" + ); + } + + /// A precompute is bound to the block it was computed for: applying one built for + /// a *different* block — even with the same starting tree size, which the + /// size-only guard would have accepted — must be rejected and fall back to inline + /// hashing, so it can never silently graft the wrong block's leaves. + #[test] + fn precompute_is_bound_to_its_block() { + let _init_guard = zebra_test::init(); + + // Two distinct blocks that both add Sapling notes. + let candidates: [&[u8]; 6] = [ + zebra_test::vectors::BLOCK_MAINNET_1687106_BYTES.as_slice(), + zebra_test::vectors::BLOCK_MAINNET_1687107_BYTES.as_slice(), + zebra_test::vectors::BLOCK_MAINNET_1687108_BYTES.as_slice(), + zebra_test::vectors::BLOCK_MAINNET_1687113_BYTES.as_slice(), + zebra_test::vectors::BLOCK_MAINNET_1687118_BYTES.as_slice(), + zebra_test::vectors::BLOCK_MAINNET_1687121_BYTES.as_slice(), + ]; + let sapling_blocks: Vec = candidates + .iter() + .map(|bytes| Block::zcash_deserialize(*bytes).expect("block vector deserializes")) + .filter(|block| block.sapling_note_commitments().next().is_some()) + .collect(); + assert!( + sapling_blocks.len() >= 2, + "need two distinct Sapling blocks for this test" + ); + + let block_a = Arc::new(sapling_blocks[0].clone()); + let block_b = sapling_blocks[1].clone(); + assert_ne!(block_a.hash(), block_b.hash(), "blocks must differ"); + + // The correct trees for committing block A onto the genesis trees. + let mut correct = NoteCommitmentTrees::default(); + correct + .update_trees_parallel(&block_a) + .expect("appending block A's notes succeeds"); + + // A precompute built for block B at the same starting tree size (0) as A: its + // `start_size` matches A's tree, so the size-only guard would have applied B's + // leaves. The block-hash binding must reject it instead. + let pre_b = BlockNotePrecompute::compute(0, 0, &block_b, &AtomicBool::new(false)); + assert!( + pre_b.sapling.is_some(), + "block B exercises the Sapling pool" + ); + + let mut mismatched = NoteCommitmentTrees::default(); + mismatched + .update_trees_parallel_with(&block_a, Some(pre_b)) + .expect("update succeeds"); + assert_eq!( + mismatched.sapling.root(), + correct.sapling.root(), + "a precompute for a different block must be rejected, not grafted" + ); + + // The correctly-bound precompute for A is still applied and matches. + let pre_a = BlockNotePrecompute::compute(0, 0, &block_a, &AtomicBool::new(false)); + let mut matched = NoteCommitmentTrees::default(); + matched + .update_trees_parallel_with(&block_a, Some(pre_a)) + .expect("update succeeds"); + assert_eq!( + matched.sapling.root(), + correct.sapling.root(), + "a precompute bound to this block is applied" + ); + } } diff --git a/zebra-chain/src/sapling/tree.rs b/zebra-chain/src/sapling/tree.rs index 7316fdd108e..edbbbacfb08 100644 --- a/zebra-chain/src/sapling/tree.rs +++ b/zebra-chain/src/sapling/tree.rs @@ -23,12 +23,30 @@ use incrementalmerkletree::frontier::{Frontier, NonEmptyFrontier}; use thiserror::Error; use crate::{ + parallel::batch_frontier::{ + apply_append_batch_with_subtree, precompute_append_batch_with_subtree, BatchFrontierError, + PrecomputedSubtreeAppend, + }, serialization::{ serde_helpers, ReadZcashExt, SerializationError, ZcashDeserialize, ZcashSerialize, }, subtree::{NoteCommitmentSubtreeIndex, TRACKED_SUBTREE_HEIGHT}, }; +/// The precomputed parallel-append work for one block's Sapling note commitments, +/// produced off the committer by [`NoteCommitmentTree::precompute_append`] and +/// applied with [`NoteCommitmentTree::apply_precomputed_append`]. +#[derive(Clone, Debug)] +pub(crate) struct PrecomputedAppendBatch(PrecomputedSubtreeAppend); + +impl PrecomputedAppendBatch { + /// The tree size (leaf [`count`](NoteCommitmentTree::count)) this precompute + /// must be applied to. + pub(crate) fn start_size(&self) -> u64 { + self.0.start_size() + } +} + pub mod legacy; use legacy::LegacyNoteCommitmentTree; @@ -145,6 +163,25 @@ impl ZcashDeserialize for Root { pub enum NoteCommitmentTreeError { #[error("The note commitment tree is full")] FullTree, + + #[error("Invalid precompute: empty batch, stale start size, or multi-subtree batch")] + InvalidPrecompute, +} + +impl From for NoteCommitmentTreeError { + fn from(error: BatchFrontierError) -> Self { + match error { + // A capacity overflow is the tree being full. + BatchFrontierError::Frontier(_) => NoteCommitmentTreeError::FullTree, + // The remaining variants are caller-supplied precompute misuse, which + // is reported as a recoverable error rather than panicking. + BatchFrontierError::BatchSpansMultipleSubtrees + | BatchFrontierError::EmptyBatch + | BatchFrontierError::PrecomputeStartMismatch { .. } => { + NoteCommitmentTreeError::InvalidPrecompute + } + } + } } /// Sapling Incremental Note Commitment Tree. @@ -261,6 +298,73 @@ impl NoteCommitmentTree { })) } + /// Precomputes the parallel-append work for `note_commitments` against a tree + /// of size `start_size`, off the committer's critical path. + /// + /// This does the per-leaf Merkle hashing (the dominant cost of committing a + /// shielded block) using only the starting leaf *count*, so it can run + /// concurrently ahead of the committer. Apply with + /// [`Self::apply_precomputed_append`] on a tree whose [`count`](Self::count) + /// equals `start_size`. Returns [`NoteCommitmentTreeError::InvalidPrecompute`] + /// for an empty `note_commitments`, rather than panicking. + pub(crate) fn precompute_append( + start_size: u64, + note_commitments: &[NoteCommitmentUpdate], + ) -> Result { + let nodes: Vec = note_commitments + .iter() + .map(sapling_crypto::Node::from_cmu) + .collect(); + + let inner = precompute_append_batch_with_subtree::<_, MERKLE_DEPTH>(start_size, &nodes)?; + + Ok(PrecomputedAppendBatch(inner)) + } + + /// Applies a [`PrecomputedAppendBatch`] from [`Self::precompute_append`], + /// returning any completed [`TRACKED_SUBTREE_HEIGHT`] subtree, exactly like + /// [`Self::append_batch`]. `precomputed.start_size()` must equal this tree's + /// [`count`](Self::count); a stale precompute returns + /// [`NoteCommitmentTreeError::InvalidPrecompute`] (rather than panicking) so + /// callers can fall back to [`Self::append_batch`]. + #[allow(clippy::unwrap_in_result)] + pub(crate) fn apply_precomputed_append( + &mut self, + precomputed: PrecomputedAppendBatch, + ) -> Result, NoteCommitmentTreeError> + { + let (frontier, completed) = + apply_append_batch_with_subtree(self.inner.clone(), precomputed.0)?; + + self.inner = frontier; + *self + .cached_root + .get_mut() + .expect("a thread that previously held exclusive lock access panicked") = None; + + Ok(completed.map(|(index_value, root)| { + let index = NoteCommitmentSubtreeIndex( + index_value.try_into().expect("subtree index fits in u16"), + ); + (index, root) + })) + } + + /// Benchmark-only: precompute the parallel append for `note_commitments` + /// (rayon hashing), apply it onto a fresh tree, and return the resulting root. + /// Mirrors the committer's precompute path end-to-end so the + /// `precompute_threshold` benchmark can compare it against a serial append. + #[cfg(feature = "bench")] + #[doc(hidden)] + pub fn precompute_then_apply_root(note_commitments: &[NoteCommitmentUpdate]) -> [u8; 32] { + let mut tree = NoteCommitmentTree::default(); + let precomputed = + Self::precompute_append(0, note_commitments).expect("non-empty batch in benchmark"); + tree.apply_precomputed_append(precomputed) + .expect("fresh tree matches start size 0"); + tree.root().into() + } + /// Returns frontier of non-empty tree, or None. fn frontier(&self) -> Option<&NonEmptyFrontier> { self.inner.value() @@ -805,4 +909,164 @@ mod tests { tree.assert_frontier_eq(&original); assert_eq!(tree.root(), original.root()); } + + /// The off-committer precompute (`precompute_append` + `apply_precomputed_append`) + /// must produce the same frontier, root, and completed-subtree result as the + /// inline `append_batch` across a range of tree/batch sizes. + #[test] + fn precompute_append_matches_append_batch() { + let cases = [ + ("empty tree, one leaf", 0u64, 1usize), + ("empty tree, small batch", 0, 5), + ("odd tree, small batch", 3, 4), + ("power-of-two tree, small batch", 8, 7), + ("after power-of-two tree, small batch", 9, 6), + ]; + + for (name, prefix_len, batch_len) in cases { + let start = build_tree(prefix_len); + let note_commitments: Vec<_> = (0..batch_len as u64) + .map(|value| note_commitment(1_000 + prefix_len + value)) + .collect(); + + let mut inline_tree = start.clone(); + let _ = inline_tree.root(); + let inline_result = inline_tree + .append_batch(¬e_commitments) + .expect("inline append succeeds"); + + let mut precompute_tree = start; + let _ = precompute_tree.root(); + let precomputed = NoteCommitmentTree::precompute_append(prefix_len, ¬e_commitments) + .expect("precompute succeeds"); + assert_eq!(precomputed.start_size(), prefix_len, "{name}: start size"); + let precompute_result = precompute_tree + .apply_precomputed_append(precomputed) + .expect("apply precompute succeeds"); + + assert_eq!( + precompute_result, inline_result, + "{name}: subtree result mismatch" + ); + precompute_tree.assert_frontier_eq(&inline_tree); + assert_eq!( + precompute_tree.root(), + inline_tree.root(), + "{name}: root mismatch" + ); + } + } + + /// The precompute path matches inline `append_batch` when the batch crosses the + /// first tracked-subtree boundary, including the returned subtree index and root. + #[test] + fn precompute_append_crosses_subtree_boundary() { + let start = pre_subtree_boundary_tree(); + let note_commitments = [note_commitment(100), note_commitment(200)]; + + let mut inline_tree = start.clone(); + let _ = inline_tree.root(); + let inline_result = inline_tree + .append_batch(¬e_commitments) + .expect("inline append succeeds"); + assert!(inline_result.is_some(), "batch crosses a subtree boundary"); + + let mut precompute_tree = start; + let _ = precompute_tree.root(); + let start_size = precompute_tree.count(); + let precomputed = NoteCommitmentTree::precompute_append(start_size, ¬e_commitments) + .expect("precompute succeeds"); + let precompute_result = precompute_tree + .apply_precomputed_append(precomputed) + .expect("apply precompute succeeds"); + + assert_eq!(precompute_result, inline_result, "subtree result mismatch"); + precompute_tree.assert_frontier_eq(&inline_tree); + assert_eq!(precompute_tree.root(), inline_tree.root(), "root mismatch"); + } + + /// The committer's size-match guard in `update_sapling_note_commitment_tree_with`: + /// a precompute keyed on the wrong tree size must be rejected and fall back to + /// inline hashing, so a stale look-ahead can never corrupt the tree — it can only + /// lose the speedup. A correctly-keyed precompute and `None` must match inline too. + #[test] + fn update_with_falls_back_on_size_mismatch() { + use crate::parallel::tree::NoteCommitmentTrees; + use std::sync::Arc; + + let start = build_tree(9); + let note_commitments: Vec<_> = (0..6).map(|value| note_commitment(2_000 + value)).collect(); + + // Inline reference. + let mut inline_tree = start.clone(); + let _ = inline_tree.root(); + let expected_subtree = inline_tree + .append_batch(¬e_commitments) + .expect("inline append succeeds"); + let expected_root = inline_tree.root(); + + let run = |precompute: Option| { + let base = start.clone(); + let _ = base.root(); + let (tree, subtree) = NoteCommitmentTrees::update_sapling_note_commitment_tree_with( + Arc::new(base), + note_commitments.clone(), + precompute, + ) + .expect("update succeeds"); + (tree.root(), subtree) + }; + + // No precompute: inline path. + assert_eq!( + run(None), + (expected_root, expected_subtree), + "None fallback" + ); + + // Correctly-keyed precompute (start_size == tree count 9): applies the precomputed subtree roots, same result. + let matched = NoteCommitmentTree::precompute_append(9, ¬e_commitments) + .expect("precompute succeeds"); + assert_eq!( + run(Some(matched)), + (expected_root, expected_subtree), + "matched precompute" + ); + + // Wrong-keyed precompute (start_size 7 != tree count 9): the guard rejects it + // and falls back to inline, still producing the correct tree. + let stale = NoteCommitmentTree::precompute_append(7, ¬e_commitments) + .expect("precompute succeeds"); + assert_eq!( + run(Some(stale)), + (expected_root, expected_subtree), + "stale precompute falls back" + ); + } + + /// The public precompute wrappers report invalid input as a recoverable + /// `NoteCommitmentTreeError`, never a panic: an empty batch, and a stale + /// precompute applied directly to a mismatched tree. + #[test] + fn precompute_wrappers_report_invalid_input() { + // Empty batch. + assert_eq!( + NoteCommitmentTree::precompute_append(0, &[]).err(), + Some(NoteCommitmentTreeError::InvalidPrecompute), + "empty precompute_append is a recoverable error" + ); + + // Stale precompute applied to a tree of the wrong size. + let note_commitments: Vec<_> = (0..4).map(|value| note_commitment(3_000 + value)).collect(); + let stale = NoteCommitmentTree::precompute_append(5, ¬e_commitments) + .expect("precompute succeeds"); + + let mut tree = build_tree(2); + let _ = tree.root(); + assert_eq!( + tree.apply_precomputed_append(stale), + Err(NoteCommitmentTreeError::InvalidPrecompute), + "applying a stale precompute is a recoverable error" + ); + } } diff --git a/zebra-state/src/service/check/tests/nullifier.rs b/zebra-state/src/service/check/tests/nullifier.rs index f42858afda6..fd258c32cdc 100644 --- a/zebra-state/src/service/check/tests/nullifier.rs +++ b/zebra-state/src/service/check/tests/nullifier.rs @@ -85,7 +85,7 @@ proptest! { // randomly choose to commit the block to the finalized or non-finalized state if use_finalized_state { let block1 = CheckpointVerifiedBlock::from(Arc::new(block1)); - let commit_result = finalized_state.commit_finalized_direct(block1.clone().into(), None, "test"); + let commit_result = finalized_state.commit_finalized_direct(block1.clone().into(), None, None, "test"); // the block was committed prop_assert_eq!(Some((Height(1), block1.hash)), read::best_tip(&non_finalized_state, &finalized_state.db)); @@ -349,7 +349,7 @@ proptest! { // randomly choose to commit the next block to the finalized or non-finalized state if duplicate_in_finalized_state { let block1 = CheckpointVerifiedBlock::from(Arc::new(block1)); - let commit_result = finalized_state.commit_finalized_direct(block1.clone().into(), None, "test"); + let commit_result = finalized_state.commit_finalized_direct(block1.clone().into(), None, None, "test"); prop_assert_eq!(Some((Height(1), block1.hash)), read::best_tip(&non_finalized_state, &finalized_state.db)); prop_assert!(commit_result.is_ok()); @@ -448,7 +448,7 @@ proptest! { // randomly choose to commit the block to the finalized or non-finalized state if use_finalized_state { let block1 = CheckpointVerifiedBlock::from(Arc::new(block1)); - let commit_result = finalized_state.commit_finalized_direct(block1.clone().into(),None, "test"); + let commit_result = finalized_state.commit_finalized_direct(block1.clone().into(), None, None, "test"); prop_assert_eq!(Some((Height(1), block1.hash)), read::best_tip(&non_finalized_state, &finalized_state.db)); prop_assert!(commit_result.is_ok()); @@ -628,7 +628,7 @@ proptest! { // randomly choose to commit the next block to the finalized or non-finalized state if duplicate_in_finalized_state { let block1 = CheckpointVerifiedBlock::from(Arc::new(block1)); - let commit_result = finalized_state.commit_finalized_direct(block1.clone().into(),None, "test"); + let commit_result = finalized_state.commit_finalized_direct(block1.clone().into(), None, None, "test"); prop_assert_eq!(Some((Height(1), block1.hash)), read::best_tip(&non_finalized_state, &finalized_state.db)); prop_assert!(commit_result.is_ok()); @@ -725,7 +725,7 @@ proptest! { // randomly choose to commit the block to the finalized or non-finalized state if use_finalized_state { let block1 = CheckpointVerifiedBlock::from(Arc::new(block1)); - let commit_result = finalized_state.commit_finalized_direct(block1.clone().into(), None, "test"); + let commit_result = finalized_state.commit_finalized_direct(block1.clone().into(), None, None, "test"); prop_assert_eq!(Some((Height(1), block1.hash)), read::best_tip(&non_finalized_state, &finalized_state.db)); prop_assert!(commit_result.is_ok()); @@ -914,7 +914,7 @@ proptest! { // randomly choose to commit the next block to the finalized or non-finalized state if duplicate_in_finalized_state { let block1 = CheckpointVerifiedBlock::from(Arc::new(block1)); - let commit_result = finalized_state.commit_finalized_direct(block1.clone().into(), None, "test"); + let commit_result = finalized_state.commit_finalized_direct(block1.clone().into(), None, None, "test"); prop_assert_eq!(Some((Height(1), block1.hash)), read::best_tip(&non_finalized_state, &finalized_state.db)); prop_assert!(commit_result.is_ok()); @@ -1004,7 +1004,7 @@ proptest! { finalized_state.populate_with_anchors(&block2); let block1 = CheckpointVerifiedBlock::from(Arc::new(block1)); - let commit_result = finalized_state.commit_finalized_direct(block1.into(), None, "test"); + let commit_result = finalized_state.commit_finalized_direct(block1.into(), None, None, "test"); prop_assert!(commit_result.is_ok()); let block2 = Arc::new(block2).prepare(); @@ -1058,7 +1058,7 @@ proptest! { finalized_state.populate_with_anchors(&block2); let block1 = CheckpointVerifiedBlock::from(Arc::new(block1)); - let commit_result = finalized_state.commit_finalized_direct(block1.into(), None, "test"); + let commit_result = finalized_state.commit_finalized_direct(block1.into(), None, None, "test"); prop_assert!(commit_result.is_ok()); let block2 = Arc::new(block2).prepare(); @@ -1112,7 +1112,7 @@ proptest! { finalized_state.populate_with_anchors(&block2); let block1 = CheckpointVerifiedBlock::from(Arc::new(block1)); - let commit_result = finalized_state.commit_finalized_direct(block1.into(), None, "test"); + let commit_result = finalized_state.commit_finalized_direct(block1.into(), None, None, "test"); prop_assert!(commit_result.is_ok()); let block2 = Arc::new(block2).prepare(); diff --git a/zebra-state/src/service/check/tests/utxo.rs b/zebra-state/src/service/check/tests/utxo.rs index dd9017bea20..69bfe446f69 100644 --- a/zebra-state/src/service/check/tests/utxo.rs +++ b/zebra-state/src/service/check/tests/utxo.rs @@ -185,7 +185,7 @@ proptest! { // randomly choose to commit the block to the finalized or non-finalized state if use_finalized_state { let block1 = CheckpointVerifiedBlock::from(Arc::new(block1)); - let commit_result = finalized_state.commit_finalized_direct(block1.clone().into(), None, "test"); + let commit_result = finalized_state.commit_finalized_direct(block1.clone().into(), None, None, "test"); // the block was committed prop_assert_eq!(Some((Height(1), block1.hash)), read::best_tip(&non_finalized_state, &finalized_state.db)); @@ -273,7 +273,7 @@ proptest! { if use_finalized_state_spend { let block2 = CheckpointVerifiedBlock::from(Arc::new(block2)); - let commit_result = finalized_state.commit_finalized_direct(block2.clone().into(),None, "test"); + let commit_result = finalized_state.commit_finalized_direct(block2.clone().into(), None, None, "test"); // the block was committed prop_assert_eq!(Some((Height(2), block2.hash)), read::best_tip(&non_finalized_state, &finalized_state.db)); @@ -609,7 +609,7 @@ proptest! { if use_finalized_state_spend { let block2 = CheckpointVerifiedBlock::from(block2.clone()); - let commit_result = finalized_state.commit_finalized_direct(block2.clone().into(), None, "test"); + let commit_result = finalized_state.commit_finalized_direct(block2.clone().into(), None, None, "test"); // the block was committed prop_assert_eq!(Some((Height(2), block2.hash)), read::best_tip(&non_finalized_state, &finalized_state.db)); @@ -878,7 +878,7 @@ fn new_state_with_mainnet_transparent_data( if use_finalized_state { let block1 = CheckpointVerifiedBlock::from(block1.clone()); let commit_result = - finalized_state.commit_finalized_direct(block1.clone().into(), None, "test"); + finalized_state.commit_finalized_direct(block1.clone().into(), None, None, "test"); // the block was committed assert_eq!( diff --git a/zebra-state/src/service/finalized_state.rs b/zebra-state/src/service/finalized_state.rs index 63a0ec4fb89..fd8a1240146 100644 --- a/zebra-state/src/service/finalized_state.rs +++ b/zebra-state/src/service/finalized_state.rs @@ -22,7 +22,11 @@ use std::{ }, }; -use zebra_chain::{block, parallel::tree::NoteCommitmentTrees, parameters::Network}; +use zebra_chain::{ + block, + parallel::tree::{BlockNotePrecompute, NoteCommitmentTrees}, + parameters::Network, +}; use zebra_db::{ block::{RetentionPlan, ZAKURA_HEADER_BODY_SIZE_BY_HEIGHT}, chain::BLOCK_INFO, @@ -70,6 +74,46 @@ static COMMIT_COMPUTE_POOL: LazyLock = LazyLock::new(|| { .expect("rayon thread pool configuration is valid") }); +/// Spawns the note-commitment tree per-leaf hashing for `block` onto the +/// commit-compute pool, returning a receiver for the result and a cancellation +/// flag. +/// +/// The off-committer half of the tree-update pipeline: the finalized write loop +/// starts this for the *next* block — using the running tree sizes `sapling_start` +/// / `orchard_start` (the tree `count`s the block will commit at) — so the heavy +/// hashing overlaps the *current* block's commit on otherwise idle cores. The +/// committer then only applies the precomputed subtree roots. If the precompute is stale (its `start_size` no +/// longer matches the tree), the committer falls back to inline hashing, so this +/// is purely a scheduling optimization. +/// +/// Because it is started speculatively before the current block has committed, the +/// caller must keep the returned flag and set it if it discards the precompute — +/// e.g. when the current block's commit fails. The spawned task checks the flag +/// before each pool's hashing (and skips the send if cancelled), so a discarded +/// child that has not started a pool yet avoids that pool's work. +pub(crate) fn spawn_note_precompute( + sapling_start: u64, + orchard_start: u64, + block: Arc, +) -> ( + crossbeam_channel::Receiver, + Arc, +) { + let (tx, rx) = crossbeam_channel::bounded(1); + let cancel = Arc::new(AtomicBool::new(false)); + let task_cancel = cancel.clone(); + COMMIT_COMPUTE_POOL.spawn(move || { + let result = + BlockNotePrecompute::compute(sapling_start, orchard_start, &block, &task_cancel); + // If the precompute was cancelled, the receiver has been (or is being) + // dropped and the result is unwanted; skip the send. + if !task_cancel.load(Ordering::Relaxed) { + let _ = tx.send(result); + } + }); + (rx, cancel) +} + pub mod column_family; mod disk_db; @@ -520,11 +564,13 @@ impl FinalizedState { &mut self, ordered_block: QueuedCheckpointVerified, prev_note_commitment_trees: Option, + note_precompute: Option, ) -> Result<(CheckpointVerifiedBlock, NoteCommitmentTrees), CommitCheckpointVerifiedError> { let (checkpoint_verified, rsp_tx) = ordered_block; let result = self.commit_finalized_direct( checkpoint_verified.clone().into(), prev_note_commitment_trees, + note_precompute, "commit checkpoint-verified request", ); @@ -568,6 +614,7 @@ impl FinalizedState { &mut self, finalizable_block: FinalizableBlock, prev_note_commitment_trees: Option, + note_precompute: Option, source: &str, ) -> Result<(block::Hash, NoteCommitmentTrees), CommitCheckpointVerifiedError> { let (height, hash, finalized, prev_note_commitment_trees, retention) = @@ -631,9 +678,13 @@ impl FinalizedState { )); }); + // `note_precompute`, if present and still size-matched, + // lets the committer apply the precomputed subtree roots + // instead of re-hashing the notes here; else hashes inline. timed_commit_phase!( "zebra.state.write.update_trees.duration_seconds", - note_commitment_trees.update_trees_parallel(&block) + note_commitment_trees + .update_trees_parallel_with(&block, note_precompute) ) }) }); diff --git a/zebra-state/src/service/finalized_state/disk_format/tests/snapshot.rs b/zebra-state/src/service/finalized_state/disk_format/tests/snapshot.rs index 76a1f0cba99..10e21d4acf2 100644 --- a/zebra-state/src/service/finalized_state/disk_format/tests/snapshot.rs +++ b/zebra-state/src/service/finalized_state/disk_format/tests/snapshot.rs @@ -96,7 +96,7 @@ fn test_raw_rocksdb_column_families_with_network(network: Network) { .expect("test data deserializes"); state - .commit_finalized_direct(block.into(), None, "snapshot tests") + .commit_finalized_direct(block.into(), None, None, "snapshot tests") .expect("test block is valid"); let mut settings = insta::Settings::clone_current(); diff --git a/zebra-state/src/service/finalized_state/tests/prop.rs b/zebra-state/src/service/finalized_state/tests/prop.rs index 16140ef5e36..81c4a28a9fb 100644 --- a/zebra-state/src/service/finalized_state/tests/prop.rs +++ b/zebra-state/src/service/finalized_state/tests/prop.rs @@ -39,6 +39,7 @@ fn blocks_with_v5_transactions() -> Result<()> { let (hash, _) = state.commit_finalized_direct( checkpoint_verified.into(), None, + None, "blocks_with_v5_transactions test" ).unwrap(); prop_assert_eq!(Some(height), state.finalized_tip_height()); @@ -114,6 +115,7 @@ fn all_upgrades_and_wrong_commitments_with_fake_activation_heights() -> Result<( state.commit_finalized_direct( checkpoint_verified.into(), None, + None, "all_upgrades test" ).expect_err("Must fail commitment check"); failure_count += 1; @@ -124,6 +126,7 @@ fn all_upgrades_and_wrong_commitments_with_fake_activation_heights() -> Result<( let (hash, _) = state.commit_finalized_direct( checkpoint_verified.into(), None, + None, "all_upgrades test" ).unwrap(); prop_assert_eq!(Some(height), state.finalized_tip_height()); diff --git a/zebra-state/src/service/finalized_state/tests/rollback.rs b/zebra-state/src/service/finalized_state/tests/rollback.rs index 775e6b2b0ee..aa6e42fcc3d 100644 --- a/zebra-state/src/service/finalized_state/tests/rollback.rs +++ b/zebra-state/src/service/finalized_state/tests/rollback.rs @@ -79,7 +79,7 @@ fn sync_to(config: &Config, network: &Network, blocks: &[SemanticallyVerifiedBlo for block in blocks { let checkpoint_verified = CheckpointVerifiedBlock::from(block.block.clone()); state - .commit_finalized_direct(checkpoint_verified.into(), None, "rollback test") + .commit_finalized_direct(checkpoint_verified.into(), None, None, "rollback test") .expect("committing a generated block to a fresh state succeeds"); } } diff --git a/zebra-state/src/service/finalized_state/zebra_db/block/tests/prune.rs b/zebra-state/src/service/finalized_state/zebra_db/block/tests/prune.rs index 9bf0a03f4a0..27eb009b0b7 100644 --- a/zebra-state/src/service/finalized_state/zebra_db/block/tests/prune.rs +++ b/zebra-state/src/service/finalized_state/zebra_db/block/tests/prune.rs @@ -49,7 +49,7 @@ fn new_state_with_blocks(config: &Config, network: &Network) -> FinalizedState { .expect("test data deserializes"); state - .commit_finalized_direct(block.into(), None, "prune tests") + .commit_finalized_direct(block.into(), None, None, "prune tests") .expect("test block is valid"); } @@ -80,7 +80,7 @@ fn new_state_with_checkpoint_retention( .expect("test data deserializes"); state - .commit_finalized_direct(block.into(), None, "checkpoint retention tests") + .commit_finalized_direct(block.into(), None, None, "checkpoint retention tests") .expect("test block is valid"); } @@ -351,7 +351,7 @@ fn checkpoint_retention_hands_off_to_online_pruning_at_start() { .expect("test data deserializes"); state - .commit_finalized_direct(block.into(), None, "checkpoint handoff tests") + .commit_finalized_direct(block.into(), None, None, "checkpoint handoff tests") .expect("test block is valid"); } @@ -386,7 +386,7 @@ fn checkpoint_retention_hands_off_to_online_pruning_at_start() { .expect("test data deserializes"); state - .commit_finalized_direct(block.into(), None, "checkpoint handoff tests") + .commit_finalized_direct(block.into(), None, None, "checkpoint handoff tests") .expect("handoff block is valid"); let online_prune_until = @@ -630,7 +630,7 @@ fn archive_to_pruned_checkpoint_sync_drains_archive_raw_transactions_before_skip .expect("test data deserializes"); archive_state - .commit_finalized_direct(block.into(), None, "archive phase") + .commit_finalized_direct(block.into(), None, None, "archive phase") .expect("archive block is valid"); } @@ -670,7 +670,7 @@ fn archive_to_pruned_checkpoint_sync_drains_archive_raw_transactions_before_skip .expect("test data deserializes"); pruned_state - .commit_finalized_direct(block.into(), None, "archive to pruned checkpoint") + .commit_finalized_direct(block.into(), None, None, "archive to pruned checkpoint") .expect("checkpoint block is valid"); assert_eq!( @@ -728,7 +728,7 @@ fn archive_backlog_flag_is_recomputed_when_reopening_a_pruned_database() { .expect("test data deserializes"); archive_state - .commit_finalized_direct(block.into(), None, "archive phase") + .commit_finalized_direct(block.into(), None, None, "archive phase") .expect("archive block is valid"); } std::mem::drop(archive_state); @@ -761,7 +761,7 @@ fn archive_backlog_flag_is_recomputed_when_reopening_a_pruned_database() { .zcash_deserialize_into() .expect("test data deserializes"); pruned_state - .commit_finalized_direct(block.into(), None, "archive to pruned checkpoint") + .commit_finalized_direct(block.into(), None, None, "archive to pruned checkpoint") .expect("checkpoint block is valid"); assert_eq!( pruned_state.db.lowest_retained_height(), @@ -842,7 +842,7 @@ fn contextual_commits_keep_raw_transactions_before_checkpoint_retention_start() .zcash_deserialize_into() .expect("genesis test data deserializes"); state - .commit_finalized_direct(genesis.into(), None, "contextual retention tests") + .commit_finalized_direct(genesis.into(), None, None, "contextual retention tests") .expect("genesis block is valid"); let block: Arc = blocks @@ -858,7 +858,7 @@ fn contextual_commits_keep_raw_transactions_before_checkpoint_retention_start() let finalizable = FinalizableBlock::new(contextually_verified, Treestate::default()); state - .commit_finalized_direct(finalizable, None, "contextual retention tests") + .commit_finalized_direct(finalizable, None, None, "contextual retention tests") .expect("contextual block is valid"); assert!( diff --git a/zebra-state/src/service/finalized_state/zebra_db/block/tests/snapshot.rs b/zebra-state/src/service/finalized_state/zebra_db/block/tests/snapshot.rs index 82a214099c6..b875ed9903d 100644 --- a/zebra-state/src/service/finalized_state/zebra_db/block/tests/snapshot.rs +++ b/zebra-state/src/service/finalized_state/zebra_db/block/tests/snapshot.rs @@ -195,7 +195,7 @@ fn test_block_and_transaction_data_with_network(network: Network) { .expect("test data deserializes"); state - .commit_finalized_direct(block.into(), None, "snapshot tests") + .commit_finalized_direct(block.into(), None, None, "snapshot tests") .expect("test block is valid"); let mut settings = insta::Settings::clone_current(); diff --git a/zebra-state/src/service/finalized_state/zebra_db/prune.rs b/zebra-state/src/service/finalized_state/zebra_db/prune.rs index 42813e35378..e32c54f7988 100644 --- a/zebra-state/src/service/finalized_state/zebra_db/prune.rs +++ b/zebra-state/src/service/finalized_state/zebra_db/prune.rs @@ -355,7 +355,7 @@ mod tests { .expect("test data deserializes"); state - .commit_finalized_direct(block.into(), None, "offline prune tests") + .commit_finalized_direct(block.into(), None, None, "offline prune tests") .expect("test block is valid"); } diff --git a/zebra-state/src/service/write.rs b/zebra-state/src/service/write.rs index 96070107433..7cdcd33fc93 100644 --- a/zebra-state/src/service/write.rs +++ b/zebra-state/src/service/write.rs @@ -3,7 +3,10 @@ use std::{ collections::VecDeque, path::{Path, PathBuf}, - sync::Arc, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, time::Duration, }; @@ -16,12 +19,14 @@ use tokio::sync::{ use tracing::Span; use zebra_chain::block::{self, Height}; +use zebra_chain::parallel::tree::{BlockNotePrecompute, NoteCommitmentTrees}; + use crate::{ constants::MAX_BLOCK_REORG_HEIGHT, error::CommitHeaderRangeError, service::{ check, - finalized_state::{FinalizedState, ZebraDb}, + finalized_state::{spawn_note_precompute, FinalizedState, ZebraDb}, non_finalized_state::NonFinalizedState, queued_blocks::{QueuedCheckpointVerified, QueuedSemanticallyVerified}, ChainTipBlock, ChainTipSender, InvalidateError, ReconsiderError, @@ -36,6 +41,25 @@ use crate::service::{ non_finalized_state::Chain, }; +/// A speculatively-started note-commitment precompute for an upcoming finalized +/// block: the block hash it was started for, the channel to receive the result on, +/// and a flag to cancel it if the block is no longer going to be committed. +type PendingPrecompute = ( + block::Hash, + crossbeam_channel::Receiver, + Arc, +); + +/// Cancels and drops a pending look-ahead precompute, if any. +/// +/// Tripping the flag tells the spawned task (started before the current block +/// committed) to stop instead of hashing a block that will not be committed. +fn cancel_pending_precompute(pending: &mut Option) { + if let Some((_hash, _rx, cancel)) = pending.take() { + cancel.store(true, Ordering::Relaxed); + } +} + /// The maximum size of the parent error map. /// /// We allow enough space for multiple concurrent chain forks with errors. @@ -317,9 +341,26 @@ impl WriteBlockWorkerTask { backup_dir_path, } = &mut self; - let mut prev_finalized_note_commitment_trees = None; + let mut prev_finalized_note_commitment_trees: Option = None; let mut deferred_non_finalized_messages = VecDeque::new(); + // One-block look-ahead so the next block's note-commitment tree hashing can + // be precomputed off the committer (on idle cores) while the current block + // commits. `pending_precompute` holds the receiver and cancellation flag for + // the block started last iteration; `finalized_lookahead` buffers the peeked + // next block. The precompute is keyed on the running tree sizes and only + // applied if those still match at commit time, so this never affects + // correctness, only speed. + // + // Because the next block's precompute is started before the current block + // commits, a current block that fails to commit (e.g. an invalid block from + // a peer) leaves that speculative work unwanted. Whenever this loop discards + // a pending precompute it trips the cancellation flag via + // [`cancel_pending_precompute`], so the spawned task stops instead of hashing + // a block that will never be committed. + let mut pending_precompute: Option = None; + let mut finalized_lookahead: VecDeque = VecDeque::new(); + // Write all the finalized blocks sent by the state, // until the state closes the finalized block channel's sender. loop { @@ -338,13 +379,16 @@ impl WriteBlockWorkerTask { Err(TryRecvError::Disconnected) => {} } - let ordered_block = match finalized_block_write_receiver.try_recv() { - Ok(block) => block, - Err(TryRecvError::Empty) => { - std::thread::park_timeout(Duration::from_millis(10)); - continue; - } - Err(TryRecvError::Disconnected) => break, + let ordered_block = match finalized_lookahead.pop_front() { + Some(block) => block, + None => match finalized_block_write_receiver.try_recv() { + Ok(block) => block, + Err(TryRecvError::Empty) => { + std::thread::park_timeout(Duration::from_millis(10)); + continue; + } + Err(TryRecvError::Disconnected) => break, + }, }; // TODO: split these checks into separate functions @@ -375,15 +419,57 @@ impl WriteBlockWorkerTask { Assuming a parent block failed, and dropping this block", ); + // The pipeline is broken; cancel and drop any look-ahead so the next + // precompute re-seeds from the real tip (a stale precompute would + // only fall back anyway, but cancelling stops the wasted hashing). + cancel_pending_precompute(&mut pending_precompute); + finalized_lookahead.clear(); + // We don't want to send a reset here, because it could overwrite a valid sent hash std::mem::drop(ordered_block); continue; } + // Use the precompute for this block if we started it last iteration and + // it is for this exact block; otherwise cancel it (so the spawned task + // stops) and let the committer hash inline. + let note_precompute = match pending_precompute.take() { + Some((hash, rx, _cancel)) if hash == ordered_block.0.hash => rx.recv().ok(), + Some((_hash, _rx, cancel)) => { + cancel.store(true, Ordering::Relaxed); + None + } + None => None, + }; + + // Peek the next block and start its precompute, so the heavy hashing + // overlaps this block's commit. Its start sizes are the current tree + // sizes plus this block's note counts (the sizes after this block). + if finalized_lookahead.is_empty() { + if let Ok(next) = finalized_block_write_receiver.try_recv() { + finalized_lookahead.push_back(next); + } + } + if let (Some(trees), Some(next)) = ( + prev_finalized_note_commitment_trees.as_ref(), + finalized_lookahead.front(), + ) { + let block = &ordered_block.0.block; + let sapling_start = + trees.sapling.count() + block.sapling_note_commitments().count() as u64; + let orchard_start = + trees.orchard.count() + block.orchard_note_commitments().count() as u64; + let (rx, cancel) = + spawn_note_precompute(sapling_start, orchard_start, next.0.block.clone()); + pending_precompute = Some((next.0.hash, rx, cancel)); + } + // Try committing the block - match finalized_state - .commit_finalized(ordered_block, prev_finalized_note_commitment_trees.take()) - { + match finalized_state.commit_finalized( + ordered_block, + prev_finalized_note_commitment_trees.take(), + note_precompute, + ) { Ok((finalized, note_commitment_trees)) => { let tip_block = ChainTipBlock::from(finalized); prev_finalized_note_commitment_trees = Some(note_commitment_trees); @@ -392,6 +478,13 @@ impl WriteBlockWorkerTask { Err(error) => { let finalized_tip = finalized_state.db.tip(); + // The commit failed and the queue is being reset, so any + // look-ahead precompute is for a block that will not be + // committed: cancel it so the spawned task stops instead of + // hashing the discarded child, and clear the look-ahead. + cancel_pending_precompute(&mut pending_precompute); + finalized_lookahead.clear(); + // The last block in the queue failed, so we can't commit the next block. // Instead, we need to reset the state queue, // and discard any children of the invalid block in the channel. @@ -554,7 +647,7 @@ impl WriteBlockWorkerTask { tracing::trace!("finalizing block past the reorg limit"); let contextually_verified_with_trees = non_finalized_state.finalize(); prev_finalized_note_commitment_trees = finalized_state - .commit_finalized_direct(contextually_verified_with_trees, prev_finalized_note_commitment_trees.take(), "commit contextually-verified request") + .commit_finalized_direct(contextually_verified_with_trees, prev_finalized_note_commitment_trees.take(), None, "commit contextually-verified request") .expect( "unexpected finalized block commit error: note commitment and history trees were already checked by the non-finalized state", ).1.into(); diff --git a/zebra-state/src/tests/setup.rs b/zebra-state/src/tests/setup.rs index 7c4c4a0bd6c..34b1785a84d 100644 --- a/zebra-state/src/tests/setup.rs +++ b/zebra-state/src/tests/setup.rs @@ -113,7 +113,7 @@ pub(crate) fn new_state_with_mainnet_genesis( let genesis = CheckpointVerifiedBlock::from(genesis); finalized_state - .commit_finalized_direct(genesis.clone().into(), None, "test") + .commit_finalized_direct(genesis.clone().into(), None, None, "test") .expect("unexpected invalid genesis block test vector"); assert_eq!( From 763c5801a3e41630ca36e097f765a61eff279060 Mon Sep 17 00:00:00 2001 From: Roman Date: Sun, 21 Jun 2026 02:53:12 +0000 Subject: [PATCH 12/16] [REVERT] Roman's AI workspace --- CHECKPOINT_SYNC_FINDINGS.md | 602 ++++++++++++++++++++++++++++++++++++ COMMIT_OPTIMIZE.md | 96 ++++++ CPU_PROFILE_RESULTS.md | 76 +++++ FULL_SYNC_SUMMARY.md | 120 +++++++ HANDOFF.md | 143 +++++++++ HOL_HEDGE_RESULTS.md | 53 ++++ NOTE_TREE_PRECOMPUTE_AB.md | 123 ++++++++ OPTIMIZATION_EXPERIMENTS.md | 51 +++ PARALLEL_IDEA.md | 318 +++++++++++++++++++ RUNBOOK.md | 179 +++++++++++ SAPLING_HASH_RESULTS.md | 191 ++++++++++++ 11 files changed, 1952 insertions(+) create mode 100644 CHECKPOINT_SYNC_FINDINGS.md create mode 100644 COMMIT_OPTIMIZE.md create mode 100644 CPU_PROFILE_RESULTS.md create mode 100644 FULL_SYNC_SUMMARY.md create mode 100644 HANDOFF.md create mode 100644 HOL_HEDGE_RESULTS.md create mode 100644 NOTE_TREE_PRECOMPUTE_AB.md create mode 100644 OPTIMIZATION_EXPERIMENTS.md create mode 100644 PARALLEL_IDEA.md create mode 100644 RUNBOOK.md create mode 100644 SAPLING_HASH_RESULTS.md diff --git a/CHECKPOINT_SYNC_FINDINGS.md b/CHECKPOINT_SYNC_FINDINGS.md new file mode 100644 index 00000000000..de56bd10895 --- /dev/null +++ b/CHECKPOINT_SYNC_FINDINGS.md @@ -0,0 +1,602 @@ +# Checkpoint-zone sync from the 1.7M snapshot — findings & plan + +> ## ⚠️ STATUS (2026-06-18): this document is HISTORICAL — read this banner first +> +> Everything below is the investigation up to **2026-06-17**. Since then the work +> shipped and the bottleneck **moved**, so several "not yet built" levers here +> (esp. §419 and §9–§14) are now **done or disproven**. Do not re-investigate them. +> +> **Shipped / built since (PR stack on the fork, `valargroup/zebra`):** +> - §419 lever 1 (parallelize the tree update + isolate it from the verify pool) → +> **shipped**: dedicated `COMMIT_COMPUTE_POOL` (#122) + parallel batch tree append. +> - ZIP-244 auth-data-root / commitment check parallelized + hoisted off the serial +> committer into the concurrent download tasks → **shipped** (#121, #124, #127), +> and the `to_librustzcash` txid/auth conversion **de-duplicated** (#125). +> - Parallel writer-batch serialization (raw tx bytes + block size) → **shipped** (#128). +> - §419 levers 2 & 3 (pipeline the writer / compute treestate in a pre-stage = +> "any-order commit") → **BUILT + benchmarked, NO GAIN, ~10% slower** → parked as +> draft **PR #129 (DO NOT MERGE)**. +> +> **Current bottleneck (the key change):** the heavy region (1.72–1.73M) is now +> **CPU-saturated (~7.75/8 cores, downloads fully buffered)**. The limiter is +> **total CPU work across the whole sync pipeline**, *not* the serial commit stage. +> So commit-side restructuring (incl. the pipeline) can't help while CPU-bound; +> the only lever is **reducing total CPU work**. +> +> **Next levers (see PARALLEL_IDEA.md → "Reducing total CPU work"):** (1) profile the +> heavy region; (2) investigate whether per-tx txid computation can be skipped in +> checkpoint sync (biggest potential win — eliminates the `to_librustzcash` reparse); +> (3) else native ZIP-244 digests (skip the reparse). Note the de-dup is already done +> (#125); native digests are the step beyond it. +> +> **Authoritative current sources:** `ANY_ORDER_COMMIT_DESIGN.md` §7d (measured +> any-order result + why CPU-bound) and `PARALLEL_IDEA.md` top ("UPDATE 2026-06-18"). + +**Date:** 2026-06-16 (updated 2026-06-17 — see §6 for the shipped head-of-line fix) +**Baseline:** `ironwood-main` @ `94ae42f48` (release); as of 2026-06-17 advanced to `3a5035904` +(PR #102 retry-instead-of-restart squash-merged upstream). Active stack: +`ironwood-main` → #104 `sync-continuous-refill` (`c4672eed0`) → #105 `fix-sync-head-of-line-priority` +(`3a385b862`), both MERGEABLE. +**Snapshot:** `/mnt/roman-dev-2-data/zebra-ckpt-master` — mainnet height **1,707,210** (below the max +mainnet checkpoint 3,358,006, so forward sync exercises the **checkpoint verifier**) +**Harness:** `/root/wal-bench/prbench.sh LABEL BIN 420 5` — 7-min fork-runs scraping height, +`sync_downloads_in_flight`, `sync.missing.block.*`, and restart events. +**Bench config (all runs):** `checkpoint_verify_concurrency_limit=1500`, `download_concurrency_limit=150` +(pinned explicitly so the unmeasured PR2 default bump doesn't confound the code comparison). + +--- + +## Key findings + +- **Sync is not resource-bound.** Steady state from the snapshot: CPU ~17–28% (equihash bursts to + ~6.7/8 cores then idles), network duty cycle ~33–53%, state commit ~4% of wall, disk block-I/O wait + **0.00s even after `drop_caches`**. Every resource is mostly idle on average. +- **The dominant cost is cold-start restart-thrash.** For the first 1–3 min the node can't fetch the + head-of-line block, hits `NotFoundRegistry`, does `cancel_all` + 10s restart, and **discards the + in-flight pipeline** — repeating. This burns **100–200s of a 300s run**. This, not steady-state + pipelining, is the real lever. +- **PR A2 fixes the thrash.** Keep the pipeline and retry the head block (with backoff) instead of + restarting. Result: **`restart_waits` 6–17 → 0 in every run**, ~8.4k blocks vs baseline median ~6.2k. +- **The validated win is PR A2 — *not* PR C.** With advertisers absent (`busy=0`) or saturated + (funnel), PR C's routing never produced a clean isolated win; the prC numbers are attributable to + PR A2's retry. PR C's registration may not even be engaging. +- **Phase-1 (`AdvertisersBusy`) was tested and REVERTED — it's a regression.** With plentiful peers it + funnels all downloads onto the 3 advertisers and stalls (see matrix). Reverted; only its typed + `NotFoundClass` accessor (replacing brittle `Debug`-string matching) is kept. +- **The residual worst case is peer scarcity, not a code bug.** The prF1 176s freeze was peer-bound: + **8–9 outbound peers, 89% handshake failure, crawler added 0 peers in 176s.** +- **Correction — it is NOT "all peers genuinely lack the block."** During the freeze there were + **162 synthetic `NotFoundRegistry` misses vs 1 real peer `NotFoundResponse`** — the head block was + almost never actually *asked* of a peer. `NotFoundRegistry` fires both when peers are marked-missing + *and* when no peer is free; with `in_flight≈1997` look-ahead blocks saturating 9 peers, it's + "no free peer," compounded by the registration gap. **Local self-saturation, not genuine absence.** +- **Config bump (2000/150) was never measured** — every run used 1500/150. A *deeper* look-ahead + plausibly *worsens* the worst case (more look-ahead saturating peers, starving the head block). +- **CORRECTION (2026-06-17) — the stall is marker-staleness, not saturation.** The working + `pool.route_inv.*` counters (§6) show `no_ready=0` across every instrumented run and `all_missing` + 12–19×: peers were *never* saturated; every synthetic miss was "ready peers exist but ALL marked + missing the hash." The "local self-saturation" reading above is superseded — the real mechanism is + inventory-marker staleness, and the worst-case lever is the inventory registry / head-of-line + priority, **not** peer acquisition or buffer depth. See §6. + +--- + +## Data + +### Round 3 matrix — Δblocks from 1,707,210 (7-min runs; peer-noisy, 5–10× run-to-run) +| binary | Δblocks | restart_waits | registry_miss | note | +|---|---|---|---|---| +| baseline | 6185 / 8480 / 4998 / 6483 / 7894 | 6–17 | n/a | restart-thrash | +| **PR A2** | 8649 / 8461 / **222** | **0 / 0 / 0** | 1 / 19 / **203** | thrash gone; `222`=peer-bound edge | +| A2 + PR C | 8331 / 8568 / 8352 | 0 / 0 / 0 | 45 / 1 / 2 | consistent, but win is A2's | +| **phase-1** (reverted) | prF1 3191 · prF2 **494** · prF3 **385** | — | 162 / 0 / 0 | funnel: busy 0 / **806** / **440** | + +- **PR A2 → restart_waits 0** is the robust signal (holds across all runs regardless of peer luck). +- **Phase-1 funnel:** prF2 (25 peers) and prF3 (22 peers) collapse to ~385–494 blocks (~20× worse) + because `has_advertiser` is true for ~every hash, so requests 4…150 all `Busy`-defer onto 3 + advertisers instead of guessing a ready peer that has the block. + +### prF1 stall signature (CSV) +- Frozen **176s** at height 1,710,401; `in_flight=1997`, `reserve=0`, `busy=0`. +- `registry_miss` climbs **every 2s** (the backoff cadence) → head block retried locally, never served. +- **162 synthetic registry-misses : 1 real peer refusal** → block was barely ever put on the wire. +- Never escaped (killed by the 420s wall cap). Two distinct blocks stalled across the run, not one. + +--- + +## Status of each change + +| Change | State | Verdict | +|---|---|---| +| **PR A2** (retry-instead-of-restart) | built, in main tree | **Proven. Ship.** | +| **F1** (split Response/Registry retry counters) | built | Ship (A2 correctness) | +| **Q4** (`SYNC_RESTART_DELAY` 30→45s) | built | Ship (fixes real `ensure_timeouts_consistent` failure) | +| **typed `NotFoundClass`** accessor | built | Ship (standalone robustness win) | +| **PR C** (register multi-block invs as advertised, `handshake.rs`) | built | **Defer** — unproven, registry-bloat tradeoff | +| **config** (`checkpoint_verify=2000`, `download=150` defaults) | default only | **Defer** — never measured, may hurt worst case | +| **phase-1** (`AdvertisersBusy`) | reverted | Dropped — funnel regression | +| **PR B** (evict non-serving peers) | not built | Contraindicated — worst case isn't lie/prune | +| **pre-fetch producer** (eager `EXTEND_PREFETCH_WATERMARK`) | **built but stale & unbenched** (`/mnt/roman-dev-2-data/zebra-pr3`, missing F1/Q4) | Prioritized — needs rebase + build + bench | +| **continuous-refill** (`tokio::select!` `sync_round` rewrite) | **PR #104, rebased onto `ironwood-main`, MERGEABLE** | Unparked — it is the base the head-of-line fix needed (the `select!` loop hosts the non-blocking retry arm) | +| **head-of-line priority** (`sync.rs` gate + non-blocking backoff + `pool.route_inv.*` counters) | **built + benched + shipped; PR #105 on #104** | **Ship. 0/13 stalls, no regression.** Mitigation, not cure — see §6 | + +--- + +## Plan + +### 1. Ship now — the measured restart-thrash fix (`zebrad`-focused) +- **PR A2 + F1 + Q4 + typed `NotFoundClass`.** +- Scope the claim honestly: *eliminates cold-start head-of-line restart-thrash* (`restart_waits → 0`). + It does **not** fix the peer-bound worst case — that's separate, deferred work. + +### 2. Defer (hold, don't delete) — until isolated +- **PR C** (`handshake.rs` multi-block inv registration) — hold for a clean A2-vs-A2+C bench; its + routing benefit is unconfirmed (`busy=0`) and it carries a registry-bloat tradeoff. +- **Config tweaks** (2000 / 150 defaults) — leave defaults unchanged in the shipped PR; bench + separately later, and only in the *smaller-buffer / head-of-line-priority* direction the stall + evidence points to, not bigger-for-throughput. + +### 3. Next investigation — one cheap debug run (decides PR C + worst-case lever) — ✅ ANSWERED (§6) +- Re-run with `route_inv` arm counters (**"no ready peer" vs "all marked missing"**) + a + `register_inventory_status` registration counter. +- Settles: does PR C actually populate advertisers (`busy=0` lead)? Is the stall saturation, marking, + or genuine absence? → tells us whether the worst-case lever is **head-of-line priority**, **peer + acquisition**, or a **PR C fix**. +- **RESOLVED (2026-06-17):** the `pool.route_inv.*` counters were built (replacing the broken + `zcash.net.*` ones — see PR C fate §) and run. Verdict: **`no_ready=0`, `all_missing` 12–19× → the + stall is marker-staleness, and the lever is head-of-line priority / the inventory registry.** Not + saturation, not genuine absence, not peer acquisition. Full analysis and the shipped fix in §6. + +### 4. Prioritized build — pre-fetch producer +- Rebase the `zebra-pr3` eager-prefetch onto the current stack → build → bench (N≥3). +- Targets the steady-state sawtooth (overlap the next FindBlocks hash-fetch with downloads so the + buffer stays fed). *Caveat:* steady state is already healthy; this is a steady-state lever, and a + fuller buffer may interact with the saturation worst case — measure both throughput **and** + worst-case recovery. + +### 5. Built + benched — continuous-refill (parked as draft PR #104) +Built on top of the pre-fetch producer: replaced the sequential `try_to_sync_once` (drain → extend → +dispatch) with a single `tokio::select!` `sync_round` overlapping draining completed downloads, one +in-flight tip extension (`build_extend`, a self-contained refactor of `discover_extend_hashes`), and +dispatch. Branch `sync-continuous-refill`, draft PR #104 against `fix-sync-restart-thrash`. +(Implementation note: the optional-extension `select!` arm must use `OptionFuture`, not a guarded +`.expect()` — `select!` evaluates a branch expression even when its `if` precondition is false, which +panicked the first benched build within ~40s. Fixed; unit tests didn't catch the interleaving.) + +**Result — throughput increased and became less variable, but the head-of-line stall persists.** +- **Post-first-commit rate** (Δblocks ÷ (wall − escape); factors out cold-start peering), N=3 healthy + draws each, identical config (`checkpoint_verify=1500`, `download=150`): + - pre-fetch only: **22.8 / 20.9 / 17.2** blk/s (median 20.9, wide — one weak draw) + - + continuous-refill: **22.2 / 22.8 / 22.2** blk/s (median 22.2, tight) +- Throughput is **on par to slightly higher and notably less variable**; mean in-flight rises + **~1705 → ~1915 (+12%)** — the buffer is reliably fuller across FindBlocks round-trips (the intended + mechanism). 0 restarts, 0 restart_waits, no panics on the full runs. +- **But it does NOT remove the head-of-line / peer-scarcity stall.** A later batch drew a thin peer + window: two runs froze at the cold-start head-of-line block (fin stuck at 66, in_flight pinned 499, + registry-miss climbing on the 2s backoff to ~207) with **all local resources idle** — CPU ~0.01 + cores, net <0.25 MB/s, disk read 0, blkio-wait 0. Same signature as prF1 / prA2c / exp1. Refill + neither causes nor fixes it: it optimizes buffer depth, which is not the bottleneck when no peer will + serve the frontier block. + +**Bottleneck characterization (from the resource-sampled runs — `prbench_res.sh`):** +- *Healthy steady state:* **verify/commit-bound at ~22 blk/s.** Downloads run a full buffer (~1500–2000) + ahead of finalize and idle ~half the intervals against the lookahead cap; finalize is the steady + metronome (0 multi-second stalls; per-interval Δfin never zero). **Not** network/CPU/disk bound + (net <0.25 MB/s, blkio-wait 0, disk read 0) — most likely the serial state-writer (per-input UTXO + reads + ordered commit), ~45 ms/block. +- *Thin-peer draw:* **peer-availability-bound** (head-of-line block unservable), everything local idle. + +**Verdict: parked (draft PR #104).** The steady-state win is real but small and on the non-bottleneck +(download) side; the true levers are verify/commit serialization (healthy) and head-of-line / peer +acquisition (worst case), both untouched by refill. Keep the draft for if/when downstream work makes a +fuller pipeline matter. + +--- + +## 6. Head-of-line-priority fix — BUILT, BENCHED, SHIPPED (PR #105) — 2026-06-17 + +Built on `sync-continuous-refill` (#104). Stack after rebase: `ironwood-main` (`3a5035904`, #102 +squash-merged) → #104 (`c4672eed0`) → #105 `fix-sync-head-of-line-priority` (`3a385b862`). + +**What it does** (confined to `sync.rs` + read-only counters in `set.rs` — no DoS-sensitive routing +change, per the locked decision): +- **Part A — diagnostics (`route_inv`):** four `pool.route_inv.*` counters (advertiser / maybe / + notfound.no_ready / notfound.all_missing). Uses the `pool.*` prefix that scrapes correctly — this + **fixes the exp1 counter-export bug** (the old `zcash.net.*` names never exported; see PR C fate §). + Verified live: all series appear and increment. +- **Part B — fix:** while a required block is registry-missing (`registry_miss_retry` *map* non-empty), + (B1) pause new speculative dispatch, and (B2) move the 2s backoff out of the inline blocking `sleep` + into a non-blocking `biased` `select!` timer arm so the loop keeps draining/extending during the wait. + +**KEY FINDING — the stall is inventory-marker staleness, NOT saturation.** This answers §3 and corrects +the original "self-saturation" root cause. Across all instrumented runs: + +| run | no_ready | all_missing | +|---|---|---| +| hol1 / hol2 / hol3 | 0 / 0 / 0 | 12 / 12 / 3 | +| h7_2 (worst) | 0 | 19 | +| h7_5 / h7_6 | 0 | 2 / 1 | + +`no_ready` was **0 everywhere** — peers were never saturated; ready peers always existed. Every +synthetic miss was `all_missing`: ready peers exist but ALL are marked-missing the hash, so `route_inv` +synthesizes `NotFoundRegistry` without hitting the wire and we wait out the ~53s/106s registry rotation. + +**Implication: B2 (non-blocking backoff) is the actual fix; B1 (the saturation gate) targets a +condition (`no_ready`) that never fires** — cheap/defensive, may marginally cut marker creation, but +not what recovers the stall. base (continuous-refill, no HOL) still has the inline blocking `sleep`, so +on a miss it freezes the loop and re-hits the same marked peers ~200× (base3: registry_miss→201) +without progress; hol keeps the loop running so by retry the marker has aged / a different peer is ready. + +**Benchmark** (420s fork-runs, thin-peer regime; `zebrad-hol` vs `zebrad-refill`): +- **base 1/6 stalled** (base3: ~300s freeze, 0.5 blk/s, registry_miss→201; base4/5 partial). +- **hol 0/13 stalled** (3 unconstrained + 3 A/B + 7 consecutive). Recovers from all 12–19 `all_missing` + events per run by waiting them out. +- **No throughput regression:** both ~20–22 blk/s on healthy draws (hol 20.0–21.6; A/B tbase≈thol ~21.6). +- *Caveats:* stall is peer-draw-dependent, so no controlled same-peers A/B was achievable; base lacks + Part A counters, so its stall mechanism is inferred (near-certainly the same `all_missing`). + +**Resource characterization (steady state): commit-bound, not download/peer-bound.** `in_flight` pins +at the 1500 cap with `reserve` at 996 (both buffers full) while the instantaneous rate decays 33→16 +blk/s over a run — the signature of a rising per-block state-commit cost (RocksDB growth + +note-commitment trees), i.e. the serial finalized writer + disk, not verification CPU. Confirms the §5 +"verify/commit-bound at ~22 blk/s" reading. + +**Is #105 the right solution, or is there a better one? (analysis 2026-06-17)** — #105 is correct and +the safe thing to ship, but it is a **mitigation** (waits out marker staleness), not a cure. The counter +data points to more-direct alternatives: +1. **Registry-marker fix (most root-cause):** in `InventoryRegistry` — a targeted `clear_missing` for + the starved head-of-line hash, faster marker expiry, or make `NotFoundRegistry` non-terminal for the + critical block. Eliminates `all_missing` at the source. *Downside:* DoS-sensitive peer-set code (the + marker exists to avoid hammering peers that lack a block); needs its own bench + DoS review. This is + the documented follow-up direction (cf. PR C fate §). +2. **Shrink the lookahead buffer (cheapest, data-supported):** since steady state is commit-bound with + `in_flight` pinned at 1500, dropping `checkpoint_verify_concurrency_limit` to ~300–500 costs ~zero + throughput (the consumer can't go faster) and cuts the speculative-request volume that creates the + missing-markers. Composes with #105 and could let us drop the B1 gate. Static knob vs. the dynamic gate. +3. **#105 as shipped:** proven, low-risk, confined to `sync.rs`, no DoS-sensitive change. + +**Recommendation:** ship #105; if going further, test buffer-shrink (#2) first (free on throughput, +attacks the cause); treat the registry-marker fix (#1) as a deferred, separately-reviewed follow-up +only if the residual `all_missing` micro-stalls ever become user-visible (currently they don't — +0 stalls, full throughput). Harness: `/root/wal-bench/{prbench_thin,run_hol7,analyze_hol}.sh`. + +--- + +## Reference +- `route_inv` — `zebra-network/src/peer_set/set.rs:991`; synthetic `NotFoundRegistry` at `:1058`. +- Advertiser registration (PR C) — `zebra-network/src/peer/handshake.rs:~1230`. +- Inventory rotation governor — `INVENTORY_ROTATION_INTERVAL=53s` (`constants.rs:145`). +- Retry state machine — `zebrad/src/components/sync.rs` `handle_block_response_with_missing_retry`. +- Pre-fetch producer — `zebra-pr3` worktree: `EXTEND_PREFETCH_WATERMARK`, `discover_extend_hashes`. + +## PR C fate (inventory-routing experiment) + +**Status — two separate decisions, only one of which is made:** +- **DECIDED (deployment):** PR C is *excluded from the production PR* (#102) and parked on the local + branch `experiment-inventory-routing`. The production thrashing fix (PR A2 + F1 + Q4 + typed accessor) + is the measured, robust win and ships on its own. +- **NOT DECIDED (keep vs drop):** whether PR C is ultimately worth keeping is **unresolved**. The + decisive fate-check was inconclusive (see below), so we have neither confirmed a benefit nor proven + it inert. It stays parked, blocked on one cheap verification step before a keep/drop call can be made. + +**What we know about PR C (register FindBlocks-reply blocks as advertised inventory):** +- **Registration works at least sometimes.** With phase-1 layered on top (prF2/prF3, 22-25 peers), the + `busy` path engaged heavily (806/440) — which can only happen if `route_inv` saw `has_advertiser = + true`, i.e. PR C had registered the FindBlocks responders as advertisers. +- **Standalone benefit is unconfirmed.** Without phase-1, PR C only adds "route to a *ready* advertiser + in Step 1, else guess" — no funnel, but the prC matrix win (~8.4k, 0 restart-waits) is attributable to + PR A2's pipeline-preserving retry, not PR C's routing. We never isolated a clean A2-vs-A2+PR-C delta. +- **The counter-instrumented fate-check (exp1) is a measurement bug, now CONFIRMED — not evidence + about PR C.** A cross-check of the exp1 CSV settles it: the sync-side `registry_miss` climbed to + **203**, while the network-side `route_inv.registry_miss` stayed **0** for the whole run. Every one of + those 203 misses is synthesized *inside* `route_inv` at exactly the line that increments the network + counter, so it must have fired ~203 times — reading 0 proves the new `zcash.net.*` counters are not + exporting under the scraped names (a metric-name/registration bug in the diagnostics), **not** that PR + C is inert. So exp1 tells us nothing about PR C either way; the earlier prF2/prF3 evidence (busy + engagement) still says registration does fire. + +**Tradeoff that keeps it out of production:** PR C registers block hashes from multi-item invs, which the +original code deliberately skips to avoid inventory-registry bloat ("a query reply… the whole network has +it"). It also touches a DoS-sensitive peer-set routing path. Without a confirmed throughput benefit, that +tradeoff isn't justified in the production change. + +**To resolve PR C's fate (still OPEN), on `experiment-inventory-routing`:** +1. Fix the diagnostic counters — they are **confirmed broken** (export under a different name than + `zcash_net_route_inv_*`/`zcash_net_inv_queried_block_registered`, or aren't registering). One 60s + live `curl /metrics` reveals the real names; correct the harness scrape. +2. Then run A2 (production) vs A2+PR C on a **rich-peer** draw (variance matters — a thin draw stalls + and is uninformative; use N≥3 and only trust good-peer draws), watching `route_inv.advertiser` vs + `route_inv.registry_miss` and `inv.queried.block.registered`. PR C is worth keeping only if + `advertiser` climbs and `registry_miss` drops materially. If `registered ≈ 0` on a good draw, PR C + is genuinely inert (a registration-gap to fix or abandon). + +**Bottom line: PR C's keep/drop fate is NOT decided.** It is excluded from PR #102 and parked; the one +experiment meant to decide it was invalidated by a counter bug, so neither benefit nor inertness is +established. + +Phase-1 (`AdvertisersBusy`) is **not** revived for this: §7 showed its exclusive gate funnels concurrency +and regresses throughput ~20×. Any future routing work should use the phase-2 parking-queue design +(prefer the advertiser without blocking other requests), not the exclusive gate. + +## PR 3 fate (eager hash prefetch) — DROP (confirmed no-op) + +**Decision: DROP.** PR 3 (split `extend_tips` into `discover_extend_hashes` + a thin wrapper, and top +up the hash reserve whenever it falls below `EXTEND_PREFETCH_WATERMARK=500` instead of only when it +hits zero) has **no observable effect** in this configuration. Changes kept local & uncommitted on +`sync-prefetch-producer`, not proposed for merge. + +**Throughput A/B (prefetch vs `fix-sync-restart-thrash` baseline, post-escape rate, healthy draws):** +prefetch median **20.7 blk/s** (17.0 / 20.7 / 22.5) vs baseline **20.8 blk/s** — no difference. + +**Mechanistic trace (free, from the existing 5s CSVs — pref2 vs fixb2, same config, both healthy):** +- `sync.reserve.depth` = **0 at 100% of samples** for *both* — the lookahead (1500) ≫ a FindBlocks + batch (~500), so each extend batch is fully dispatched into `in_flight` the same iteration and the + reserve (overflow) never accumulates, with or without prefetch. Prefetch fires every iteration; it + just has nothing to accumulate. +- `sync_downloads_in_flight` = **identical** (pref avg 1697, range 1499–1951; baseline avg 1712, range + 1499–1952). It oscillates between the lookahead floor and the overflow allowance and **never + approaches 0** in either. + +**Root cause:** `in_flight` is **bound by the lookahead cap (1500), not by hash availability**. Prefetch +makes more hashes available, but the buffer is already pinned at the cap, so nothing changes. The +sawtooth-to-0 PR 3 was designed to fix is a **cold-start / pre-A2** phenomenon; it does not occur in +healthy steady state with lookahead=1500 — the deep buffer already absorbs the FindBlocks round-trip. +(5s sampling is adequate here: a 1500-deep buffer draining at ~20 blk/s cannot reach 0 within 5s, so a +sub-sample dip to zero is physically impossible.) + +**Corollary:** the heavier continuous-refill `select!` event-loop is **also not worth building** — the +bottleneck was never hash-feeding. Post-escape steady state is verify-bound (equihash), not +network/hash-bound. + +## Full benchmark matrix (all runs, post-escape metrics; cold-start removed) + +Post-rate = (final_height − escape_height) / (run_end − escape_time). Draw flagged STALLED on +`registry_miss ≥ 50` (peer-scarcity, network-bound). 7-min fork-runs from height 1,707,210. + +| run | config | escape | Δblocks | post-rate (blk/s) | restart_waits | reg_miss | draw | +|---|---|---|---|---|---|---|---| +| base1 | baseline | 110s | 6185 | 20.0 | 11 | – | healthy | +| base2 | baseline | 20s | 8480 | 21.2 | 6 | – | healthy | +| base3 | baseline | 181s | 4998 | 20.7 | 17 | – | healthy | +| base4 | baseline | 25s | 6483 | 16.4 | 16 | – | healthy | +| base5 | baseline | 35s | 7894 | 20.3 | 9 | – | healthy | +| base6 | baseline | 40s | 8506 | 22.3 | 0 | – | healthy | +| base7 | baseline | 15s | 5942 | 14.6 | 19 | – | healthy | +| base8 | baseline | 35s | 8448 | 21.7 | 6 | – | healthy | +| prA2 | PR-A2 | 15s | 8649 | 21.3 | 0 | 1 | healthy | +| prA2b | PR-A2 | 40s | 8461 | 22.0 | 0 | 19 | healthy | +| prA2c | PR-A2 | 51s | 222 | 0.6 | 0 | 203 | STALLED | +| prC1 | A2+C | 35s | 8331 | 21.5 | 0 | 45 | healthy | +| prC2 | A2+C | 36s | 8568 | 22.1 | 0 | 1 | healthy | +| prC3 | A2+C | 35s | 8352 | 21.6 | 0 | 2 | healthy | +| prE1 | A2+C+rot20 | 40s | 8414 | 22.2 | 0 | 0 | healthy | +| prE2 | A2+C+rot20 | 30s | 8599 | 22.0 | 0 | 1 | healthy | +| prE3 | A2+C+rot20 | 35s | 8489 | 21.9 | 0 | 26 | healthy | +| prF1 | A2+C+F1+phase1 | 35s | 3191 | 8.0 | 0 | 162 | STALLED (registry) | +| prF2 | A2+C+F1+phase1 | 45s | 494 | 1.2 | 0 | 0 | STALLED (busy funnel, busy=806) | +| prF3 | A2+C+F1+phase1 | 45s | 494 | 1.2 | 0 | 0 | STALLED (busy funnel, busy=440) | +| prG1 | A2+F1+Q4 (final) | 35s | 8602 | 22.1 | 0 | – | healthy | +| prG2 | A2+F1+Q4 (final) | 35s | 8476 | 21.8 | 0 | 2 | healthy | +| exp1 | A2+C+PRc+counters | 35s | 222 | 0.4 | 0 | 203 | STALLED | +| pref1 | A2+F1+Q4+prefetch | 55s | 8271 | 22.5 | 0 | – | healthy | +| pref2 | A2+F1+Q4+prefetch | 35s | 8006 | 20.7 | 0 | – | healthy | +| pref3 | A2+F1+Q4+prefetch | 40s | 6510 | 17.0 | 0 | – | healthy | +| fixb1 | A2+F1+Q4 baseline | 101s | 2007 | 6.2 | 0 | 150 | STALLED | +| fixb2 | A2+F1+Q4 baseline | 35s | 8075 | 20.8 | 0 | – | healthy | + +(`fixb3` died on startup, transient; `smoke` was an early thin-draw never-escape.) + +**Reading the matrix:** +- **The validated win (A2 / A2+F1+Q4):** `restart_waits = 0` on every run vs baseline's 6-19 — the + thrash-elimination is the robust, binary-attributable result. Post-rate ~21-22 blk/s, comparable to + baseline's healthy draws but without the cold-start restart thrash. +- **STALLED draws are peer-scarcity, not binary-attributable:** they appear under four different binaries + (PR-A2, A2+C, A2+C+PRc, prod-baseline) at `reg_miss` 150-203 / ~0.4-6.2 blk/s, while the *same* + binaries run clean on good draws. The stall is "no connected peer serves the head block" — network- + bound, not fixable by sync-logic changes (PR B eviction is contraindicated: can't evict from a thin + peer set). +- **phase-1 (prF2/prF3)** is the one binary-attributable regression: the `AdvertisersBusy` exclusive + gate funnels concurrency (busy=806/440) → ~1.2 blk/s even on non-thin draws. Reverted. +- **prefetch (pref*)** ≈ baseline (fixb2) — no effect, per the no-op analysis above. + +--- + +## §8 — The 20 blk/s ceiling is note-commitment-tree updates on the serial writer (2026-06-17) + +**Question:** on `fix-sync-head-of-line-priority`, healthy steady state sits at ~20 blk/s. What is +the constraint, and why can't it go higher? + +**Method:** fresh resource-sampled run (`prbench_res.sh`) from the 1.7M snapshot confirms it is a +**single serial thread**, not any hardware resource. Then an instrumented build (`zebrad-hol-instr`, +7 new phase histograms, see `/root/wal-bench/writer-phase-instrumentation.patch`) split the per-block +serial commit cost. Scrape: `/root/wal-bench/phase_scrape.sh`. + +### Macro: not resource-bound (res_holinstr.csv / res_holres1.csv, steady state) +| Resource | Measured | Verdict | +|---|---|---| +| CPU | **1.1–1.7 / 8 cores** | not aggregate-CPU-bound (7 cores idle) | +| Block-I/O wait | **0.00 s** | not disk-bound | +| Physical disk reads | **0.0 MB/s** (page-cache served) | not read-bound | +| Disk writes | 25 MB/s | trivial | +| Net RX / TX | 9.2 / 4.1 MB/s | not bandwidth-bound | +| `sync_downloads_in_flight` | ~1600–2000 (buffer full) | downloads far ahead; writer is the metronome | + +### Micro: per-block serial-writer breakdown (N=5297 blocks, instrumented run = 17.2 blk/s; sum=56.2 ms/block reconciles exactly) +| Phase (serial finalized-writer thread) | ms/block | % serial | +|---|---|---| +| **`update_trees_parallel`** (Sapling/Orchard note-commitment Merkle trees) | **40.9** | **72.7%** | +| `block_commitment_is_valid_for_chain_history` (ZIP-244 chain-history check) | 10.8 | 19.1% | +| `write_block` total (ALL RocksDB work) | 4.5 | 8.0% | +| · db.write (rocksdb commit — the only previously-timed part) | 2.5 | — | +| · prepare_block_batch | 1.0 | — | +| · address-balance reads | 0.45 | — | +| · per-input UTXO/output_location reads | 0.40 | — | +| `history_tree.push` (sapling/orchard root) | 0.1 | 0.2% | + +**~92% of serial commit time is CPU crypto** (tree update + commitment check). All RocksDB I/O — +including the per-input UTXO reads the RUNBOOK had fingered — is **<4.5 ms (8%)**. This **overturns +the prior working hypothesis** (serial state-writer DB / UTXO reads). + +### Root cause (architectural) +`commit_finalized_direct` Checkpoint arm (`finalized_state.rs:366`): *"Checkpoint-verified blocks +don't have an associated treestate"* — so `update_trees_parallel` + the commitment check run **inline +on the single finalized-writer thread**, with zero overlap (block N+1 cannot start until N's full +~56 ms completes). In the semantic/non-finalized path the same `update_trees_parallel` runs during +contextual validation (`chain.rs:1482`), off the commit critical step. The checkpoint verifier (1500 +concurrency) validates blocks in parallel but **skips treestate**, dumping the most expensive op onto +one thread. `update_trees_parallel` already parallelizes *across* the 3 trees (rayon, 4 tasks), so the +~41 ms is after cross-tree parallelism → **whichever pool the spam is in at that height dominates** and +is sequential. (Correction: the dominant pool **varies by range**, not "always Sapling" — see §13.) + +### Levers to break past ~20 blk/s (not yet built) +1. **Parallelize *within* a tree update**: leaf commitment hashing (Pedersen/Sinsemilla) for all of a + block's outputs across the 7 idle cores before the sequential frontier merge — the likely big win. +2. **Pipeline the writer** (tree-update stage ahead of db-commit stage): overlaps only the ~2.5 ms + commit — small. +3. **Compute treestate ahead of the writer in a dedicated sequential pre-stage** fed by the parallel + checkpoint verifier — hides nothing on its own (it IS the bottleneck) unless combined with (1). + +Artifacts: instrumented binary `/root/wal-bench/zebrad-hol-instr`; phase scrapes +`/root/wal-bench/phase_holinstr_final.txt`; patch `writer-phase-instrumentation.patch`. + +--- + +## §9 — Part 1 implemented: overlap commitment-check with tree update (2026-06-17) + +Worktree `/root/zebra-hol-pr`, branch `sync-checkpoint-commit-parallel` (off `fix-sync-head-of-line-priority`). +In `commit_finalized_direct`'s Checkpoint arm, `update_trees_parallel` and +`block_commitment_is_valid_for_chain_history` now run concurrently via `rayon::in_place_scope_fifo` +(tree update on the in-place thread, commitment check spawned), joining before `history_tree.push`. +The commitment check reads only the parent history tree, so it is independent (confirmed in `check.rs`). + +**Measured (zebrad-part1, 5,604 blocks at steady state, within-run so peer-independent):** +- `checkpoint_compute` WALL = **30.5 ms/block** ≈ `update_trees` component alone (30.4 ms) → the + commitment check (8.4 ms) is **fully hidden** by the overlap. +- Sequential sum would be 30.4 + 8.4 + 0.1 = 38.9 ms → actual 30.5 ms = **~8.4 ms/block saved (~21% of + the compute phase)**. +- Throughput 26.4 blk/s; CPU still ~1.97/8 cores; db.write now only ~1.7 ms/block. + +**Implication for the plan:** db.write is tiny (~1.7 ms), so Part 2 (pipeline write off the writer) +now buys at most ~write_block (~4.5 ms) of overlap — modest. The remaining serial wall is +`update_trees` (~30 ms = ~31 blk/s ceiling), so **Part 3 (parallel batch Sapling append) is the only +real lever past ~30 blk/s.** (Note: the ~30 ms here vs ~41 ms in the §8 baseline run is cross-run +variance — different machine/cache state; the §8 vs §9 numbers are not directly comparable, which is +why the Part 1 proof uses the within-run sequential-sum-vs-wall comparison instead.) + +--- + +## §10 — Part 3 premise CONFIRMED: Sapling append is parallelizable (2026-06-17) + +Micro-benchmark (`zebra-chain parallel::tree::part3_premise_bench`, release, ~1.7M-leaf tree): +| N (leaves/block) | append loop | per leaf | root() | append % | +|---|---|---|---|---| +| 256 | 18.3 ms | 71.5 µs | 2.5 ms | 88% | +| 512 | 36.6 ms | 71.5 µs | 2.5 ms | 93% | +| 1024 | 73.3 ms | 71.6 µs | 2.5 ms | 97% | + +- Per-leaf append cost (this micro-bench, Sapling) is a **flat ~71.5 µs** = one Sapling Pedersen + `combine`. `root()` is a fixed **~2.5 ms** sequential floor (one combine per spine level). + (Per §13, in-node per-leaf costs measured ~74 µs Sapling / ~190 µs Orchard-Sinsemilla.) +- NOTE (corrected in §13): the leaf *count* per block was later measured directly — it is **not** a + fixed ~385, and the dominant pool **varies by range** (Orchard ~87/block at 1.709M; Sapling ~255/block, + peaks ~1.6k, at 1.724M). Do not treat the timing-derived "~385 sapling" estimate as authoritative. +- **Append dominates (88–97%) and is the parallelizable part.** Parallelizing the per-leaf combines + across 7 cores: ~27.5 ms → ~4 ms, + 2.5 ms root ≈ **~6.5 ms/block** (tree-update side), ~4–5×. + +**Design (Part 3, in progress):** parallel batch frontier append — decompose the block's new leaves into +aligned perfect subtrees, compute their roots via rayon parallel reduction (independent `H::combine` per +level), then fold into the frontier's ommers (sequential, O(log N), ~2.5 ms). Consensus-critical: +must reconstruct `NonEmptyFrontier (position, leaf, ommers)` byte-identically. Safety net: differential +proptests vs the serial `append` asserting identical `into_parts()`, `root()`, and +`completed_subtree_index_and_root` events over random tree sizes × batch sizes, before any production wiring. + +--- + +## §11 — Part 3 implemented: parallel batch note-commitment-tree append (2026-06-17) + +`zebra-chain/src/parallel/batch_frontier.rs`: generic `parallel_append` for any +`incrementalmerkletree::Frontier` (so Sapling, Orchard, Sprout share one implementation). Algorithm: +rebuild the pure binary-counter forest from the frontier's ommers, inject the old tip leaf, then append +the new leaves (except the last, kept raw) as globally **position-aligned dyadic blocks** — each block's +root computed by a `rayon::join` parallel reduction — injected in ascending order (aligned blocks compose +with no cross-boundary re-pairing, which is what makes the parallel result exact). + +Wired in via `NoteCommitmentTree::append_batch` on Sapling and Orchard, called from +`update_{sapling,orchard}_note_commitment_tree`. Subtree (2^16) completion tracking preserved by +splitting the batch at the at-most-one subtree boundary per block. + +**Correctness (consensus-critical):** +- Differential proptests vs sequential `Frontier::append`: 2000 random (prefix × batch) cases + exhaustive + 40×40 sweep — identical root AND identical frontier parts. Test node `combine` is order- and + level-sensitive to catch swaps/level bugs. (First implementation, a half-split divide-and-conquer, was + caught wrong by the proptest at `prefix=0,batch=7` — ragged-boundary re-pairing — and replaced.) +- Full `zebra-chain --lib` suite: 259 passed, 1 failed = only the pre-existing date-dependent NU7 test + (fails identically on clean base). Known-answer note-commitment-tree root vectors + subtree tests pass. + +Next: build + benchmark (expect tree-update phase ~30 ms → single digits, CPU > 2 cores). + +## §12 — Part 3 benchmark results (2026-06-17) + +Bench from the 1.7M snapshot, peer-independent phase times (instrumented) + throughput/CPU. + +| Metric | Baseline (§8) | Part 1 (§9) | Part 3 seq-blocks | Part 3b par-blocks | +|---|---|---|---|---| +| `update_trees` ms/blk (peer-independent) | ~30 | ~30 | 18.4 | **16.5** | +| `checkpoint_compute` WALL ms/blk | ~52 (sum) | 30.5 | 18.6 | **16.9** | +| throughput blk/s | 17–22 | 26 | 32 | **42** | +| mean CPU /8 | 1.1–1.7 | 2.0 | 2.7 | **3.3 (peak 4.4)** | + +Part 3 = parallel batch note-commitment append (Sapling+Orchard). Part 3b adds `par_iter` across the +dyadic blocks (compute all block roots concurrently, each reduction internally parallel too). + +**Robust claim:** `update_trees` (peer-independent) ~30→16.5 ms (~1.8×); CPU and throughput ~doubled. +Not the theoretical ~5×: each block's ~27 ms of Pedersen work is a brief burst contending with the +verification pipeline on the shared rayon pool, plus the ~2.5 ms sequential `root()` per tree and the +sequential dyadic-block injection. Diminishing returns past here. + +**Overall stack (Part 1 + Part 3b):** checkpoint-zone steady state went from a single-core ~17–22 blk/s +to ~42 blk/s using ~3.3/8 cores, with byte-identical tree roots (proptests + known-answer vectors). +Part 2 (pipeline) remains parked; with db.write at ~1.9 ms it's still low-value. + +--- + +## §13 — CORRECTION: per-block output composition varies by range (2026-06-17) + +Earlier sections assumed "Sapling dominates." Direct measurement (commitment-tree size delta via +`z_gettreestate`, self-validated against the orchard nullifier counter + `getblock` shielded arrays — +three independent methods) shows the **spam pool flips by height range**: + +| range | sapling outputs/block | orchard outputs/block | note | +|---|---|---|---| +| 1,709,000–1,710,999 | 0.7 | **86.7** | Orchard sandblasting | +| 1,724,000–1,725,000 | **254.6** | 0 | Sapling sandblasting (peaks ~1,649/block) | + +- **Method:** parse the serialized `finalState` commitment tree (`left`/`right`/`parents` ⇒ leaf count) + at the two heights; the delta is the exact outputs added. Orchard delta matched the independent + nullifier counter and `getblock` exactly, validating the parser, so the Sapling delta is trustworthy. +- **Per-leaf cost by pool (in-node):** Sapling/Pedersen ~74 µs/leaf; Orchard/Sinsemilla ~190 µs/leaf + (~2.5× heavier). So a Sapling-spam block (~255 leaves) and an Orchard-spam block (~87 leaves) land at + similar `update_trees` cost (~17–19 ms) via different mixes. +- **Implication:** the parallel batch append (Part 3) is generic over the pool, so it covers both. But any + per-leaf/per-block cost model must use the actual pool mix of the range under test, and the leaf count + is highly variable (0 → ~1,650/block) and bursty. The timing-derived "~385 sapling/block" in §10 is + superseded by these direct counts. + +--- + +## §14 — Parallelism shortfall DIAGNOSED: global rayon contention (2026-06-17) + +Isolated release-mode probe of `parallel_append` against **real Sapling and Orchard hashing** (batch +128–2048 × `RAYON_NUM_THREADS=1,2,4,8`; probe removed afterward, tree clean): +- **8 threads → ~6.7–7.4 effective cores** for 1024–2048 leaves, **both pools**. The reduction scales. +- 1-thread parallel ≈ sequential → no task-overhead regression. +- Local pool ≈ global pool *in isolation* (no other load). + +In-node, the same code runs at only ~1.6 effective cores (heavy-Sapling `update_trees` ~137 ms for +~1,850 leaves ≈ sequential). ⇒ The bottleneck is **global rayon pool contention/scheduling +interference** — `update_trees_parallel` nests Sapling+Orchard tasks plus `parallel_append`'s internal +rayon work, all on the **global** pool, contending with the download/verify/checkpoint pipeline. + +**Decision: prioritize a dedicated tree-update rayon pool (pool isolation), NOT `parallel_append` +algorithm tuning.** Final confirmation owed: full-node dedicated-pool A/B (isolation proves the ceiling; +A/B proves it's realized in-node). See `PARALLEL_IDEA.md` next-step #1. diff --git a/COMMIT_OPTIMIZE.md b/COMMIT_OPTIMIZE.md new file mode 100644 index 00000000000..2add01b8fcd --- /dev/null +++ b/COMMIT_OPTIMIZE.md @@ -0,0 +1,96 @@ +# Committer / sync throughput optimization + +Where the checkpoint-sync throughput bottleneck actually is, the three highest-impact +improvements, and one architectural recommendation. Grounded in instrumented runs over the +sandblast region (~1.7M), not inference. + +## The measured bottleneck (steady-state, blocks 1.715M–1.728M) + +The finalized **committer is the binding constraint** — confirmed by direct utilization + +queue-depth instrumentation, not guessed from per-phase profiling: + +| signal | value | reads as | +| --- | --- | --- | +| committer utilization | **89% busy** | the committer is the gate, not idle | +| committer input queue depth | **937 blocks** backed up | upstream delivers faster than it commits | +| poll-empty fraction | 13% | rarely starved for input | +| commit time / block | 12.98 ms (~77 blk/s capacity) | — | +| update_trees (within commit) | 8.98 ms = **69% of the commit** | the dominant slice | +| equihash / merkle (serial verifier) | 0.42 / 0.03 ms | feed/verifier ruled out | +| download rate | ~60 blk/s | the *next* gate, just behind | +| throughput | 68.3 blk/s | committer draining its buffer | + +Key facts: +- The single-threaded committer does, per block in order: note-commitment tree update + + write-batch build + RocksDB write + history-tree push. Tree update is **69%** of it. +- The "feed" (download → verify) is **not** the bottleneck here: the serial verifier + (equihash + merkle) is ~0.5 ms, and blocks are backed up 937-deep at the committer's input. +- The committer's capacity (~77 blk/s) is only slightly above the **download rate (~60 blk/s)**, + so once the committer is sped up, the gate shifts to download bandwidth. The two are close, + which is why the bottleneck kept appearing to move between runs (it depends on how fast blocks + are being delivered, which varies with peers/conditions). + +Earlier confusion (recorded for honesty): a first A/B of improvement #1 showed flat throughput, +because that run happened to be in a download-limited regime (committer had slack). Per-phase +profiling tells you where time goes *within* a stage; only utilization/queue-depth instrumentation +(or a controlled A/B in the right regime) identifies the binding stage. The numbers above are from +that instrumentation. + +## Top 3 highest-impact improvements (ranked) + +### 1. Note-commitment tree precompute off the committer — highest ROI, already built (PR #144) +Move the tree's per-leaf Merkle hashing (Pedersen/Sinsemilla) off the serial committer: precompute +it ahead of time, keyed only on the cumulative note count, concurrently across many blocks on the +idle cores; the committer then only "grafts" the precomputed subtree roots (O(log N)). +- Cuts `update_trees` ~9 ms → ~4 ms, i.e. removes ~69% of the committer's per-block cost; committer + capacity ~77 → ~120 blk/s. +- Validated byte-identical to the inline append (differential proptests); env toggle for A/B. +- Status: implemented and PR'd against `sync-perf-main-2` (draft). Attacks the proven gate directly. + +### 2. Shrink the committer's *remaining* work: multi-block RocksDB commit + overlap the DB write +After #1, the committer's cost is dominated by the write path (batch build + RocksDB write + +history push, ~4 ms). Commit several blocks per RocksDB write batch (amortize per-commit overhead, +which grows with DB size), and overlap block N's disk write with block N+1's prepare. +- Pushes the committer toward the rocksdb-write floor; compounds with #1. +- Note (from a separate investigation): RocksDB had **zero write stalls** and the WAL is async, so + the win here is fewer/larger writes and less memtable-insert overhead, *not* WAL removal. + +### 3. Raise the download ceiling for large sandblast blocks (~60 blk/s — the next gate) +Once the committer is no longer the gate, download bandwidth (~60 blk/s) is the steady-state limit. +`in_flight` sits ~1026 (below the 1500 cap) yet completes only ~60/s → ~17 s effective per-block +latency: latency/concurrency-bound, not capped. More concurrent block-body requests, better peer +selection, and pipelined body fetch raise the durable ceiling. +- Medium-high ROI because it is the *steady-state* limiter after #1 and #2. + +## Architectural recommendation: parallel-prepare / thin-serial-commit + +The structural ceiling is that the finalized committer is a single serial thread doing +tree-update + batch-build + RocksDB-write + history-push per block, in order. Re-architect the +finalized commit into two stages: + +- **Prepare (parallel, many blocks ahead, off the critical path):** everything that depends only on + the block and its position, not on the live DB write — tree hashing (#1 does this), write-batch + build, serialization, address/UTXO index prep. +- **Commit (serial, minimal):** only the strictly-ordered work — the atomic RocksDB write and tip + advance. + +This is the correct version of the idea behind the parked "any-order commit pipeline" prototype +(PR #129). #129 split at the wrong seam (it overlapped the *tree compute* with the write) and was +measured when the box was CPU-saturated (~7.75/8), so it showed no gain. After the crypto wins the +box runs at ~3/8 (5 idle cores), and #1 makes the tree compute nearly free — so the right seam is +**prepare ‖ serial-write**, not tree-compute ‖ write. + +With prepare fully parallelized and commit reduced to the RocksDB write + multi-block batching, the +serial committer shrinks several-fold and the system-wide bottleneck moves cleanly to **download +bandwidth** — the honest physical floor for chain sync (you cannot validate faster than you fetch). + +**Direction:** #144 vs #129 is not a real choice — #144 is the better mechanism (it *reduces* the +dominant cost rather than redistributing it, and it makes #129's specific overlap moot). Land #144, +then pipeline the *write* (not the tree), then attack downloads. One-liner for the team: *#144 +removes the bottleneck; #129 only rearranged it. Land #144, then pipeline the write, not the tree.* + +## Suggested sequencing + +1. Merge #144 → re-measure; the committer gate should narrow and shift toward downloads. +2. Add multi-block commit batching + write/prepare overlap (improvement #2). +3. Decide between further committer work vs download parallelism based on which is then closer. diff --git a/CPU_PROFILE_RESULTS.md b/CPU_PROFILE_RESULTS.md new file mode 100644 index 00000000000..035ec6e2ced --- /dev/null +++ b/CPU_PROFILE_RESULTS.md @@ -0,0 +1,76 @@ +# CPU profile — checkpoint sync 1.7M → 1.8M + +Goal: replace the back-of-envelope "Pedersen ≈ 30% of CPU" inference with measured data, and map where per-block CPU actually goes. + +## TL;DR + +Direct per-block stage timers (including a **new off-committer precompute timer** that captures the bulk note-commitment hashing #144 moved off the committer) show **note-commitment hashing (Sapling Pedersen + Orchard Sinsemilla) is the single dominant per-block CPU cost** — growing from ~6.5 ms/block at 1.71M to **~17 ms/block** at 1.79M, dwarfing every other commit-side stage. A hard whole-node bound puts note-hashing at **≥31% of total CPU** (likely 31–54%). So the earlier "~30%" was a *floor*, and your intuition that Pedersen is a *large* share in the Sapling sandblast is correct. + +## Methodology + +- **Binary:** stock (no-fork) `sync-perf-main-2` tip (#144 merged), instrumented `--features commit-metrics` + a new `zebra.state.precompute.compute.duration_seconds` timer wrapping `BlockNotePrecompute::compute` (the off-committer Pedersen/Sinsemilla hashing). Single fast peer so it's CPU/committer-bound. +- **Per-block stage timers** (wall-time of each stage; metrics scraped every 5s, deltas over height windows). +- **Total CPU/block** from `/proc//stat` (`res-prof.csv`). +- **perf** `-F 99 --call-graph dwarf,16384` over the 1.72–1.75M Sapling-spam window: **203,823 samples**. *(Flamegraph rendering blocked — see limitation below.)* + +## Per-block stage budget (measured, ms/block, wall-time) + +| region | precompute (note hashing) | txid+auth digest | graft (on committer) | rocksdb commit | committer total | +|---|---|---|---|---|---| +| 1.71M (early) | 6.56 | 0.83 | 7.66 | 1.88 | 12.73 | +| **1.72–1.75M (Sapling-spam)** | **10.34** | 1.19 | 3.37 | 1.72 | 12.20 | +| 1.76–1.79M (deeper) | **17.09** | 1.71 | 6.75 | 3.10 | 20.56 | + +(commitment-check is negligible, ~0.07 ms. `committer total` = graft + commitment-check + rocksdb + UTXO/address reads + batch build + history push, all serial on the committer.) + +**Read:** the **precompute** (bulk Pedersen+Sinsemilla) is the largest single stage and the one that *scales with note accumulation* — it more than doubles across the range. The committer's own serial work (graft + rocksdb + reads/batch/history ≈ 12–20 ms) is the next chunk; per-tx BLAKE2b digesting and DB commit are minor (1–3 ms each). + +## Total CPU per block, and the feed side + +Total CPU/block (`res-prof.csv`) is **~70 ms** in the (perf-inflated) Sapling-spam window and **~113 ms** in the heavier deeper window — **much larger than the ~24 ms of timed commit-side stages.** The gap is two things: +1. **Internal parallelism** — `precompute` and the txid digest use rayon, so their CPU-seconds exceed wall-time. +2. **Untimed feed-side CPU** — block **deserialization** (parsing huge sandblast blocks: many outputs, cv/epk/proof fields) and **checkpoint verification** (equihash, merkle), which my commit-side timers don't cover. + +So the per-block CPU splits roughly into **note-hashing + feed-side deserialize/verify**, with note-hashing the largest single identifiable consumer. + +## The Pedersen CPU-share question — settled (with a measured bound) + +Earlier I wrote "~30%," derived by back-calculating from the fork's 18% whole-node CPU reduction *assuming the full 2.4× micro-bench speedup*. That was a soft inference. The defensible statement: + +- **Hard lower bound: note-hashing ≥ 31% of whole-node CPU.** The sapling-crypto fork cut whole-node CPU/block ~18% (measured A/B). Since the realized speedup can't exceed the 2.4× micro-bench, `share = 0.18 / (1 − 1/speedup) ≥ 0.18 / 0.583 = 31%`. +- **If the realized in-node speedup is lower than 2.4× (likely — the fork's 60 MB lookup table loses to cache pressure in a busy node), the share is correspondingly higher:** at a realized 1.5×, share ≈ 54%. +- The stage budget corroborates a large share: precompute alone is 10–17 ms of the per-block budget. + +**Conclusion: Pedersen/note-hashing is ~⅓ to ~½ of total per-block CPU in the Sapling sandblast — a large share, not a minor one.** The "30%" was a floor, not the central estimate. + +## Bottleneck ranking (1.7–1.8M checkpoint sync) + +1. **Note-commitment Pedersen/Sinsemilla hashing** — the #1 CPU consumer (the precompute), scaling with shielded-note volume. Levers: the sapling-crypto fork (~18% whole-node), faster/SIMD hash impls upstream, dedicated pool isolation (#144 already relocated it off the serial committer). +2. **Feed-side block deserialization + checkpoint verification** — the other major chunk (the gap between timed commit stages and total CPU). The lazy cv/epk (#136) and native ZIP-244 (#131) PRs already cut this; further wins from eliminating redundant parsing. +3. **Committer serial overhead** — UTXO/address reads + batch build + history push (~7 ms inside committer total beyond graft/rocksdb). +4. **RocksDB commit (1.7–3 ms) and per-tx BLAKE2b digesting (0.8–1.7 ms)** — minor. + +## Flamegraph (partial) — function/category shares + +A second, narrower capture (`--call-graph dwarf,16384` over 1.725–1.735M, ~1.2 GB) was foldable only **partially**: the full fold stalled (same DWARF-on-248MB-binary wall), but a salvaged subset of **~5,690 samples** rendered (`flame-sapling-spam-partial.svg`). Counts are period-weighted (×1010101); shares are valid. **Inclusive** category shares (stack contains the pattern): + +| category | inclusive CPU share | +|---|---| +| Sapling Pedersen (jubjub) | **~65%** | +| RocksDB | ~8% | +| block deserialize/parse | ~5% | +| point decompression | ~1% | +| equihash | ~0.7% | +| Orchard Sinsemilla | ~0% (pure-Sapling window) | +| (rayon pool, wraps the above) | ~92% | + +**Caveats on the flamegraph numbers:** (1) partial subset; (2) the inclusive grep partly matches rayon job *type parameters*, so it conflates real Pedersen compute with pool overhead; (3) leaf self-time is dominated by `rayon ...execute` (~65%) — i.e. there is **significant rayon spin-wait** (workers busy-waiting for sibling tasks), which is itself a finding worth chasing (idle-spin burns CPU). The clean per-stage **metric budget above is the more reliable decomposition**; the flamegraph corroborates that Pedersen/note-hashing dominates. + +**Settling the Pedersen share:** the flamegraph's ~65% (even allowing for overcount) confirms Pedersen is a *large* share — well above the ≥31% floor. Combined with the fork's measured 18% whole-node CPU reduction, that implies a **realized in-node speedup of only ~1.4×** (vs the 2.4× micro-bench) — Amdahl: `0.18 = 0.65·(1−1/1.4)`. The gap is cache pressure: the fork's ~60 MB lookup table benches hot/uncontended but in a busy node is evicted to DRAM, so it realizes ~1.4× not 2.4×. (Reconciles with the wall-time budget: `precompute` is only ~10 ms *wall* because it parallelizes via rayon, but it's a large *CPU-seconds* share — which is what the flamegraph samples.) + +**Why no full flamegraph:** DWARF offline post-processing (`perf script`/`perf report`) is intractable on the full capture against the 248 MB binary (stalls); **LBR is unavailable in this VM**; a frame-pointer build (`-Cforce-frame-pointers`) + re-capture (~30 min) is the only path to a clean *complete* leaf-level flamegraph — the cheap follow-up if the exact compute-vs-spin and feed-side split is wanted. + +### Artifacts +- Metrics: `metrics-prof.prom` (full /metrics every 5s), `res-prof.csv` (CPU/throughput). Binary: `/root/wal-bench/zebrad-prof` (stock, instrumented). +- Flamegraph (partial, ~5,690 samples): `/root/zebra/flame-sapling-spam-partial.svg`. +- perf captures removed after analysis (3.2 GB / 1.2 GB DWARF — un-renderable in full; see above). diff --git a/FULL_SYNC_SUMMARY.md b/FULL_SYNC_SUMMARY.md new file mode 100644 index 00000000000..9b23d0a0d5c --- /dev/null +++ b/FULL_SYNC_SUMMARY.md @@ -0,0 +1,120 @@ +# Full mainnet sync analysis (genesis → tip) + +A full Zcash mainnet sync from genesis to the chain tip, profiled per phase with the +`commit-metrics` instrumentation, on the optimized binary and an 8-core box. This document +breaks the sync down by height range and lists the major bottlenecks. + +## Binary and methodology + +- **Binary:** `zebrad-readpar` — the proto optimization stack: native ZIP-244 digests, dropped + v5-deserialize reparse, lazy Sapling cv/epk point decompression, parallel block writer, the + #138 serialization gate, and the #140 committer read parallelization. +- **Run:** genesis → tip in a fresh state dir. Reached the max checkpoint (3,358,006) and continued + through semantic verification to the tip (~3.382M). One disk-full interruption around 1.79M was + resumed in place (RocksDB recovered); the per-block phase metrics below are committer-thread + timers and are independent of that interruption and of peer/download luck. Throughput (blk/s) is + peer-dependent and is reported only as a secondary signal. +- **Phase columns (ms/block):** `prep` = UTXO/address reads before the batch; `tree` = note-commitment + tree update; `batch` = write-batch build; `rocks` = RocksDB commit; `wbt` = total DB-write + (prep+batch+rocks+tip). `tree` runs concurrently with the write, so it is reported separately. + +## Timing + +| segment | blocks | wall time | avg blk/s | +| --- | --- | --- | --- | +| genesis → 1.79M (checkpoint) | 1.79M | 3.37 h | ~148 | +| 1.79M → tip (incl. resume stalls + semantic tail) | ~1.59M | 4.20 h | 105 | +| of which: semantic tail (> max checkpoint 3.358M) | ~24.6K | 0.64 h | **11** | + +The semantic tail (above the last checkpoint) is full validation — proofs and signatures — at +~11 blk/s, CPU ~1.6/8. Every optimization in this work targets the checkpoint region below 3.358M; +the tail is a different, fundamentally slower regime. + +## Per-100K breakdown (genesis → 3.2M) + +| range | blk/s | cpu/8 | prep | tree | batch | rocks | wbt | tx/blk | dominant | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| 100k | 90 | 2.8 | 2.64 | 0.04 | 3.32 | 4.40 | 10.38 | 8.7 | rocksdb | +| 200k | 66 | 3.1 | 3.75 | 0.05 | 4.50 | 6.04 | 14.30 | 14.3 | rocksdb | +| 300k | 72 | 3.3 | 3.52 | 0.04 | 4.89 | 4.65 | 13.07 | 11.1 | batch_prep | +| 400k | 239 | 3.4 | 1.05 | 0.64 | 0.71 | 1.28 | 3.05 | 5.9 | rocksdb | +| 500k | 210 | 3.3 | 1.15 | 1.08 | 0.77 | 1.19 | 3.11 | 8.0 | rocksdb | +| 600k | 254 | 3.6 | 0.93 | 0.95 | 0.63 | 0.96 | 2.52 | 5.1 | rocksdb | +| 700k | 358 | 3.3 | 0.52 | 0.94 | 0.29 | 0.54 | 1.35 | 4.1 | tree | +| 800k | 279 | 3.2 | 0.75 | 1.31 | 0.35 | 0.66 | 1.77 | 4.9 | tree | +| 900k | 259 | 3.1 | 0.71 | 1.21 | 0.39 | 0.83 | 1.94 | 5.2 | tree | +| 1000k | 243 | 3.1 | 0.80 | 1.30 | 0.43 | 0.91 | 2.15 | 4.7 | tree | +| 1100k | 203 | 3.3 | 1.13 | 1.34 | 0.61 | 1.19 | 2.94 | 5.6 | tree | +| 1200k | 216 | 3.1 | 0.99 | 1.07 | 0.56 | 1.22 | 2.78 | 5.1 | rocksdb | +| 1300k | 214 | 3.3 | 1.18 | 1.05 | 0.51 | 1.17 | 2.87 | 4.3 | prep_reads | +| 1400k | 209 | 3.0 | 0.81 | 2.02 | 0.41 | 0.85 | 2.09 | 5.7 | tree | +| 1500k | 229 | 3.2 | 0.84 | 1.41 | 0.56 | 0.95 | 2.36 | 4.1 | tree | +| 1600k | 257 | 3.2 | 0.67 | 1.33 | 0.40 | 0.74 | 1.81 | 5.2 | tree | +| **1800k** | **38** | 5.3 | 2.18 | **17.60** | 1.59 | 3.96 | 7.74 | 4.2 | **tree** | +| **1900k** | **55** | 5.2 | 1.08 | **12.67** | 0.87 | 2.55 | 4.51 | 4.3 | **tree** | +| **2000k** | **64** | 4.2 | 0.97 | **11.34** | 0.67 | 1.87 | 3.52 | 5.4 | **tree** | +| 2100k | 100 | 2.9 | 0.63 | 7.43 | 0.38 | 0.84 | 1.86 | 4.0 | tree | +| 2200k | 158 | 2.6 | 0.78 | 3.66 | 0.41 | 0.87 | 2.07 | 3.5 | tree | +| 2300k | 360 | 3.0 | 0.35 | 1.35 | 0.18 | 0.32 | 0.86 | 2.7 | tree | +| 2400k | 282 | 2.8 | 0.46 | 1.73 | 0.22 | 0.40 | 1.09 | 2.9 | tree | +| 2500k | 143 | 2.6 | 0.70 | 4.49 | 0.36 | 0.79 | 1.86 | 3.8 | tree | +| 2600k | 149 | 2.7 | 0.73 | 3.94 | 0.36 | 0.85 | 1.95 | 3.2 | tree | +| 2700k | 305 | 2.8 | 0.32 | 1.94 | 0.16 | 0.26 | 0.74 | 2.1 | tree | +| 2800k | 351 | 2.9 | 0.21 | 1.73 | 0.12 | 0.21 | 0.55 | 2.0 | tree | +| 2900k | 301 | 2.6 | 0.22 | 2.18 | 0.12 | 0.20 | 0.56 | 2.0 | tree | +| 3000k | 217 | 2.5 | 0.46 | 2.99 | 0.18 | 0.28 | 0.93 | 3.0 | tree | +| 3100k | 133 | 2.5 | 0.78 | 5.00 | 0.54 | 0.57 | 1.90 | 8.9 | tree | +| 3200k | 169 | 2.5 | 0.44 | 4.29 | 0.23 | 0.41 | 1.10 | 5.3 | tree | + +(At 5K granularity the sandblast peak is sharper still: tree update hits ~39 ms/block around 1.875M.) + +## The four regimes + +1. **Transparent band (~100–330K):** the slowest pre-sandblast stretch, 66–90 blk/s, `wbt` 10–14 ms. + Many transparent inputs/outputs per block. Dominated by **rocksdb commit + batch_prep**; `prep_reads` + here is already cut to 2.6–3.7 ms by #140 (was ~25 ms / 58% of wall before it). +2. **Post-Sapling low-tx (~400–650K):** fast, 210–254 blk/s, everything small; rocksdb the largest slice. +3. **Shielded era (~700K–1.6M and ~2.3M onward):** 130–360 blk/s; **note-commitment tree update** is the + dominant phase as Sapling/Orchard notes accumulate (~1–5 ms). +4. **Sandblast region (~1.7M–2.2M):** the slowest part of the whole chain, 38–158 blk/s. The spam created + huge numbers of shielded outputs, so the **note-commitment tree update explodes to 11–18 ms/block** + (peaking ~39 ms at 5K granularity). CPU rises to ~5/8 here as the parallel tree append engages — yet it + is still the bottleneck because the note volume overwhelms it. + +A constant across every range: **CPU sits at ~2.5–3.5/8** (rising to ~5/8 only in sandblast). The committer +is a single serial critical path, leaving ~3–5 cores idle nearly everywhere — which is why moving work off +that thread (rather than parallelizing within it) is the recurring lever. + +## Major bottlenecks (ranked) + +1. **Note-commitment tree update — the #1 cost.** Dominant for the entire shielded half of the chain + (~700K → tip) and catastrophic in sandblast (11–18 ms/block, ~39 ms peak). Already internally + parallelized; the lever is to move its per-leaf Pedersen/Sinsemilla hashing off the serial committer. + *(Optimization implemented — see below.)* +2. **RocksDB commit — the transparent-band ceiling.** 4.4–6 ms/block at 100–300K and the largest slice in + the low-tx span; grows with DB size. Evidence (live `rocksdb.LOG`): zero write stalls and the WAL is + async, so the cost is memtable insertion, not I/O. PR #90's WAL-skip targets a near-absent cost here; + the real levers are multi-block batch commits and/or pipelining the commit. *(Indexed for later.)* +3. **Serial committer / idle cores — structural.** CPU ~3/8 everywhere; one thread gates throughput while + most cores idle. Underlies both #1 and #2. +4. **prep_reads — transparent-input UTXO/address reads.** Was 58% of wall (~25 ms) at 340K; now 2.6–3.7 ms + after #140 (parallel + de-duplicated reads). Largely resolved. +5. **Semantic verification tail (> max checkpoint 3.358M).** ~11 blk/s, full proof/signature validation. + Out of scope for checkpoint-sync optimization; inherently slow. + +## Improvements validated and shipped this work + +- **#138** — par_iter size gate (don't fork-join tiny blocks): batch_prep −8 to −13%. +- **#140** — parallelize + de-duplicate the committer's UTXO/address reads: **prep_reads −55 to −68%**, + write_block_total −25 to −37% across the transparent band; this flattened regime 1's `prep_reads`. +- **Note-commitment tree precompute (implemented, A/B pending)** — splits the tree append into an + off-committer `precompute_subtree_roots` (the heavy hashing, keyed only on note count) and a cheap + on-committer `graft`, driven by a 1-block look-ahead so the hashing overlaps the previous commit on idle + cores. Byte-identical to the sequential append (differential proptests), with a size-match fallback so it + can only affect speed, never correctness. Targets bottleneck #1. + +## Remaining levers + +- **Tree precompute** (above) — pending throughput A/B in the sandblast (1.8–2.2M) and shielded ranges. +- **Multi-block RocksDB commit batching** — bottleneck #2, the transparent-band and low-tx ceiling. +- **Commit pipelining** — overlap block N's commit with block N+1's prep/reads on the idle cores. diff --git a/HANDOFF.md b/HANDOFF.md new file mode 100644 index 00000000000..8b3218f02e0 --- /dev/null +++ b/HANDOFF.md @@ -0,0 +1,143 @@ +# Handoff — Zcash checkpoint-sync throughput optimization + +Context for the next agent. The mission: maximize Zcash mainnet checkpoint-sync throughput +(blocks/sec), focused on the heavy "sandblast" region (~1.7M–2.2M). Fork: `valargroup/zebra`. + +> The previous session's `HANDOFF.md` is preserved in git commit `0ecb27f14976` (branch +> `proto-lazy-sapling-points`) if you need it. This file supersedes it. + +## TL;DR — the one thing to know + +The throughput bottleneck in the sandblast region is the **single-threaded finalized committer**, +proven by direct instrumentation (89% busy, 937-block input backlog), and within it the +**note-commitment tree update is ~69% of per-block cost**. The fix (move tree hashing off the +committer) is built and PR'd. The verifier/"feed" is NOT the bottleneck (~0.5 ms/block). Download +bandwidth (~60 blk/s) is the next gate once the committer is sped up. Full analysis + ranked +improvements in `COMMIT_OPTIMIZE.md`. Methodology lesson: per-phase profiling told us *where time +goes within a stage*; it took utilization + queue-depth instrumentation (or a controlled A/B in the +right regime) to identify the *binding* stage — we initially mis-called it twice. + +## Branches & PRs + +- **`sync-perf-main-2`** (origin) — the integration branch with all merged perf PRs (#122 dedicated + commit pool, #128 parallel writer, #131 native ZIP-244, #133 drop reparse, #136 lazy Sapling + cv/epk, #138 par_iter gate, #140 read parallelization, #148 prepare digest fanout). **Base all new + work here.** This is the branch the local working tree is on now. +- **PR #144** (`proto-note-tree-precompute` → `sync-perf-main-2`, draft) — the note-tree precompute + prototype. Rebased onto the latest `sync-perf-main-2` tip (6ca5a4cf9), MERGEABLE, proptests green. +- Earlier shipped this effort: **#138** (par_iter size gate), **#140** (committer UTXO/address read + parallelization). Both validated with A/B and merged into `sync-perf-main-2`. +- `proto-lazy-sapling-points` — old local working branch; holds the original (pre-port) prototype + + the restored docs in commit `0ecb27f14976`. Not the base for new work. + +### Uncommitted right now (on local `sync-perf-main-2`) +Feed + committer **instrumentation** (not yet committed): `zebra-consensus/src/checkpoint.rs`, +`zebra-state/src/request.rs`, `zebra-state/src/service/write.rs`. These add the metrics below. Keep +them for benchmarking; do not merge as-is (timers are unconditional `metrics::histogram!`). + +## How to build + +```bash +export CARGO_TARGET_DIR=/root/cargo-target-readpar # /mnt fills up; build target lives on /root +cargo build --release -p zebrad --features commit-metrics --locked +cp $CARGO_TARGET_DIR/release/zebrad /root/wal-bench/zebrad-