diff --git a/.stats.json b/.stats.json new file mode 100644 index 0000000..81c65d5 --- /dev/null +++ b/.stats.json @@ -0,0 +1,5 @@ +{ + "lean_modules": 61, + "lean_lines": 32558, + "_note": "Generated by scripts/repo-stats.sh. Do not edit by hand." +} diff --git a/FormalSLT.lean b/FormalSLT.lean index 6e71de2..36bb137 100644 --- a/FormalSLT.lean +++ b/FormalSLT.lean @@ -67,3 +67,4 @@ import FormalSLT.PACBayesFiniteProductMGF import FormalSLT.PACBayesBoundedLoss import FormalSLT.PACBayesMcAllester import FormalSLT.PACBayesBernstein +import FormalSLT.PACBayesMargin diff --git a/FormalSLT/PACBayesBernstein.lean b/FormalSLT/PACBayesBernstein.lean index 2f9ffec..d074f28 100644 --- a/FormalSLT/PACBayesBernstein.lean +++ b/FormalSLT/PACBayesBernstein.lean @@ -4,6 +4,7 @@ Released under MIT license as described in the file LICENSE. Authors: Robby Sneiderman -/ import FormalSLT.PACBayesKL +import FormalSLT.Probability.FiniteUnionBound import Mathlib.Algebra.BigOperators.Field import Mathlib.Algebra.Order.BigOperators.Group.Finset import Mathlib.Analysis.SpecialFunctions.Sqrt @@ -37,6 +38,8 @@ classifier margin loss. Markov/confidence layer for a fixed Bernstein parameter. * `finitePACBayesBernsteinMargin_badEventMass_le_delta` — posterior-dependent margin-style wrapper under explicit complexity and penalty certificates. +* `finitePACBayesBernsteinGridOptimized_badEventMass_le_delta` — finite-grid + optimization wrapper over supplied Bernstein parameter buckets. ## Current boundaries @@ -94,6 +97,114 @@ def expectedPriorBernsteinExpMoment [Fintype Ω] [Fintype ι] ∑ ω, ν ω * priorBernsteinExpMoment π lambda scale riskFn empiricalRiskFn varianceProxy ω +/-- +Expected normalized PAC-Bayes Bernstein prior moment from per-hypothesis MGF +budgets. + +This is the algebraic bridge used before plugging in a concrete iid +concentration lemma: each hypothesis supplies the Bernstein exponential budget, +then the prior-weighted normalized moment has expectation at most one. +-/ +theorem expectedPriorBernsteinExpMoment_le_one_of_mgf_bound + [Fintype Ω] [Fintype ι] + {π : ι → ℝ} (hπ : IsPMF π) + (ν : Ω → ℝ) (lambda scale : ℝ) + (riskFn : ι → ℝ) (empiricalRiskFn : Ω → ι → ℝ) + (varianceProxy : ι → ℝ) + (hmgf : + ∀ i : ι, + (∑ ω : Ω, ν ω * + Real.exp (lambda * (riskFn i - empiricalRiskFn ω i))) ≤ + Real.exp + (lambda ^ 2 * varianceProxy i / + (2 * (1 - scale * lambda)))) : + expectedPriorBernsteinExpMoment ν π lambda scale riskFn empiricalRiskFn + varianceProxy ≤ 1 := by + classical + let budget : ι → ℝ := + fun i => + lambda ^ 2 * varianceProxy i / + (2 * (1 - scale * lambda)) + have hexp_split : ∀ a b : ℝ, Real.exp (a - b) = Real.exp a * Real.exp (-b) := by + intro a b + rw [← Real.exp_add] + ring_nf + have hswap : + expectedPriorBernsteinExpMoment ν π lambda scale riskFn empiricalRiskFn + varianceProxy = + ∑ i : ι, + π i * Real.exp (-(budget i)) * + (∑ ω : Ω, ν ω * + Real.exp (lambda * (riskFn i - empiricalRiskFn ω i))) := by + unfold expectedPriorBernsteinExpMoment priorBernsteinExpMoment + calc + (∑ ω : Ω, + ν ω * + ∑ i : ι, + π i * + Real.exp + (lambda * (riskFn i - empiricalRiskFn ω i) - + lambda ^ 2 * varianceProxy i / + (2 * (1 - scale * lambda)))) + = + ∑ ω : Ω, ∑ i : ι, + ν ω * + (π i * + Real.exp + (lambda * (riskFn i - empiricalRiskFn ω i) - + budget i)) := by + apply Finset.sum_congr rfl + intro ω _hω + rw [Finset.mul_sum] + _ = + ∑ i : ι, ∑ ω : Ω, + ν ω * + (π i * + Real.exp + (lambda * (riskFn i - empiricalRiskFn ω i) - + budget i)) := by + rw [Finset.sum_comm] + _ = + ∑ i : ι, + π i * Real.exp (-(budget i)) * + (∑ ω : Ω, ν ω * + Real.exp (lambda * (riskFn i - empiricalRiskFn ω i))) := by + apply Finset.sum_congr rfl + intro i _hi + rw [Finset.mul_sum] + apply Finset.sum_congr rfl + intro ω _hω + rw [hexp_split] + ring + rw [hswap] + calc + (∑ i : ι, + π i * Real.exp (-(budget i)) * + (∑ ω : Ω, ν ω * + Real.exp (lambda * (riskFn i - empiricalRiskFn ω i)))) + ≤ + ∑ i : ι, + π i * Real.exp (-(budget i)) * + Real.exp (budget i) := by + apply Finset.sum_le_sum + intro i _hi + exact mul_le_mul_of_nonneg_left + (hmgf i) + (mul_nonneg (hπ.nonneg i) (le_of_lt (Real.exp_pos _))) + _ = ∑ i : ι, π i := by + apply Finset.sum_congr rfl + intro i _hi + calc + π i * Real.exp (-(budget i)) * Real.exp (budget i) + = π i * (Real.exp (-(budget i)) * Real.exp (budget i)) := by ring + _ = π i * 1 := by + congr 1 + rw [← Real.exp_add] + ring_nf + simp + _ = π i := by ring + _ = 1 := hπ.sum_one + /-- Finite sample mass of outcomes whose Bernstein prior moment exceeds a threshold. -/ def priorBernsteinExpMomentTailMass [Fintype Ω] [Fintype ι] (ν : Ω → ℝ) (π : ι → ℝ) @@ -556,6 +667,238 @@ theorem finitePACBayesBernsteinMargin_badEventMass_le_delta scale * complexityOf ρ) hcomplexity hpenalty hExpected +/-! ### Finite-grid peeling over Bernstein parameters -/ + +/-- +Samples where some posterior violates at least one finite Bernstein parameter +bucket. + +Each grid index supplies a parameter `lambdaOf g`, a scale `scaleOf g`, and a +confidence allocation `confidenceOf g`. This is a finite-grid statement, not an +optimization over all real parameters. +-/ +def finitePACBayesBernsteinGridBadSamples + [Fintype Ω] [Fintype ι] [Fintype γ] + (π : ι → ℝ) (lambdaOf scaleOf confidenceOf : γ → ℝ) + (riskFn : ι → ℝ) (empiricalRiskFn : Ω → ι → ℝ) + (varianceProxy : ι → ℝ) : Finset Ω := + Finset.univ.filter fun ω => + ∃ g : γ, ∃ ρ : ι → ℝ, + IsPMF ρ ∧ + posteriorGeneralizationGap ρ riskFn (empiricalRiskFn ω) > + (klDiv ρ π + Real.log (1 / confidenceOf g)) / lambdaOf g + + lambdaOf g * posteriorMarginVarianceProxy ρ varianceProxy / + (2 * (1 - scaleOf g * lambdaOf g)) + +/-- +Finite-grid Bernstein bad-event bound. + +For each finite grid bucket `g`, the fixed-`lambda` Bernstein theorem is applied +with its own confidence allocation. A finite weighted union bound controls the +mass of samples on which any bucket fails. +-/ +theorem finitePACBayesBernsteinGrid_badEventMass_le_sum_delta + [Fintype Ω] [DecidableEq Ω] [Fintype ι] [Nonempty ι] [Fintype γ] + {ν : Ω → ℝ} (hν : IsPMF ν) + {π : ι → ℝ} (hπ : IsFullSupportPMF π) + (lambdaOf scaleOf confidenceOf : γ → ℝ) + (hlambda : ∀ g : γ, 0 < lambdaOf g) + (hscale : ∀ g : γ, scaleOf g * lambdaOf g < 1) + (hconfidenceOf : ∀ g : γ, 0 < confidenceOf g) + (riskFn : ι → ℝ) (empiricalRiskFn : Ω → ι → ℝ) + (varianceProxy : ι → ℝ) + (hExpected : + ∀ g : γ, + expectedPriorBernsteinExpMoment ν π (lambdaOf g) (scaleOf g) + riskFn empiricalRiskFn varianceProxy ≤ 1) : + (∑ ω ∈ + finitePACBayesBernsteinGridBadSamples π lambdaOf scaleOf confidenceOf + riskFn empiricalRiskFn varianceProxy, + ν ω) ≤ + ∑ g : γ, confidenceOf g := by + classical + let bucketEvent : γ → Finset Ω := fun g => + finitePACBayesBernsteinFixedLambdaBadSamples π (lambdaOf g) (scaleOf g) + (confidenceOf g) riskFn empiricalRiskFn varianceProxy + let gridUnionEvent : Finset Ω := + Finset.univ.filter fun ω : Ω => ∃ g : γ, ω ∈ bucketEvent g + have hsubset : + finitePACBayesBernsteinGridBadSamples π lambdaOf scaleOf confidenceOf + riskFn empiricalRiskFn varianceProxy + ⊆ gridUnionEvent := by + intro ω hω + rw [finitePACBayesBernsteinGridBadSamples, Finset.mem_filter] at hω + rcases hω.2 with ⟨g, ρ, hρ, hbad⟩ + change ω ∈ (Finset.univ.filter fun ω : Ω => ∃ g : γ, ω ∈ bucketEvent g) + rw [Finset.mem_filter] + refine ⟨Finset.mem_univ ω, g, ?_⟩ + change ω ∈ finitePACBayesBernsteinFixedLambdaBadSamples π (lambdaOf g) (scaleOf g) + (confidenceOf g) riskFn empiricalRiskFn varianceProxy + rw [finitePACBayesBernsteinFixedLambdaBadSamples, Finset.mem_filter] + exact ⟨Finset.mem_univ ω, ρ, hρ, hbad⟩ + have hmass_subset : + (∑ ω ∈ + finitePACBayesBernsteinGridBadSamples π lambdaOf scaleOf confidenceOf + riskFn empiricalRiskFn varianceProxy, + ν ω) + ≤ ∑ ω ∈ gridUnionEvent, ν ω := by + exact Finset.sum_le_sum_of_subset_of_nonneg hsubset (by + intro ω _hω _hnot + exact hν.nonneg ω) + have hunion := + FormalSLT.Probability.FiniteUnionBound.finiteProbabilityUnionBound_proof + (support := (Finset.univ : Finset Ω)) + (w := ν) + (events := bucketEvent) + (s := (Finset.univ : Finset γ)) + (fun ω : Ω => hν.nonneg ω) + have hunion_mass : + (∑ ω ∈ gridUnionEvent, ν ω) ≤ + ∑ g : γ, ∑ ω ∈ bucketEvent g, ν ω := by + change + (∑ ω ∈ (Finset.univ.filter fun ω : Ω => ∃ g : γ, ω ∈ bucketEvent g), ν ω) ≤ + ∑ g : γ, ∑ ω ∈ bucketEvent g, ν ω + unfold FormalSLT.Probability.FiniteUnionBound.finiteUnionEventMass at hunion + unfold FormalSLT.Probability.FiniteUnionBound.finiteEventMassSum at hunion + unfold FormalSLT.Probability.FiniteUnionBound.finiteEventMass at hunion + simp_rw [← Finset.sum_filter] at hunion + simpa using hunion + have hbucket : + ∀ g : γ, (∑ ω ∈ bucketEvent g, ν ω) ≤ confidenceOf g := by + intro g + simpa [bucketEvent] using + finitePACBayesBernstein_fixedLambda_badEventMass_le_delta + (ν := ν) hν hπ (lambdaOf g) (scaleOf g) (confidenceOf g) + (hlambda g) (hscale g) (hconfidenceOf g) riskFn empiricalRiskFn + varianceProxy (hExpected g) + have hbucket_sum : + (∑ g : γ, ∑ ω ∈ bucketEvent g, ν ω) ≤ ∑ g : γ, confidenceOf g := by + exact Finset.sum_le_sum (fun g _hg => hbucket g) + exact hmass_subset.trans (hunion_mass.trans hbucket_sum) + +/-- Finite-grid Bernstein bad-event bound with an explicit confidence budget. -/ +theorem finitePACBayesBernsteinGrid_badEventMass_le_delta + [Fintype Ω] [DecidableEq Ω] [Fintype ι] [Nonempty ι] [Fintype γ] + {ν : Ω → ℝ} (hν : IsPMF ν) + {π : ι → ℝ} (hπ : IsFullSupportPMF π) + (lambdaOf scaleOf confidenceOf : γ → ℝ) + (hlambda : ∀ g : γ, 0 < lambdaOf g) + (hscale : ∀ g : γ, scaleOf g * lambdaOf g < 1) + (hconfidenceOf : ∀ g : γ, 0 < confidenceOf g) + (riskFn : ι → ℝ) (empiricalRiskFn : Ω → ι → ℝ) + (varianceProxy : ι → ℝ) + (hExpected : + ∀ g : γ, + expectedPriorBernsteinExpMoment ν π (lambdaOf g) (scaleOf g) + riskFn empiricalRiskFn varianceProxy ≤ 1) + {delta : ℝ} (hgridConfidence : (∑ g : γ, confidenceOf g) ≤ delta) : + (∑ ω ∈ + finitePACBayesBernsteinGridBadSamples π lambdaOf scaleOf confidenceOf + riskFn empiricalRiskFn varianceProxy, + ν ω) ≤ delta := + (finitePACBayesBernsteinGrid_badEventMass_le_sum_delta + (Ω := Ω) (ι := ι) (γ := γ) hν hπ lambdaOf scaleOf confidenceOf + hlambda hscale hconfidenceOf riskFn empiricalRiskFn varianceProxy hExpected).trans + hgridConfidence + +/-- +Finite-grid optimized Bernstein bad-event bound. + +The user-supplied `posteriorPenalty` may depend on the posterior. The theorem +requires a finite grid certificate: each posterior is assigned to a bucket whose +fixed-parameter Bernstein penalty is no larger than the supplied penalty. +-/ +theorem finitePACBayesBernsteinGridOptimized_badEventMass_le_sum_delta + [Fintype Ω] [DecidableEq Ω] [Fintype ι] [Nonempty ι] [Fintype γ] + {ν : Ω → ℝ} (hν : IsPMF ν) + {π : ι → ℝ} (hπ : IsFullSupportPMF π) + (lambdaOf scaleOf confidenceOf : γ → ℝ) + (posteriorPenalty : (ι → ℝ) → ℝ) + (hlambda : ∀ g : γ, 0 < lambdaOf g) + (hscale : ∀ g : γ, scaleOf g * lambdaOf g < 1) + (hconfidenceOf : ∀ g : γ, 0 < confidenceOf g) + (riskFn : ι → ℝ) (empiricalRiskFn : Ω → ι → ℝ) + (varianceProxy : ι → ℝ) + (hgridCovers : + ∀ ρ : ι → ℝ, IsPMF ρ → + ∃ g : γ, + (klDiv ρ π + Real.log (1 / confidenceOf g)) / lambdaOf g + + lambdaOf g * posteriorMarginVarianceProxy ρ varianceProxy / + (2 * (1 - scaleOf g * lambdaOf g)) + ≤ posteriorPenalty ρ) + (hExpected : + ∀ g : γ, + expectedPriorBernsteinExpMoment ν π (lambdaOf g) (scaleOf g) + riskFn empiricalRiskFn varianceProxy ≤ 1) : + (∑ ω ∈ + finitePACBayesBernsteinPenaltyBadSamples riskFn empiricalRiskFn posteriorPenalty, + ν ω) ≤ + ∑ g : γ, confidenceOf g := by + classical + have hsubset : + finitePACBayesBernsteinPenaltyBadSamples riskFn empiricalRiskFn posteriorPenalty + ⊆ + finitePACBayesBernsteinGridBadSamples π lambdaOf scaleOf confidenceOf + riskFn empiricalRiskFn varianceProxy := by + intro ω hω + rw [finitePACBayesBernsteinPenaltyBadSamples, Finset.mem_filter] at hω + rcases hω.2 with ⟨ρ, hρ, hbad⟩ + rcases hgridCovers ρ hρ with ⟨g, hpenalty⟩ + rw [finitePACBayesBernsteinGridBadSamples, Finset.mem_filter] + refine ⟨Finset.mem_univ ω, g, ρ, hρ, ?_⟩ + linarith + have hmass_subset : + (∑ ω ∈ + finitePACBayesBernsteinPenaltyBadSamples riskFn empiricalRiskFn posteriorPenalty, + ν ω) + ≤ + ∑ ω ∈ + finitePACBayesBernsteinGridBadSamples π lambdaOf scaleOf confidenceOf + riskFn empiricalRiskFn varianceProxy, + ν ω := by + exact Finset.sum_le_sum_of_subset_of_nonneg hsubset (by + intro ω _hω _hnot + exact hν.nonneg ω) + exact hmass_subset.trans + (finitePACBayesBernsteinGrid_badEventMass_le_sum_delta + (Ω := Ω) (ι := ι) (γ := γ) hν hπ lambdaOf scaleOf confidenceOf + hlambda hscale hconfidenceOf riskFn empiricalRiskFn varianceProxy hExpected) + +/-- +Finite-grid optimized Bernstein bad-event bound with an explicit total +confidence budget. +-/ +theorem finitePACBayesBernsteinGridOptimized_badEventMass_le_delta + [Fintype Ω] [DecidableEq Ω] [Fintype ι] [Nonempty ι] [Fintype γ] + {ν : Ω → ℝ} (hν : IsPMF ν) + {π : ι → ℝ} (hπ : IsFullSupportPMF π) + (lambdaOf scaleOf confidenceOf : γ → ℝ) + (posteriorPenalty : (ι → ℝ) → ℝ) + (hlambda : ∀ g : γ, 0 < lambdaOf g) + (hscale : ∀ g : γ, scaleOf g * lambdaOf g < 1) + (hconfidenceOf : ∀ g : γ, 0 < confidenceOf g) + (riskFn : ι → ℝ) (empiricalRiskFn : Ω → ι → ℝ) + (varianceProxy : ι → ℝ) + (hgridCovers : + ∀ ρ : ι → ℝ, IsPMF ρ → + ∃ g : γ, + (klDiv ρ π + Real.log (1 / confidenceOf g)) / lambdaOf g + + lambdaOf g * posteriorMarginVarianceProxy ρ varianceProxy / + (2 * (1 - scaleOf g * lambdaOf g)) + ≤ posteriorPenalty ρ) + (hExpected : + ∀ g : γ, + expectedPriorBernsteinExpMoment ν π (lambdaOf g) (scaleOf g) + riskFn empiricalRiskFn varianceProxy ≤ 1) + {delta : ℝ} (hgridConfidence : (∑ g : γ, confidenceOf g) ≤ delta) : + (∑ ω ∈ + finitePACBayesBernsteinPenaltyBadSamples riskFn empiricalRiskFn posteriorPenalty, + ν ω) ≤ delta := + (finitePACBayesBernsteinGridOptimized_badEventMass_le_sum_delta + (Ω := Ω) (ι := ι) (γ := γ) hν hπ lambdaOf scaleOf confidenceOf + posteriorPenalty hlambda hscale hconfidenceOf riskFn empiricalRiskFn + varianceProxy hgridCovers hExpected).trans hgridConfidence + end end FormalSLT.PACBayesBernstein diff --git a/FormalSLT/PACBayesFiniteProductMGF.lean b/FormalSLT/PACBayesFiniteProductMGF.lean index 9dfb491..6947403 100644 --- a/FormalSLT/PACBayesFiniteProductMGF.lean +++ b/FormalSLT/PACBayesFiniteProductMGF.lean @@ -163,6 +163,43 @@ theorem finiteProduct_mgf_empiricalRiskDeviation_le_of_single field_simp [hn_ne] exact h_pow.trans_eq h_exp_pow +/-- +Finite product MGF bound from an arbitrary one-coordinate exponential budget. + +This is the same product-factorization step as +`finiteProduct_mgf_empiricalRiskDeviation_le_of_single`, but keeps the +one-coordinate budget explicit. It is useful for Bernstein/sub-Gamma bounds, +whose denominator depends on the one-coordinate scale. +-/ +theorem finiteProduct_mgf_empiricalRiskDeviation_le_exp_of_single + {n : ℕ} [Fintype Z] (hn : 0 < n) + (p : Z → ℝ) (hp : IsPMF p) + (ℓ : ι → Z → ℝ) (i : ι) (lam singleBudget : ℝ) + (hsingle : + oneCoordinateDeviationMGF (n := n) p ℓ i lam ≤ + Real.exp singleBudget) : + (∑ S : Fin n → Z, + finiteProductSampleWeight p S * + Real.exp (lam * (finitePopulationRisk p ℓ i - finiteEmpiricalRisk ℓ i S))) ≤ + Real.exp ((n : ℝ) * singleBudget) := by + classical + have h_eq := + finiteProduct_mgf_empiricalRiskDeviation_eq_pow + (ι := ι) (Z := Z) hn p ℓ i lam + rw [h_eq] + have hsingle_nonneg : 0 ≤ oneCoordinateDeviationMGF (n := n) p ℓ i lam := by + unfold oneCoordinateDeviationMGF + exact Finset.sum_nonneg + (fun z _hz => mul_nonneg (hp.nonneg z) (le_of_lt (Real.exp_pos _))) + have h_pow : + (oneCoordinateDeviationMGF (n := n) p ℓ i lam) ^ n ≤ + (Real.exp singleBudget) ^ n := + pow_le_pow_left₀ hsingle_nonneg hsingle n + have h_exp_pow : + (Real.exp singleBudget) ^ n = Real.exp ((n : ℝ) * singleBudget) := by + rw [← Real.exp_nat_mul] + exact h_pow.trans_eq h_exp_pow + /-- Prior-averaged finite product MGF bound. diff --git a/FormalSLT/PACBayesMargin.lean b/FormalSLT/PACBayesMargin.lean new file mode 100644 index 0000000..27ab9af --- /dev/null +++ b/FormalSLT/PACBayesMargin.lean @@ -0,0 +1,642 @@ +/- +Copyright (c) 2026 Robby Sneiderman. All rights reserved. +Released under MIT license as described in the file LICENSE. +Authors: Robby Sneiderman +-/ +import FormalSLT.PACBayesBernstein +import FormalSLT.PACBayesFiniteProductMGF +import FormalSLT.Probability.BernsteinMGF + +/-! +# Finite PAC-Bayes classifier-margin adapter + +This module specializes the finite PAC-Bayes Bernstein shell to classifier +margin losses on a finite data domain. + +The supplied-certificate adapter remains available for arbitrary normalized +Bernstein prior-moment certificates. For iid finite classifier-margin losses, +this file also derives that certificate internally from the finite product law +and the Bennett/sub-Gamma MGF layer. + +## Main declarations + +* `classifierMarginLoss` — thresholded margin loss in `{0,1}`. +* `classifierMarginPopulationRisk` and `classifierMarginEmpiricalRisk` — + concrete finite risk functions for margin loss. +* `classifierMarginVarianceProxy` — the margin-risk proxy used by the + Bernstein shell. +* `expectedPriorBernsteinExpMoment_classifierMargin_iid_le_one` — iid finite + classifier-margin losses discharge the normalized Bernstein prior-moment + certificate. +* `finitePACBayesClassifierMarginBernstein_iid_badEventMass_le_delta` — the + end-to-end finite iid PAC-Bayes Bernstein theorem for classifier-margin + losses and a fixed admissible `lambda`. +* `finitePACBayesClassifierMarginBernstein_iid_gridOptimized_badEventMass_le_delta` — + the finite-grid version, with a caller-supplied grid certificate over + admissible Bernstein parameters. +-/ + +namespace FormalSLT.PACBayesMargin + +open Finset Real BigOperators +open FormalSLT.PACBayesKL +open FormalSLT.PACBayesFiniteProductMGF +open FormalSLT.PACBayesBernstein +open FormalSLT.Probability.BernsteinMGF + +noncomputable section + +variable {ι Z : Type*} + +local instance (p : Prop) : Decidable p := Classical.propDecidable p + +/-! ### Classifier margin losses -/ + +/-- Encode a Boolean label as `+1` for `true` and `-1` for `false`. -/ +def signedBool (y : Bool) : ℝ := + if y then 1 else -1 + +/-- Signed margin of hypothesis `i` on example `z`. -/ +def classifierMargin + (score : ι → Z → ℝ) (label : Z → Bool) (i : ι) (z : Z) : ℝ := + signedBool (label z) * score i z + +/-- +Thresholded classifier margin loss. + +The loss is `1` when the signed margin is at most `threshold`, and `0` +otherwise. For `threshold = 0`, this is the usual error-or-no-positive-margin +indicator for real-valued scores with Boolean labels. +-/ +def classifierMarginLoss + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) + (i : ι) (z : Z) : ℝ := + if classifierMargin score label i z ≤ threshold then 1 else 0 + +/-- Classifier margin loss is always in `[0,1]`. -/ +lemma classifierMarginLoss_mem_Icc_zero_one + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) + (i : ι) (z : Z) : + classifierMarginLoss threshold score label i z ∈ Set.Icc (0 : ℝ) 1 := by + unfold classifierMarginLoss + by_cases h : classifierMargin score label i z ≤ threshold + · simp [h] + · simp [h] + +/-- Finite population risk of a thresholded classifier margin loss. -/ +def classifierMarginPopulationRisk [Fintype Z] + (p : Z → ℝ) (threshold : ℝ) + (score : ι → Z → ℝ) (label : Z → Bool) (i : ι) : ℝ := + finitePopulationRisk p (classifierMarginLoss threshold score label) i + +/-- Finite empirical risk of a thresholded classifier margin loss. -/ +def classifierMarginEmpiricalRisk {n : ℕ} + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) + (i : ι) (S : Fin n → Z) : ℝ := + finiteEmpiricalRisk (classifierMarginLoss threshold score label) i S + +/-- Sample-indexed empirical-risk function for the PAC-Bayes Bernstein shell. -/ +def classifierMarginEmpiricalRiskFn {n : ℕ} + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) + (S : Fin n → Z) (i : ι) : ℝ := + classifierMarginEmpiricalRisk threshold score label i S + +/-- +Concrete margin-risk variance proxy. + +For an indicator margin loss, the second moment equals the population risk, so +this is the natural Bernstein variance proxy exposed to the PAC-Bayes layer. +The equality is kept as a definition here; deriving the normalized prior MGF +from this proxy is a separate concentration step. +-/ +def classifierMarginVarianceProxy [Fintype Z] + (p : Z → ℝ) (threshold : ℝ) + (score : ι → Z → ℝ) (label : Z → Bool) (i : ι) : ℝ := + classifierMarginPopulationRisk p threshold score label i + +/-- The margin variance proxy is definitionally the margin population risk. -/ +lemma classifierMarginVarianceProxy_eq_populationRisk [Fintype Z] + (p : Z → ℝ) (threshold : ℝ) + (score : ι → Z → ℝ) (label : Z → Bool) (i : ι) : + classifierMarginVarianceProxy p threshold score label i = + classifierMarginPopulationRisk p threshold score label i := rfl + +/-- Sample-average variance proxy for iid classifier-margin empirical risks. -/ +def classifierMarginSampleVarianceProxy {n : ℕ} [Fintype Z] + (p : Z → ℝ) (threshold : ℝ) + (score : ι → Z → ℝ) (label : Z → Bool) (i : ι) : ℝ := + classifierMarginVarianceProxy p threshold score label i / (n : ℝ) + +/-- Posterior-averaged classifier-margin variance proxy. -/ +def posteriorClassifierMarginVarianceProxy [Fintype Z] [Fintype ι] + (ρ : ι → ℝ) (p : Z → ℝ) (threshold : ℝ) + (score : ι → Z → ℝ) (label : Z → Bool) : ℝ := + posteriorMarginVarianceProxy ρ + (classifierMarginVarianceProxy p threshold score label) + +/-- Posterior-averaged iid sample classifier-margin variance proxy. -/ +def posteriorClassifierMarginSampleVarianceProxy {n : ℕ} [Fintype Z] [Fintype ι] + (ρ : ι → ℝ) (p : Z → ℝ) (threshold : ℝ) + (score : ι → Z → ℝ) (label : Z → Bool) : ℝ := + posteriorMarginVarianceProxy ρ + (classifierMarginSampleVarianceProxy (n := n) p threshold score label) + +lemma classifierMarginPopulationRisk_mem_Icc_zero_one [Fintype Z] + (p : Z → ℝ) (hp : IsPMF p) + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) (i : ι) : + classifierMarginPopulationRisk p threshold score label i ∈ Set.Icc (0 : ℝ) 1 := by + constructor + · unfold classifierMarginPopulationRisk finitePopulationRisk + exact Finset.sum_nonneg + (fun z _hz => + mul_nonneg (hp.nonneg z) + (classifierMarginLoss_mem_Icc_zero_one threshold score label i z).1) + · unfold classifierMarginPopulationRisk finitePopulationRisk + calc + (∑ z : Z, p z * classifierMarginLoss threshold score label i z) + ≤ ∑ z : Z, p z * 1 := by + apply Finset.sum_le_sum + intro z _hz + exact mul_le_mul_of_nonneg_left + (classifierMarginLoss_mem_Icc_zero_one threshold score label i z).2 + (hp.nonneg z) + _ = 1 := by simp [hp.sum_one] + +lemma classifierMarginLoss_sq + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) + (i : ι) (z : Z) : + classifierMarginLoss threshold score label i z ^ 2 = + classifierMarginLoss threshold score label i z := by + unfold classifierMarginLoss + by_cases h : classifierMargin score label i z ≤ threshold + · simp [h] + · simp [h] + +lemma classifierMarginVariance_le_risk [Fintype Z] + (p : Z → ℝ) (hp : IsPMF p) + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) (i : ι) : + (∑ z : Z, + p z * + (classifierMarginPopulationRisk p threshold score label i - + classifierMarginLoss threshold score label i z) ^ 2) + ≤ classifierMarginVarianceProxy p threshold score label i := by + classical + let R := classifierMarginPopulationRisk p threshold score label i + let loss : Z → ℝ := classifierMarginLoss threshold score label i + have hsum_loss : (∑ z : Z, p z * loss z) = R := by + simp [R, loss, classifierMarginPopulationRisk, finitePopulationRisk] + have hsum_loss_sq : (∑ z : Z, p z * loss z ^ 2) = R := by + calc + (∑ z : Z, p z * loss z ^ 2) + = ∑ z : Z, p z * loss z := by + apply Finset.sum_congr rfl + intro z _hz + rw [classifierMarginLoss_sq] + _ = R := hsum_loss + have hvariance_eq : + (∑ z : Z, p z * (R - loss z) ^ 2) = R - R ^ 2 := by + calc + (∑ z : Z, p z * (R - loss z) ^ 2) + = + ∑ z : Z, (p z * R ^ 2 - 2 * R * (p z * loss z) + + p z * loss z ^ 2) := by + apply Finset.sum_congr rfl + intro z _hz + ring + _ = + (∑ z : Z, p z * R ^ 2) - + (∑ z : Z, 2 * R * (p z * loss z)) + + ∑ z : Z, p z * loss z ^ 2 := by + rw [Finset.sum_add_distrib, Finset.sum_sub_distrib] + _ = + (∑ z : Z, p z * R ^ 2) - + 2 * R * (∑ z : Z, p z * loss z) + + ∑ z : Z, p z * loss z ^ 2 := by + congr 2 + rw [Finset.mul_sum] + _ = + R ^ 2 * (∑ z : Z, p z) - + 2 * R * (∑ z : Z, p z * loss z) + + ∑ z : Z, p z * loss z ^ 2 := by + congr 2 + calc + (∑ z : Z, p z * R ^ 2) + = ∑ z : Z, R ^ 2 * p z := by + apply Finset.sum_congr rfl + intro z _hz + ring + _ = R ^ 2 * (∑ z : Z, p z) := by + rw [Finset.mul_sum] + _ = R - R ^ 2 := by + rw [hp.sum_one, hsum_loss, hsum_loss_sq] + ring + have hR_nonneg : 0 ≤ R := + (classifierMarginPopulationRisk_mem_Icc_zero_one + p hp threshold score label i).1 + calc + (∑ z : Z, + p z * + (classifierMarginPopulationRisk p threshold score label i - + classifierMarginLoss threshold score label i z) ^ 2) + = ∑ z : Z, p z * (R - loss z) ^ 2 := by + simp [R, loss] + _ = R - R ^ 2 := hvariance_eq + _ ≤ R := by nlinarith [sq_nonneg R] + _ = classifierMarginVarianceProxy p threshold score label i := by + simp [classifierMarginVarianceProxy, R] + +theorem oneCoordinate_classifierMarginLoss_mgf_subgamma + {n : ℕ} [Fintype Z] (hn : 0 < n) + (p : Z → ℝ) (hp : IsPMF p) + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) + (i : ι) (lambda : ℝ) + (hlambda : 0 ≤ lambda) (hlambda_bound : lambda * (n : ℝ)⁻¹ < 3) : + oneCoordinateDeviationMGF (n := n) p + (classifierMarginLoss threshold score label) i lambda + ≤ + Real.exp + ((lambda * (n : ℝ)⁻¹) ^ 2 * + classifierMarginVarianceProxy p threshold score label i / + (2 * (1 - (lambda * (n : ℝ)⁻¹) / 3))) := by + classical + let R := classifierMarginPopulationRisk p threshold score label i + let loss : Z → ℝ := classifierMarginLoss threshold score label i + have hcenter : (∑ z : Z, p z * (R - loss z)) = 0 := by + have hexp : + (∑ z : Z, p z * (R - loss z)) + = R * (∑ z : Z, p z) - ∑ z : Z, p z * loss z := by + rw [Finset.mul_sum, ← Finset.sum_sub_distrib] + exact Finset.sum_congr rfl (fun z _hz => by ring) + rw [hexp, hp.sum_one] + simp [R, loss, classifierMarginPopulationRisk, finitePopulationRisk] + have hbound : ∀ z : Z, R - loss z ≤ 1 := by + intro z + have hR_le : + R ≤ 1 := + (classifierMarginPopulationRisk_mem_Icc_zero_one + p hp threshold score label i).2 + have hloss_nonneg : + 0 ≤ loss z := + (classifierMarginLoss_mem_Icc_zero_one threshold score label i z).1 + linarith + have hvar : + (∑ z : Z, p z * (R - loss z) ^ 2) + ≤ classifierMarginVarianceProxy p threshold score label i := by + simpa [R, loss] using + (classifierMarginVariance_le_risk p hp threshold score label i) + have h := + bennett_mgf_subgamma p (fun z : Z => R - loss z) + (b := 1) + (v := classifierMarginVarianceProxy p threshold score label i) + (lam := lambda * (n : ℝ)⁻¹) + (by norm_num) (by positivity) (by simpa using hlambda_bound) + hp.nonneg hp.sum_one hcenter hbound hvar + simpa [oneCoordinateDeviationMGF, R, loss, classifierMarginPopulationRisk] using h + +theorem finiteProduct_classifierMarginLoss_mgf_subgamma + {n : ℕ} [Fintype Z] (hn : 0 < n) + (p : Z → ℝ) (hp : IsPMF p) + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) + (i : ι) (lambda : ℝ) + (hlambda : 0 ≤ lambda) (hlambda_bound : lambda * (n : ℝ)⁻¹ < 3) : + (∑ S : Fin n → Z, + finiteProductSampleWeight p S * + Real.exp + (lambda * + (classifierMarginPopulationRisk p threshold score label i - + classifierMarginEmpiricalRisk threshold score label i S))) + ≤ + Real.exp + (lambda ^ 2 * + classifierMarginSampleVarianceProxy (n := n) p threshold score label i / + (2 * (1 - (3 * (n : ℝ))⁻¹ * lambda))) := by + classical + let singleBudget : ℝ := + (lambda * (n : ℝ)⁻¹) ^ 2 * + classifierMarginVarianceProxy p threshold score label i / + (2 * (1 - (lambda * (n : ℝ)⁻¹) / 3)) + have hsingle : + oneCoordinateDeviationMGF (n := n) p + (classifierMarginLoss threshold score label) i lambda ≤ + Real.exp singleBudget := by + simpa [singleBudget] using + (oneCoordinate_classifierMarginLoss_mgf_subgamma + (n := n) hn p hp threshold score label i lambda hlambda hlambda_bound) + have hprod := + finiteProduct_mgf_empiricalRiskDeviation_le_exp_of_single + (ι := ι) (Z := Z) hn p hp + (classifierMarginLoss threshold score label) i lambda singleBudget hsingle + refine hprod.trans_eq ?_ + congr 1 + have hn_ne : (n : ℝ) ≠ 0 := by exact_mod_cast (Nat.ne_of_gt hn) + have hden₁ : 2 * (1 - (lambda * (n : ℝ)⁻¹) / 3) ≠ 0 := by + have hpos : 0 < 1 - (lambda * (n : ℝ)⁻¹) / 3 := by linarith + nlinarith + have hden₂ : 2 * (1 - (3 * (n : ℝ))⁻¹ * lambda) ≠ 0 := by + have hn_pos : 0 < (n : ℝ) := by exact_mod_cast hn + have hlt : lambda < 3 * (n : ℝ) := by + rw [← div_lt_iff₀ hn_pos] + simpa [div_eq_mul_inv] using hlambda_bound + have hpos : 0 < 1 - (3 * (n : ℝ))⁻¹ * lambda := by + rw [sub_pos] + rw [inv_mul_lt_iff₀ (by positivity : 0 < 3 * (n : ℝ))] + nlinarith + nlinarith + simp [singleBudget, classifierMarginSampleVarianceProxy] + field_simp [hn_ne, hden₁, hden₂] + +theorem expectedPriorBernsteinExpMoment_classifierMargin_iid_le_one + {n : ℕ} [Fintype Z] [Fintype ι] + (hn : 0 < n) + (p : Z → ℝ) (hp : IsPMF p) + {π : ι → ℝ} (hπ : IsPMF π) + (lambda : ℝ) + (hlambda : 0 ≤ lambda) (hlambda_bound : lambda * (n : ℝ)⁻¹ < 3) + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) : + expectedPriorBernsteinExpMoment + (finiteProductSampleWeight (n := n) p) π lambda ((3 * (n : ℝ))⁻¹) + (classifierMarginPopulationRisk p threshold score label) + (classifierMarginEmpiricalRiskFn threshold score label) + (classifierMarginSampleVarianceProxy (n := n) p threshold score label) + ≤ 1 := by + classical + apply expectedPriorBernsteinExpMoment_le_one_of_mgf_bound hπ + intro i + simpa [classifierMarginEmpiricalRiskFn] using + (finiteProduct_classifierMarginLoss_mgf_subgamma + (n := n) hn p hp threshold score label i lambda hlambda hlambda_bound) + +/-! ### Product sample law -/ + +/-- Finite iid product sample weights form a PMF when the base mass is a PMF. -/ +theorem finiteProductSampleWeight_isPMF + {n : ℕ} [Fintype Z] {p : Z → ℝ} (hp : IsPMF p) : + IsPMF (finiteProductSampleWeight (n := n) p) where + nonneg := by + intro S + unfold finiteProductSampleWeight + exact Finset.prod_nonneg (fun k _hk => hp.nonneg (S k)) + sum_one := by + unfold finiteProductSampleWeight + calc + (∑ S : Fin n → Z, ∏ k : Fin n, p (S k)) + = ∏ _k : Fin n, ∑ z : Z, p z := by + exact (Fintype.prod_sum (f := fun _k : Fin n => p)).symm + _ = 1 := by simp [hp.sum_one] + +/-! ### PAC-Bayes Bernstein specialization -/ + +/-- Bad samples for the concrete classifier-margin PAC-Bayes Bernstein event. -/ +def finitePACBayesClassifierMarginBernsteinBadSamples + {n : ℕ} [Fintype Z] [Fintype ι] + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) + (p : Z → ℝ) (posteriorPenalty : (ι → ℝ) → ℝ) : + Finset (Fin n → Z) := + finitePACBayesBernsteinPenaltyBadSamples + (classifierMarginPopulationRisk p threshold score label) + (classifierMarginEmpiricalRiskFn threshold score label) + posteriorPenalty + +/-- +Finite PAC-Bayes Bernstein theorem specialized to classifier margin losses. + +The theorem states that, under the same explicit finite-parameter certificates +as `finitePACBayesBernsteinMargin_badEventMass_le_delta`, the iid sample mass +of classifier-margin bad samples is at most `delta`. +-/ +theorem finitePACBayesClassifierMarginBernstein_badEventMass_le_delta + {n : ℕ} [Fintype Z] [Fintype ι] [Nonempty ι] + (p : Z → ℝ) (hp : IsPMF p) + {π : ι → ℝ} (hπ : IsFullSupportPMF π) + (lambda scale delta : ℝ) + (hlambda : 0 < lambda) (hscale : scale * lambda < 1) + (hdelta : 0 < delta) + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) + (complexityOf : (ι → ℝ) → ℝ) + (hcomplexity : + ∀ ρ : ι → ℝ, IsPMF ρ → + klDiv ρ π + Real.log (1 / delta) ≤ complexityOf ρ) + (hpenalty : + ∀ ρ : ι → ℝ, IsPMF ρ → + complexityOf ρ / lambda + + lambda * + posteriorClassifierMarginVarianceProxy ρ p threshold score label / + (2 * (1 - scale * lambda)) + ≤ + Real.sqrt + (2 * posteriorClassifierMarginVarianceProxy ρ p threshold score label * + complexityOf ρ) + + scale * complexityOf ρ) + (hExpected : + expectedPriorBernsteinExpMoment + (finiteProductSampleWeight (n := n) p) π lambda scale + (classifierMarginPopulationRisk p threshold score label) + (classifierMarginEmpiricalRiskFn threshold score label) + (classifierMarginVarianceProxy p threshold score label) ≤ 1) : + (∑ S ∈ + finitePACBayesClassifierMarginBernsteinBadSamples + (n := n) threshold score label p + (fun ρ => + Real.sqrt + (2 * posteriorClassifierMarginVarianceProxy ρ p threshold score label * + complexityOf ρ) + + scale * complexityOf ρ), + finiteProductSampleWeight p S) ≤ delta := by + classical + simpa [finitePACBayesClassifierMarginBernsteinBadSamples, + posteriorClassifierMarginVarianceProxy] + using + (finitePACBayesBernsteinMargin_badEventMass_le_delta + (Ω := Fin n → Z) (ι := ι) + (ν := finiteProductSampleWeight (n := n) p) + (hν := finiteProductSampleWeight_isPMF (n := n) hp) + (π := π) hπ lambda scale delta hlambda hscale hdelta + (classifierMarginPopulationRisk p threshold score label) + (classifierMarginEmpiricalRiskFn threshold score label) + (classifierMarginVarianceProxy p threshold score label) + complexityOf hcomplexity hpenalty hExpected) + +/-- +Finite iid PAC-Bayes Bernstein theorem for classifier-margin losses. + +This version derives the normalized Bernstein prior-moment certificate from the +iid product sample law and the finite Bennett/sub-Gamma bound for thresholded +margin losses. The variance proxy is scaled by the sample size. +-/ +theorem finitePACBayesClassifierMarginBernstein_iid_badEventMass_le_delta + {n : ℕ} [Fintype Z] [Fintype ι] [Nonempty ι] + (hn : 0 < n) + (p : Z → ℝ) (hp : IsPMF p) + {π : ι → ℝ} (hπ : IsFullSupportPMF π) + (lambda delta : ℝ) + (hlambda : 0 < lambda) + (hlambda_bound : lambda * (n : ℝ)⁻¹ < 3) + (hdelta : 0 < delta) + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) + (complexityOf : (ι → ℝ) → ℝ) + (hcomplexity : + ∀ ρ : ι → ℝ, IsPMF ρ → + klDiv ρ π + Real.log (1 / delta) ≤ complexityOf ρ) + (hpenalty : + ∀ ρ : ι → ℝ, IsPMF ρ → + complexityOf ρ / lambda + + lambda * + posteriorClassifierMarginSampleVarianceProxy + (n := n) ρ p threshold score label / + (2 * (1 - (3 * (n : ℝ))⁻¹ * lambda)) + ≤ + Real.sqrt + (2 * + posteriorClassifierMarginSampleVarianceProxy + (n := n) ρ p threshold score label * + complexityOf ρ) + + (3 * (n : ℝ))⁻¹ * complexityOf ρ) : + (∑ S ∈ + finitePACBayesClassifierMarginBernsteinBadSamples + (n := n) threshold score label p + (fun ρ => + Real.sqrt + (2 * + posteriorClassifierMarginSampleVarianceProxy + (n := n) ρ p threshold score label * + complexityOf ρ) + + (3 * (n : ℝ))⁻¹ * complexityOf ρ), + finiteProductSampleWeight p S) ≤ delta := by + classical + have hExpected : + expectedPriorBernsteinExpMoment + (finiteProductSampleWeight (n := n) p) π lambda ((3 * (n : ℝ))⁻¹) + (classifierMarginPopulationRisk p threshold score label) + (classifierMarginEmpiricalRiskFn threshold score label) + (classifierMarginSampleVarianceProxy (n := n) p threshold score label) ≤ 1 := by + exact + expectedPriorBernsteinExpMoment_classifierMargin_iid_le_one + (n := n) hn p hp hπ.toIsPMF lambda hlambda.le hlambda_bound + threshold score label + have hscale : (3 * (n : ℝ))⁻¹ * lambda < 1 := by + have hn_pos : 0 < (n : ℝ) := by exact_mod_cast hn + have hlt : lambda < 3 * (n : ℝ) := by + rw [← div_lt_iff₀ hn_pos] + simpa [div_eq_mul_inv] using hlambda_bound + rw [inv_mul_lt_iff₀ (by positivity : 0 < 3 * (n : ℝ))] + nlinarith + simpa [finitePACBayesClassifierMarginBernsteinBadSamples, + posteriorClassifierMarginSampleVarianceProxy] + using + (finitePACBayesBernsteinMargin_badEventMass_le_delta + (Ω := Fin n → Z) (ι := ι) + (ν := finiteProductSampleWeight (n := n) p) + (hν := finiteProductSampleWeight_isPMF (n := n) hp) + (π := π) hπ lambda ((3 * (n : ℝ))⁻¹) delta hlambda + hscale hdelta + (classifierMarginPopulationRisk p threshold score label) + (classifierMarginEmpiricalRiskFn threshold score label) + (classifierMarginSampleVarianceProxy (n := n) p threshold score label) + complexityOf hcomplexity hpenalty hExpected) + +/-- +Finite-grid iid PAC-Bayes Bernstein theorem for classifier-margin losses. + +Each grid index supplies an admissible Bernstein parameter and confidence +allocation. A caller supplies the finite grid certificate assigning each +posterior to a bucket whose fixed-parameter Bernstein penalty is no larger than +`posteriorPenalty`. +-/ +theorem finitePACBayesClassifierMarginBernstein_iid_gridOptimized_badEventMass_le_sum_delta + {n : ℕ} [Fintype Z] [Fintype ι] [Nonempty ι] [Fintype γ] + (hn : 0 < n) + (p : Z → ℝ) (hp : IsPMF p) + {π : ι → ℝ} (hπ : IsFullSupportPMF π) + (lambdaOf confidenceOf : γ → ℝ) + (hlambda : ∀ g : γ, 0 < lambdaOf g) + (hlambda_bound : ∀ g : γ, lambdaOf g * (n : ℝ)⁻¹ < 3) + (hconfidenceOf : ∀ g : γ, 0 < confidenceOf g) + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) + (posteriorPenalty : (ι → ℝ) → ℝ) + (hgridCovers : + ∀ ρ : ι → ℝ, IsPMF ρ → + ∃ g : γ, + (klDiv ρ π + Real.log (1 / confidenceOf g)) / lambdaOf g + + lambdaOf g * + posteriorClassifierMarginSampleVarianceProxy + (n := n) ρ p threshold score label / + (2 * (1 - (3 * (n : ℝ))⁻¹ * lambdaOf g)) + ≤ posteriorPenalty ρ) : + (∑ S ∈ + finitePACBayesClassifierMarginBernsteinBadSamples + (n := n) threshold score label p posteriorPenalty, + finiteProductSampleWeight p S) ≤ + ∑ g : γ, confidenceOf g := by + classical + have hscale : ∀ g : γ, (3 * (n : ℝ))⁻¹ * lambdaOf g < 1 := by + intro g + have hn_pos : 0 < (n : ℝ) := by exact_mod_cast hn + have hlt : lambdaOf g < 3 * (n : ℝ) := by + rw [← div_lt_iff₀ hn_pos] + simpa [div_eq_mul_inv] using hlambda_bound g + rw [inv_mul_lt_iff₀ (by positivity : 0 < 3 * (n : ℝ))] + nlinarith + have hExpected : + ∀ g : γ, + expectedPriorBernsteinExpMoment + (finiteProductSampleWeight (n := n) p) π (lambdaOf g) + ((3 * (n : ℝ))⁻¹) + (classifierMarginPopulationRisk p threshold score label) + (classifierMarginEmpiricalRiskFn threshold score label) + (classifierMarginSampleVarianceProxy (n := n) p threshold score label) ≤ 1 := by + intro g + exact + expectedPriorBernsteinExpMoment_classifierMargin_iid_le_one + (n := n) hn p hp hπ.toIsPMF (lambdaOf g) (hlambda g).le + (hlambda_bound g) threshold score label + simpa [finitePACBayesClassifierMarginBernsteinBadSamples, + posteriorClassifierMarginSampleVarianceProxy] + using + (finitePACBayesBernsteinGridOptimized_badEventMass_le_sum_delta + (Ω := Fin n → Z) (ι := ι) (γ := γ) + (ν := finiteProductSampleWeight (n := n) p) + (hν := finiteProductSampleWeight_isPMF (n := n) hp) + (π := π) hπ lambdaOf (fun _g : γ => (3 * (n : ℝ))⁻¹) confidenceOf + posteriorPenalty hlambda hscale hconfidenceOf + (classifierMarginPopulationRisk p threshold score label) + (classifierMarginEmpiricalRiskFn threshold score label) + (classifierMarginSampleVarianceProxy (n := n) p threshold score label) + hgridCovers hExpected) + +/-- +Finite-grid iid PAC-Bayes Bernstein theorem for classifier-margin losses with +an explicit total confidence budget. +-/ +theorem finitePACBayesClassifierMarginBernstein_iid_gridOptimized_badEventMass_le_delta + {n : ℕ} [Fintype Z] [Fintype ι] [Nonempty ι] [Fintype γ] + (hn : 0 < n) + (p : Z → ℝ) (hp : IsPMF p) + {π : ι → ℝ} (hπ : IsFullSupportPMF π) + (lambdaOf confidenceOf : γ → ℝ) + (hlambda : ∀ g : γ, 0 < lambdaOf g) + (hlambda_bound : ∀ g : γ, lambdaOf g * (n : ℝ)⁻¹ < 3) + (hconfidenceOf : ∀ g : γ, 0 < confidenceOf g) + (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) + (posteriorPenalty : (ι → ℝ) → ℝ) + (hgridCovers : + ∀ ρ : ι → ℝ, IsPMF ρ → + ∃ g : γ, + (klDiv ρ π + Real.log (1 / confidenceOf g)) / lambdaOf g + + lambdaOf g * + posteriorClassifierMarginSampleVarianceProxy + (n := n) ρ p threshold score label / + (2 * (1 - (3 * (n : ℝ))⁻¹ * lambdaOf g)) + ≤ posteriorPenalty ρ) + {delta : ℝ} (hgridConfidence : (∑ g : γ, confidenceOf g) ≤ delta) : + (∑ S ∈ + finitePACBayesClassifierMarginBernsteinBadSamples + (n := n) threshold score label p posteriorPenalty, + finiteProductSampleWeight p S) ≤ delta := + (finitePACBayesClassifierMarginBernstein_iid_gridOptimized_badEventMass_le_sum_delta + (n := n) (Z := Z) (ι := ι) (γ := γ) + hn p hp hπ lambdaOf confidenceOf hlambda hlambda_bound hconfidenceOf + threshold score label posteriorPenalty hgridCovers).trans hgridConfidence + +end + +end FormalSLT.PACBayesMargin diff --git a/FormalSLT/Test/PACBayesBernsteinTest.lean b/FormalSLT/Test/PACBayesBernsteinTest.lean index 5609928..347d8a9 100644 --- a/FormalSLT/Test/PACBayesBernsteinTest.lean +++ b/FormalSLT/Test/PACBayesBernsteinTest.lean @@ -19,6 +19,7 @@ namespace FormalSLT.PACBayesBernstein.Test #check FormalSLT.PACBayesBernstein.posteriorMarginVarianceProxy #check FormalSLT.PACBayesBernstein.priorBernsteinExpMoment #check FormalSLT.PACBayesBernstein.expectedPriorBernsteinExpMoment +#check FormalSLT.PACBayesBernstein.expectedPriorBernsteinExpMoment_le_one_of_mgf_bound #check FormalSLT.PACBayesBernstein.priorBernsteinExpMomentTailMass #check FormalSLT.PACBayesBernstein.priorBernsteinExpMoment_nonneg #check FormalSLT.PACBayesBernstein.priorBernsteinExpMoment_tailMass_le_expected_div @@ -30,4 +31,6 @@ namespace FormalSLT.PACBayesBernstein.Test #check FormalSLT.PACBayesBernstein.finitePACBayesBernsteinPenalty_badEventMass_le_delta #check FormalSLT.PACBayesBernstein.finitePACBayesBernsteinMargin_badEventMass_le_delta +#print axioms FormalSLT.PACBayesBernstein.expectedPriorBernsteinExpMoment_le_one_of_mgf_bound + end FormalSLT.PACBayesBernstein.Test diff --git a/README.md b/README.md index 8f9ef74..272ccfc 100644 --- a/README.md +++ b/README.md @@ -7,154 +7,91 @@ [![Axioms](https://img.shields.io/badge/axioms-propext%2C%20Classical.choice%2C%20Quot.sound-brightgreen.svg)](#audit-commands) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](./LICENSE) -FormalSLT is a Lean 4 library for checking finite-sample statistical learning -theory proof chains. The current v0.1 surface is organized around two checked -endpoints: - -1. a finite-class, countable-time Hoeffding confidence sequence for `[0,1]` - losses; and -2. a finite-net Dudley bridge for a concrete process indexed by the non-finite - unit interval `[0,1]`. - -The repository also packages reusable dyadic Dudley API instances for a -two-point metric space and for finite discrete spaces `Fin n`. Those examples -are API checks: they show that the finite-net wrapper is reusable outside the -unit-interval file. - -The concentration layer now carries the sharp McDiarmid bounded-differences -inequality over a homogeneous product measure. For a function whose value -changes by at most `c k` when coordinate `k` is altered, the library proves the -one-sided tail `exp(-2ε²/∑ₖcₖ²)` (`mcdiarmid_of_hasBoundedDifferences_sharp`), -its lower-tail form (`mcdiarmid_of_hasBoundedDifferences_sharp_lower`), and the -two-sided bound `2·exp(-2ε²/∑ₖcₖ²)` -(`mcdiarmid_twoSided_of_hasBoundedDifferences_sharp`). The sharp `2B²` exponent -is propagated into the high-probability Rademacher, finite-class, VC, and -algorithmic-stability wrappers listed below, replacing the looser Azuma `8B²` -constant on those paths. The localized-Rademacher wrapper still uses the older -Azuma constant. The bounded-differences theorem is stated for the same product -law in each coordinate, not for arbitrary non-iid product spaces. - -The PAC-Bayes layer now includes a finite Bernstein margin-proxy shell. It -adds a supplied per-hypothesis variance proxy, a normalized Bernstein -prior-moment certificate, fixed-`λ` finite bad-event bounds, and a -posterior-dependent square-root-plus-linear wrapper. This is a supplied-proxy -finite shell: it is not yet a concrete classifier-margin extractor, an -all-real-`λ` optimization theorem, or a continuous hypothesis-space theorem. - -**60 `FormalSLT/` Lean files. 85 checked Lean files under `FormalSLT/` and -`examples/`. 33,427 lines. Zero `sorry`. Zero `admit`. Zero custom axioms.** - -Counts are generated with `find FormalSLT -name '*.lean'`, `find FormalSLT -examples -name '*.lean'`, and `find FormalSLT examples -name '*.lean' -print0 | -xargs -0 wc -l`. - -The printed axiom profile for the v0.1 headline surface stays inside: +FormalSLT is a compact Lean 4 library for the finite-sample statistical +learning theory route from empirical risk minimization to VC-style +generalization bounds, with recent extensions for contraction, linear +predictors, finite sub-Gaussian chaining, algorithmic stability, and +finite PAC-Bayes confidence bounds, a conditional sub-gamma MGF extractor, +sharp bounded-differences/McDiarmid tails, and a first total-bounded finite-net +bridge for the Dudley lane. The PAC-Bayes layer now also includes a finite iid +Bernstein classifier-margin theorem for thresholded margin losses. + +> **Repository names.** The public home of this project is +> [`Robby955/FormalSLT`](https://github.com/Robby955/FormalSLT); the same code +> also lives in the development repo `Robby955/lean-statistical-learning`. Clone +> instructions and badges below use the public `FormalSLT` name. + + +**61 Lean modules. 32,558 lines of Lean. Zero `sorry`. Zero `admit`. Zero custom axioms.** + + +Axioms used by the public theorem spine: `[propext, Classical.choice, Quot.sound]`. ![FormalSLT theorem chain](./docs/theorem-chain.svg) -## What v0.1 Proves - -| Surface | Main checked endpoint | Checker | -|---|---|---| -| Finite-class Hoeffding confidence sequence | `FiniteClassConfidenceSequence.failure_probability_le` | `examples/CheckUniformConvergence.lean` | -| Unit-interval finite-net Dudley bridge | `unitIntervalRademacherLinearSup_roundedDyadicGrid_dudley_m_bound_prefixFree` | `examples/CheckUnitIntervalDudley.lean` | -| Packaged finite dyadic Dudley API | `FiniteDyadicDudleyInstance.suppliedSup_dudley_bound` | `examples/CheckV01Usability.lean` | -| Two-point dyadic Dudley instance | `twoPointDudleyInstance` | `examples/CheckTwoPointDudley.lean` | -| `Fin n` discrete dyadic Dudley instance | `finDiscreteDudleyInstance` | `examples/CheckFiniteDiscreteDudley.lean` | -| Two-sided sharp McDiarmid over a homogeneous product measure | `mcdiarmid_twoSided_of_hasBoundedDifferences_sharp` | `FormalSLT/Test/SharpMcDiarmidTest.lean` | -| PAC-Bayes Bernstein supplied margin-proxy shell | `finitePACBayesBernsteinMargin_badEventMass_le_delta` | `examples/CheckPACBayesBernstein.lean` | - -The confidence-sequence chain makes the finite-class and countable-time union -bound explicit. The Dudley chain connects finite sub-Gaussian chaining to a -non-finite metric index space by constructing rounded dyadic finite nets and -checking the supplied supremum for the concrete process -`X(b,t) = signOfBool b * t`. - -## Fast Verification - -From the repository root: - -```bash -lake exe cache get -lake build FormalSLT -lake env lean examples/CheckV01Usability.lean -lake env lean examples/CheckUniformConvergence.lean -lake env lean examples/CheckUnitIntervalDudley.lean -lake env lean examples/CheckTwoPointDudley.lean -lake env lean examples/CheckFiniteDiscreteDudley.lean -lake env lean examples/CheckPACBayesBernstein.lean -python3 scripts/generate_proof_frontier_manifest.py --check -``` - -If `lake` is not on the shell path, use `~/.elan/bin/lake`. - -## What v0.1 Does Not Prove - -- No full continuous Dudley entropy-integral theorem. -- No arbitrary measurable-supremum construction over non-finite classes. -- No general separability theorem. -- No infinite-class confidence sequence. -- No martingale or e-process stopping theorem in the v0.1 confidence-sequence - surface. -- No neural-network generalization theorem. +The core path runs from ERM through Rademacher symmetrization, +high-probability Rademacher bounds, Massart, Sauer-Shelah, the binary VC +bridge, finite contraction, linear predictors, finite sub-Gaussian chaining, +finite Dudley entropy-budget wrappers, finite algorithmic stability, finite +localized-Rademacher scaffolding, sharp McDiarmid product-measure concentration, +finite PAC-Bayes KL/DV/MGF, bounded-loss confidence bounds, finite iid +Bernstein classifier-margin adapters, and total-bounded finite-net adapters for +the next Dudley steps. ## Where to start -- **For the v0.1 proof surface:** start with - [FormalSLT v0.1 quickstart](./docs/formalslt-v0.1-quickstart.md). -- **For the v0.1 technical note:** read - [FormalSLT v0.1 technical note](./docs/formalslt-v0.1-technical-note.md). -- **For TheoremPath packaging:** read - [TheoremPath FormalSLT v0.1 draft](./docs/theorempath-formalslt-v0.1-page-draft.mdx). -- **For the live walkthrough:** see - [theorempath.com/theorems/formalslt-v0-1](https://theorempath.com/theorems/formalslt-v0-1). - **For ML readers:** start with [How to read the proofs](./docs/how-to-read-the-proofs.md), then [Intuition](./docs/intuition.md). - **For proof structure:** see [Diagrams](./docs/diagrams.md). - **For exact theorem names:** use [Theorem map](./docs/theorem-map.md). -- **For the conditional sub-Gamma extractor:** see - [Conditional Sub-Gamma Extractor](./docs/subgamma-extractor.md). -- **For the non-finite unit-interval Dudley example:** see - [Unit-Interval Dudley Example](./docs/unit-interval-dudley.md). -- **For reusable dyadic Dudley API packaging:** see - [FormalSLT v0.1 quickstart](./docs/formalslt-v0.1-quickstart.md). -- **For the finite discrete dyadic-net family:** see - [FormalSLT v0.1 quickstart](./docs/formalslt-v0.1-quickstart.md). -- **For a generated proof-surface index:** see - [Proof frontier manifest](./docs/proof-frontier.md). - **For scope and assumptions:** read - [Scope and assumptions](./docs/assumptions-and-nonclaims.md). + [Assumptions and current boundaries](./docs/assumptions-and-nonclaims.md). - **For contributors:** read [Contributing](./CONTRIBUTING.md), then [Good first issues](./docs/good-first-issues.md). - **For related Lean projects:** see [Related work](./docs/related-work.md). - FormalSLT is scoped as a finite-class theorem spine and is complementary to - existing empirical-process and Rademacher-generalization formalizations. - -## Library Map - -FormalSLT records finite-sample statistical learning bounds as Lean theorems -with explicit hypotheses and constants. Beyond the v0.1 headline chains, the -library contains checked finite routes for: - -- finite-class ERM, Rademacher symmetrization, Massart, Sauer-Shelah, and - VC-style sample-complexity wrappers; -- high-probability Rademacher and VC sample-complexity bounds carrying the - sharp McDiarmid `2B²` exponent; -- the sharp bounded-differences/McDiarmid concentration module - (`Concentration.SharpMcDiarmid`): one-sided, lower-tail, and two-sided - homogeneous product-measure tails, plus the additive independent special case; -- finite contraction and linear-predictor Rademacher bounds; -- finite Bennett/Bernstein and localized-Rademacher scaffolding; -- conditional sub-Gamma MGF extraction for bounded, conditionally centered - increments with an explicit conditional second-moment proxy; -- finite covering, finite sub-Gaussian chaining, and finite Dudley - entropy-budget wrappers; -- finite algorithmic stability expected-gap and high-probability wrappers; -- finite PAC-Bayes KL/DV/MGF, bounded-loss confidence bounds, and finite-grid - peeling wrappers; -- finite PAC-Bayes Bernstein margin-proxy bounds with a supplied - per-hypothesis variance proxy. + FormalSLT is complementary to + [YuanheZ/lean-stat-learning-theory](https://github.com/YuanheZ/lean-stat-learning-theory), + which builds deeper empirical-process infrastructure; this repo focuses on a + readable finite-class theorem spine. + +## Why this matters + +Finite-sample proofs in statistical learning theory carry assumptions that are +easy to blur in prose. FormalSLT turns a readable finite-sample route into +machine-checked Lean statements, so each bound carries its hypotheses, +constants, and finite-class scope in the theorem signature. The motivations: + +- **Reproducibility.** Every constant — the sharp McDiarmid exponent, the + `2 * E[Rad]` factor, the `(en/d)^d` Sauer-Shelah closed form — is a Lean + term that the kernel re-checks on every build. There is no implicit "without + loss of generality" or "up to constants". +- **Curriculum.** A graduate student reading the README can click through to + the exact Lean statement for any bound they have seen on a chalkboard, with + the assumptions made explicit in the type signature. +- **Frontier ML.** As learning theory increasingly feeds back into modern + deep-learning analyses (PAC-Bayes generalization, stability, chaining), + having a checked finite scaffold is the first step toward formal guarantees + on the methods themselves. + +## Theorem families + +| Family | Main modules | Representative result | Status | +|---|---|---|---| +| Finite-class ERM | `Risk`, `ERM`, `Rademacher.ERMGeneralization` | Excess risk controlled by uniform deviation | Verified | +| Rademacher symmetrization | `GhostSample`, `Rademacher.Symmetrization` | `E[genGap] ≤ 2 * E[Rad]` | Verified | +| High-probability Rademacher | `Azuma.*`, `Rademacher.HighProbability` | `P(genGap ≥ 2 * E[Rad] + ε) ≤ exp(-ε² n / (2B²))` | Verified | +| Conditional sub-gamma concentration | `Concentration.SubGamma.Extractor`, `Concentration.SubGamma.BennettBound` | bounded + conditionally-centered + conditional second moment `≤ σ²` ⇒ conditional MGF `≤ exp(σ²λ²/(2(1−bλ/3)))` on `b·λ < 3` | Verified | +| Massart finite-class bound | `Rademacher.Massart` | `Rad(H,S) ≤ B * sqrt(2 * log card(H) / n)` | Verified | +| Binary VC route | `VC.SauerShelah`, `VC.BinaryVCBridge`, `VC.SampleComplexity` | VC-style ERM excess-risk tail and closed sample-complexity form via effective classes | Verified | +| Finite contraction | `Rademacher.Contraction` | `Rad_S(φ ∘ F) ≤ L * Rad_S(F)` for scalar finite samples/classes | Verified | +| Linear predictors | `Rademacher.LinearPredictor` | `Rad ≤ R * n⁻¹ * sqrt(∑ k, ‖z k‖²)` and `Rad ≤ R * B / sqrt n` | Verified | +| Localized Rademacher scaffold | `Rademacher.Localized` | Bernstein excess-risk localization embeds into second-moment localized empirical Rademacher complexity | Verified finite scaffold | +| Finite covering and two-scale chaining | `Covering.Rademacher`, `Covering.DudleyChaining` | ε-net peeling and two-scale finite chaining | Verified | +| Finite sub-Gaussian chaining foundation | `Covering.FiniteSubGaussianChaining` | finite-max entropy bounds and finite Dudley-style entropy-budget sums | Verified finite infrastructure | +| Total-bounded Dudley bridge | `Covering.TotalBoundedDudley` | totally bounded metric spaces yield dyadic finite-net schedules, projected finite-net wrappers, truncated interval-integral entropy comparisons, and supplied-supremum / finite-skeleton / pathwise-modulus / epsilonized boundary adapters | Verified bridge | +| Algorithmic stability | `AlgorithmicStability`, `Stability.BousquetElisseeff` | bounded-differences constants, finite and product-measure expected-gap wrappers with bound `β`, bounded-loss measurability adapters, and bounded-loss Azuma-constant concentration wrappers | Verified finite scaffold | +| PAC-Bayes finite confidence layer | `PACBayesKL`, `PACBayesMcAllester`, `PACBayesFiniteProductMGF`, `PACBayesBoundedLoss`, `PACBayesBernstein`, `PACBayesMargin` | finite KL/DV change-of-measure, bounded-loss Catoni-style bound, closed PAC-Bayes good-event payoff, fixed-budget McAllester corollary, finite-grid McAllester peeling wrapper, and finite iid Bernstein classifier-margin bad-event bounds | Verified finite layer | ## Scope and assumptions @@ -165,45 +102,36 @@ The main generalization theorems are intentionally finite and explicit. | Hypothesis classes | Finite index types unless a theorem states a separate finite net/family | | Samples | Finite iid samples through product measures | | Losses/processes | Scalar real-valued, with boundedness or finite sub-Gaussian MGF assumptions | -| Conditional MGF layer | Bounded, conditionally centered real increments with an explicit conditional second-moment proxy | -| Constants | High-probability Rademacher and VC sample-complexity bounds use the sharp McDiarmid `2B²` exponent; the localized-Rademacher wrapper still cites the Azuma `8B²` constant | -| PAC-Bayes Bernstein layer | Finite posterior/prior setting with supplied variance proxy and normalized prior-moment certificate | -| Chaining | Finite nets/images, finite support/outcome spaces, finite entropy sums; the unit-interval example instantiates the bridge on a non-finite metric index space with explicit finite meshes | +| Constants | Main Rademacher, VC, stability, and bounded-differences wrappers use the sharp McDiarmid route where stated; legacy Azuma lemmas remain as named infrastructure | +| Chaining | Finite nets/images, finite support/outcome spaces, finite entropy sums | | Public axiom target | `[propext, Classical.choice, Quot.sound]` only | -## Open work +## Current boundaries Short version: -- The main Rademacher and VC results are finite-class and finite-sample - theorems. -- The sharp McDiarmid constant `exp(-2t²/∑cᵢ²)` is proved for the additive - independent case (`mcdiarmid_additive_independent`), for Doob martingale - increments with conditional sub-Gaussian MGF control - (`sharp_mcdiarmid_of_doob_increments`), and for a general bounded-differences - function over a homogeneous product measure, one-sided - (`mcdiarmid_of_hasBoundedDifferences_sharp`) and two-sided - (`mcdiarmid_twoSided_of_hasBoundedDifferences_sharp`). The high-probability - Rademacher and VC sample-complexity wrappers now use this sharp `2B²` - exponent; the localized-Rademacher wrapper still uses the older Azuma `8B²` - constant. The bounded-differences theorem is stated for the same product law - in each coordinate, not for arbitrary non-iid product spaces. -- The chaining layer proves finite entropy-budget infrastructure and an initial +- The main Rademacher and VC results are finite-class / finite-sample theorems. +- The main Rademacher, VC, stability, and bounded-differences wrappers have + been rewired to the sharp McDiarmid route where the theorem name states that + sharp constant. +- The chaining layer proves finite entropy-budget infrastructure and a first total-bounded finite-net extraction bridge, not the continuous Dudley integral. - PAC-Bayes includes a finite `[0,1]` bounded-loss Catoni-style confidence bound, a closed high-confidence good-event theorem, a fixed-budget - McAllester-style square-root corollary, a finite-grid peeling wrapper for - posterior-dependent penalties, and a finite Bernstein margin-proxy shell with - a supplied per-hypothesis variance proxy. Exact all-real-`λ`, concrete - classifier-margin extractors, infinite-hypothesis, and continuous-posterior - variants are not yet implemented. + McAllester-style square-root corollary, a finite-grid peeling wrapper for + posterior-dependent penalties, and a finite iid Bernstein classifier-margin + theorem for thresholded margin losses with the prior-moment certificate + derived from the iid finite product law. The Bernstein lane now also has a + finite-grid `λ` wrapper for posterior-dependent penalties. Exact all-real + `λ`, infinite-hypothesis, and continuous-posterior variants remain future + work. - Algorithmic stability includes finite iid and measure-theoretic iid expected-gap wrappers, plus bounded-loss high-probability wrappers for finite measurable hypothesis interfaces. For the full scope statement, see -[Scope and assumptions](./docs/assumptions-and-nonclaims.md). +[Assumptions and current boundaries](./docs/assumptions-and-nonclaims.md). ## Installation @@ -235,18 +163,7 @@ Run these before treating a branch as a release candidate: ```bash lake exe cache get lake build FormalSLT -lake env lean examples/CheckV01Usability.lean -lake env lean examples/CheckSubGammaExtractor.lean -lake env lean examples/CheckUnitIntervalDudley.lean -lake env lean examples/CheckTwoPointDudley.lean -lake env lean examples/CheckFiniteDiscreteDudley.lean -lake env lean examples/CheckPACBayesBernstein.lean -python3 scripts/generate_proof_frontier_manifest.py --check -python3 scripts/check_doc_anchors.py \ - docs/formalslt-v0.1-technical-note.md \ - docs/formalslt-v0.1-artifact-map-2026-06-01.md \ - docs/formalslt-v0.1-release-review-2026-06-01.md \ - docs/theorempath-formalslt-v0.1-page-draft.mdx +lake env lean examples/CheckShowcaseTheorems.lean ``` ## Audit commands @@ -257,35 +174,16 @@ audits: ```bash rg -n --pcre2 '^\s*(?:by\s+)?(?:sorry|admit)\b|:=\s*(?:by\s+)?(?:sorry|admit)\b' FormalSLT examples rg -n --pcre2 '^\s*(?:axiom|constant)\s+[A-Za-z_]' FormalSLT examples -python3 scripts/generate_proof_frontier_manifest.py --check -python3 scripts/check_doc_anchors.py \ - docs/formalslt-v0.1-technical-note.md \ - docs/formalslt-v0.1-artifact-map-2026-06-01.md \ - docs/formalslt-v0.1-release-review-2026-06-01.md \ - docs/theorempath-formalslt-v0.1-page-draft.mdx git diff --check ``` The expected result is: - `lake build FormalSLT` exits successfully; -- `examples/CheckV01Usability.lean` resolves the v0.1 citation targets and - prints standard Lean/Mathlib axioms for the bundled confidence-sequence API, - the dyadic-net sequence API, and the concrete dyadic-net instances; - `examples/CheckShowcaseTheorems.lean` prints standard Lean/Mathlib axioms for selected public theorems; -- `examples/CheckSubGammaExtractor.lean` prints standard Lean/Mathlib axioms - for the conditional sub-Gamma extractor and its helper lemmas; -- `examples/CheckUnitIntervalDudley.lean` prints standard Lean/Mathlib axioms - for the concrete unit-interval Dudley example; -- `examples/CheckTwoPointDudley.lean` prints standard Lean/Mathlib axioms - for the second dyadic-net example; -- `examples/CheckFiniteDiscreteDudley.lean` prints standard Lean/Mathlib - axioms for the finite discrete embedded Rademacher dyadic-net family; - the `rg` commands find no executable `sorry`, no executable `admit`, and no custom axioms/constants in `FormalSLT` or `examples`; -- the proof-frontier manifest is in sync with the theorem map and source counts; -- documented `FormalSLT/...lean:line` anchors point at the named declarations; - `git diff --check` reports no whitespace errors. ## Module map @@ -294,11 +192,12 @@ The expected result is: |---|---| | Core definitions | `Risk`, `ERM`, `UniformConvergence`, `GhostSample` | | Probability utilities | `Probability.Concentration`, `Probability.FiniteUnionBound`, `Probability.FiniteExpectation` | -| Conditional sub-Gamma infrastructure | `Concentration.SubGamma.BennettBound`, `Concentration.SubGamma.BoundedExpIntegrable`, `Concentration.SubGamma.CondExpProduct`, `Concentration.SubGamma.CondJensen`, `Concentration.SubGamma.CondMarkov`, `Concentration.SubGamma.CondVarianceFromSquare`, `Concentration.SubGamma.Extractor` | +| Conditional sub-gamma concentration | `Concentration.SubGamma.BennettBound`, `Concentration.SubGamma.BoundedExpIntegrable`, `Concentration.SubGamma.CondExpProduct`, `Concentration.SubGamma.CondJensen`, `Concentration.SubGamma.CondMarkov`, `Concentration.SubGamma.CondVarianceFromSquare`, `Concentration.SubGamma.Extractor` | +| Sharp bounded-differences concentration | `Concentration.SharpMcDiarmid` | | Rademacher route | `Rademacher.FiniteSample`, `Rademacher.FiniteSampleSymmetrization`, `Rademacher.ProbabilityBridge`, `Rademacher.Decoupling`, `Rademacher.Symmetrization`, `Rademacher.Massart`, `Rademacher.HighProbability`, `Rademacher.FiniteClassHighProb`, `Rademacher.UniformDeviation`, `Rademacher.ERMGeneralization`, `Rademacher.Contraction`, `Rademacher.LinearPredictor`, `Rademacher.Localized` | | Azuma infrastructure | `Azuma.ExposureMartingale`, `Azuma.BoundedDifferences`, `Azuma.BoundedDiffMartingale`, `Azuma.BoundedDiffsAzumaInput`, `Azuma.BoundedIncrementBound`, `Azuma.HasBoundedDifferences`, `Azuma.ExposureIncrementHoeffding`, `Azuma.ExposureIncrementCondMGF`, `Azuma.GenGapTail` | | VC route | `VC.Dimension`, `VC.PACBridge`, `VC.SauerShelah`, `VC.Rademacher`, `VC.SampleComplexity`, `VC.BinaryVCBridge` | -| Covering and chaining | `Covering.Rademacher`, `Covering.DudleyChaining`, `Covering.FiniteSubGaussianChaining`, `Covering.TotalBoundedDudley`, `Covering.UnitIntervalDudley`, `Covering.TwoPointDudley`, `Covering.FiniteDiscreteDudley` | +| Covering and chaining | `Covering.Rademacher`, `Covering.DudleyChaining`, `Covering.FiniteSubGaussianChaining`, `Covering.TotalBoundedDudley` | | Stability and PAC-Bayes foundations | `AlgorithmicStability`, `Stability.BousquetElisseeff`, `PACBayesKL`, `PACBayesMcAllester`, `PACBayesFiniteProductMGF`, `PACBayesBoundedLoss`, `PACBayesBernstein` | ## Roadmap @@ -308,6 +207,8 @@ The expected result is: - [x] Massart finite-class bound - [x] Azuma-Hoeffding genGap tail - [x] High-probability Rademacher bound +- [x] Conditional sub-gamma MGF extractor (bounded + conditionally-centered + + conditional second moment ⇒ sub-gamma conditional MGF on `b·λ < 3`) - [x] Sauer-Shelah polynomial bound - [x] VC-style pointwise Rademacher - [x] VC uniform deviation and ERM excess-risk tail @@ -316,8 +217,6 @@ The expected result is: - [x] Finite-sample scalar contraction - [x] Finite-dimensional linear predictor Rademacher bound - [x] Finite localized Rademacher/Bernstein variance-localization scaffold -- [x] Conditional sub-Gamma MGF extractor from boundedness, conditional - centering, and a conditional second-moment proxy - [x] Covering number peeling and two-scale chaining - [x] Finite sub-Gaussian max and finite chaining entropy budgets - [x] Algorithmic stability bounded-differences scaffold @@ -330,8 +229,10 @@ The expected result is: - [x] PAC-Bayes closed high-confidence generalization payoff theorem - [x] PAC-Bayes fixed-budget McAllester-style square-root corollary - [x] PAC-Bayes finite-grid McAllester peeling and optimized finite-grid wrapper -- [x] PAC-Bayes finite Bernstein margin-proxy wrapper with supplied variance - proxy and normalized prior-moment certificate +- [x] PAC-Bayes finite iid Bernstein classifier-margin confidence wrapper with + concrete margin losses and an iid-derived normalized prior-moment certificate +- [x] PAC-Bayes finite-grid Bernstein `λ` wrapper for posterior-dependent + classifier-margin penalties - [x] Finite Dudley discrete entropy-bound refinements: annulus, integral-budget, and prefix-envelope wrappers - [x] Total-bounded finite-net extraction bridge for the continuous Dudley lane @@ -360,8 +261,6 @@ The expected result is: - [x] Finite-cover/pathwise-modulus bridge into the separable-terminal Dudley boundary interface - [x] Finite-terminal total-bounded dyadic Dudley wrapper -- [x] Concrete unit-interval Dudley example with explicit half/quarter meshes - and a supplied-supremum projected-mesh bound - [ ] Continuous Dudley entropy-integral theorem over total-bounded classes - [x] Measure-theoretic iid algorithmic stability expected bound, with explicit integrability assumptions @@ -369,16 +268,13 @@ The expected result is: expected-gap theorem - [x] Bounded-loss high-probability stability wrappers for finite measurable hypothesis interfaces -- [ ] PAC-Bayes concrete margin-loss Bernstein extractor, all-real-`λ`, or - continuous-posterior extensions -- [x] Sharp McDiarmid constant for the additive independent case and for Doob - martingale increments with conditional sub-Gaussian MGF control -- [x] Sharp McDiarmid for a general bounded-differences function over a - homogeneous product measure: one-sided, lower-tail, and two-sided - (`mcdiarmid_twoSided_of_hasBoundedDifferences_sharp`), with the sharp `2B²` - exponent propagated into the high-probability Rademacher and VC wrappers -- [ ] Sharp McDiarmid over arbitrary non-iid product spaces (different law per - coordinate) +- [x] PAC-Bayes iid margin-loss Bernstein prior-moment derivation +- [x] PAC-Bayes Bernstein finite-grid `λ` extension +- [ ] PAC-Bayes Bernstein all-real-`λ` or continuous-posterior extensions +- [x] Sharp McDiarmid/product-kernel assembly. The sharp exposure-increment + conditional MGF, one-sided iid product bounded-differences tail, lower-tail + wrapper, and two-sided `mcdiarmid_twoSided_of_hasBoundedDifferences_sharp` + theorem are on the verified spine - [ ] Continuous Dudley-style entropy integral ## Dependencies @@ -406,7 +302,7 @@ If you use FormalSLT in academic work, please cite: ```bibtex @software{formal_slt, title = {FormalSLT: Formal Statistical Learning Theory in Lean 4}, - author = {Sneiderman, Robert}, + author = {Sneiderman, Robby}, year = {2026}, url = {https://github.com/Robby955/FormalSLT}, note = {Lean 4 formalization of finite-sample SLT bounds.} diff --git a/docs/assumptions-and-nonclaims.md b/docs/assumptions-and-nonclaims.md index d0f821e..17af217 100644 --- a/docs/assumptions-and-nonclaims.md +++ b/docs/assumptions-and-nonclaims.md @@ -1,6 +1,7 @@ -# Scope and Assumptions +# Assumptions and Current Boundaries -This page records the assumptions behind the current theorem spine. +This page records the assumptions behind the current theorem spine and the +current boundaries for public summaries. ## Scope @@ -75,38 +76,10 @@ generic VC bridge is not yet part of the library. - finite multiscale chains; - finite entropy sums and entropy-budget wrappers. -It is finite infrastructure for Dudley-style arguments. It is not the -continuous Dudley entropy integral. +It is a foundation for Dudley-style arguments. It is not the continuous +Dudley entropy integral. -### Unit-interval Dudley example - -`Covering.UnitIntervalDudley` instantiates the total-bounded finite-net bridge -on the non-finite metric index space `[0,1]`. It constructs explicit half and -quarter meshes, packages the Rademacher process -`X(b,t) = signOfBool b * t`, and routes a nonzero supplied supremum through a -projected finite-net Dudley bound with a concrete `sqrt (log 15)` entropy -envelope. - -This example still uses the supplied-supremum interface. It does not construct -arbitrary measurable suprema, prove a general separability theorem, or state -the full continuous Dudley entropy integral. - -### Conditional sub-Gamma extraction - -`Concentration.SubGamma.Extractor` proves a conditional MGF bound for a -single bounded real increment. The headline theorem -`condSubGammaMGF_of_bounded_centered_condVariance` assumes: - -- an a.s. bound `|X| ≤ b`; -- conditional centering `μ[X | m] = 0`; -- a conditional second-moment proxy `μ[X² | m] ≤ σ²`; -- the parameter regime `0 ≤ λ` and `b * λ < 3`. - -The theorem converts those assumptions into a conditional sub-Gamma MGF bound. -It does not by itself construct a filtration, prove independence, or state a -full sequential concentration inequality. - -## Open Work +## Current Boundaries ### VC-to-PAC equivalence @@ -121,11 +94,6 @@ general infinite classes requires covering-number APIs, measurable suprema, separability assumptions, and approximation arguments that are outside the current theorem spine. -The unit-interval Dudley example is the current concrete bridge beyond a -finite ambient index type. It verifies finite-net machinery on `[0,1]`, but it -does not remove the remaining measurable-supremum and separability obligations -for arbitrary non-finite classes. - ### Downstream sharp-tail propagation The product-measure sharp McDiarmid theorem is checked, and the main @@ -154,10 +122,13 @@ posterior-dependent penalties certified by that finite grid. The closed total iid product mass to state the finite high-confidence good event directly. `PACBayesBernstein` adds a finite Bernstein margin-proxy shell: the variance proxy is supplied per hypothesis, and the theorem consumes a normalized -Bernstein prior-moment certificate. It does not yet derive that variance proxy -from a concrete classifier-margin loss. Exact all-real `λ`, finite-grid -Bernstein optimization, infinite-hypothesis, and continuous-posterior -PAC-Bayes theorems are not yet implemented. +Bernstein prior-moment certificate. `PACBayesMargin` specializes that shell to +thresholded classifier-margin losses, proves the iid finite classifier-margin +MGF budget, derives the normalized Bernstein prior-moment certificate, and +packages the fixed-`lambda` iid bad-event theorem with the sample-size-scaled +variance proxy. Exact all-real `λ`, finite-grid Bernstein optimization, +infinite-hypothesis, and continuous-posterior PAC-Bayes theorems remain future +work. ### Algorithmic stability expected bound diff --git a/docs/diagrams.md b/docs/diagrams.md index 264c04b..79b4013 100644 --- a/docs/diagrams.md +++ b/docs/diagrams.md @@ -186,8 +186,8 @@ flowchart TD The PAC-Bayes lane runs from the KL/Donsker-Varadhan change of measure through a finite Catoni-style bound to a finite-grid McAllester peeling wrapper for posterior-dependent penalties. The Bernstein lane adds a finite margin-proxy -wrapper with a supplied per-hypothesis variance proxy and a normalized -prior-moment certificate. +wrapper, then discharges the normalized prior-moment certificate for iid finite +classifier-margin losses, and then adds a finite-grid Bernstein `lambda` wrapper. ```mermaid flowchart TD @@ -198,18 +198,20 @@ flowchart TD fixed["Fixed-budget McAllester square-root corollary"] peeling["Finite-grid McAllester peeling
posterior-dependent penalty"] optimized["Optimized finite-grid wrapper"] - bernstein["Finite Bernstein margin-proxy shell
supplied variance proxy"] - future["Next: all-real-λ / continuous-posterior extensions
(open)"] - - kl --> mgf --> catoni --> payoff --> fixed --> peeling --> optimized - kl --> bernstein - bernstein -.-> future - optimized -.-> future - - classDef verified fill:#f0fdf4,stroke:#16a34a,color:#14532d; - classDef future fill:#f8fafc,stroke:#94a3b8,color:#475569,stroke-dasharray: 5 4; - class kl,mgf,catoni,payoff,fixed,peeling,optimized,bernstein verified; - class future future; + bernstein["Finite Bernstein margin-proxy shell
supplied variance proxy"] + margin["Iid classifier-margin MGF
fixed-lambda bad-event theorem"] + bernsteinGrid["Finite-grid Bernstein λ wrapper
posterior-dependent penalty"] + future["Next: all-real-λ / continuous-posterior extensions
(open)"] + + kl --> mgf --> catoni --> payoff --> fixed --> peeling --> optimized + kl --> bernstein --> margin --> bernsteinGrid + bernsteinGrid -.-> future + optimized -.-> future + + classDef verified fill:#f0fdf4,stroke:#16a34a,color:#14532d; + classDef future fill:#f8fafc,stroke:#94a3b8,color:#475569,stroke-dasharray: 5 4; + class kl,mgf,catoni,payoff,fixed,peeling,optimized,bernstein,margin,bernsteinGrid verified; + class future future; ``` ## Where each definition first appears diff --git a/docs/proof-frontier-manifest.json b/docs/proof-frontier-manifest.json index 0785093..a6e04e1 100644 --- a/docs/proof-frontier-manifest.json +++ b/docs/proof-frontier-manifest.json @@ -67,11 +67,11 @@ }, "schema": "FormalSLT.proof_frontier.v1", "source_counts": { - "example_lean_files": 25, - "library_and_example_lean_lines": 33427, - "library_lean_lines": 31533, - "library_modules": 60, - "theorem_map_entries": 412 + "example_lean_files": 26, + "library_and_example_lean_lines": 34536, + "library_lean_lines": 32558, + "library_modules": 61, + "theorem_map_entries": 426 }, "theorem_families": [ { @@ -2437,6 +2437,12 @@ "name": "finiteProduct_mgf_empiricalRiskDeviation_le_of_single", "summary": "Single-coordinate MGF budget lifts to the finite sample-average MGF" }, + { + "kind": "theorem", + "module": "PACBayesFiniteProductMGF", + "name": "finiteProduct_mgf_empiricalRiskDeviation_le_exp_of_single", + "summary": "Single-coordinate exponential budget lifts to an explicit finite product MGF budget" + }, { "kind": "theorem", "module": "PACBayesFiniteProductMGF", @@ -2527,6 +2533,12 @@ "name": "priorBernsteinExpMoment", "summary": "Normalized Bernstein prior exponential moment with variance and scale terms" }, + { + "kind": "theorem", + "module": "PACBayesBernstein", + "name": "expectedPriorBernsteinExpMoment_le_one_of_mgf_bound", + "summary": "Expected normalized prior moment is at most one when each hypothesis has the corresponding Bernstein MGF budget" + }, { "kind": "theorem", "module": "PACBayesBernstein", @@ -2550,6 +2562,78 @@ "module": "PACBayesBernstein", "name": "finitePACBayesBernsteinMargin_badEventMass_le_delta", "summary": "Finite supplied margin-proxy wrapper with sqrt(2 * V\u03c1 * C\u03c1) + scale * C\u03c1 penalty form" + }, + { + "kind": "theorem", + "module": "PACBayesMargin", + "name": "classifierMarginLoss", + "summary": "Thresholded finite classifier-margin loss in {0,1}" + }, + { + "kind": "theorem", + "module": "PACBayesMargin", + "name": "classifierMarginPopulationRisk", + "summary": "Concrete population risk for thresholded classifier-margin loss" + }, + { + "kind": "theorem", + "module": "PACBayesMargin", + "name": "classifierMarginEmpiricalRisk", + "summary": "Concrete empirical risk for thresholded classifier-margin loss" + }, + { + "kind": "theorem", + "module": "PACBayesMargin", + "name": "classifierMarginVarianceProxy", + "summary": "Concrete risk-as-variance proxy for indicator classifier-margin loss" + }, + { + "kind": "theorem", + "module": "PACBayesMargin", + "name": "classifierMarginSampleVarianceProxy", + "summary": "Sample-size-scaled variance proxy for iid classifier-margin empirical-risk deviations" + }, + { + "kind": "theorem", + "module": "PACBayesMargin", + "name": "classifierMarginVariance_le_risk", + "summary": "Indicator margin-loss centered second moment is bounded by its risk proxy" + }, + { + "kind": "theorem", + "module": "PACBayesMargin", + "name": "oneCoordinate_classifierMarginLoss_mgf_subgamma", + "summary": "One-coordinate finite Bennett/sub-Gamma MGF bound for thresholded classifier-margin loss" + }, + { + "kind": "theorem", + "module": "PACBayesMargin", + "name": "finiteProduct_classifierMarginLoss_mgf_subgamma", + "summary": "Iid product MGF bound for classifier-margin empirical-risk deviations" + }, + { + "kind": "theorem", + "module": "PACBayesMargin", + "name": "expectedPriorBernsteinExpMoment_classifierMargin_iid_le_one", + "summary": "Iid classifier-margin losses discharge the normalized Bernstein prior-moment certificate" + }, + { + "kind": "theorem", + "module": "PACBayesMargin", + "name": "finiteProductSampleWeight_isPMF", + "summary": "Iid finite product sample weights form a PMF when the base mass is a PMF" + }, + { + "kind": "theorem", + "module": "PACBayesMargin", + "name": "finitePACBayesClassifierMarginBernstein_badEventMass_le_delta", + "summary": "PAC-Bayes Bernstein bad-event theorem specialized to finite classifier-margin losses under the supplied prior-moment certificate" + }, + { + "kind": "theorem", + "module": "PACBayesMargin", + "name": "finitePACBayesClassifierMarginBernstein_iid_badEventMass_le_delta", + "summary": "End-to-end fixed-lambda iid PAC-Bayes Bernstein bad-event theorem for finite classifier-margin losses" } ], "name": "Stability and PAC-Bayes foundations" diff --git a/docs/roadmap.md b/docs/roadmap.md index f1dc628..4a0616f 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -231,6 +231,9 @@ proxy.~~ ✓ (`PACBayesBernstein.posteriorMarginVarianceProxy`) - ~~Step 2: define the normalized Bernstein prior exponential moment with variance and scale terms.~~ ✓ (`PACBayesBernstein.priorBernsteinExpMoment`) + - ~~Step 2.5: lift per-hypothesis Bernstein MGF budgets to the expected + normalized prior moment.~~ ✓ + (`PACBayesBernstein.expectedPriorBernsteinExpMoment_le_one_of_mgf_bound`) - ~~Step 3: prove the deterministic fixed-sample PAC-Bayes Bernstein adapter from a prior-moment certificate.~~ ✓ (`PACBayesBernstein.posteriorGeneralizationGap_le_bernstein_of_priorBernsteinExpMoment_le`) @@ -238,10 +241,19 @@ posterior-dependent margin-style penalties under explicit complexity and penalty certificates.~~ ✓ (`PACBayesBernstein.finitePACBayesBernsteinMargin_badEventMass_le_delta`) - - Remaining PAC-Bayes Bernstein extensions: concrete classifier-margin - variance extractors, finite-grid `lambda` optimization, exact all-real - `lambda` optimization, continuous posteriors, and infinite hypothesis - classes. + - ~~Step 5: specialize the finite Bernstein shell to concrete thresholded + classifier-margin losses and the risk-as-variance proxy.~~ ✓ + (`PACBayesMargin.finitePACBayesClassifierMarginBernstein_badEventMass_le_delta`) + - ~~Step 6: derive the normalized Bernstein prior-moment certificate from iid + finite classifier-margin losses and package the fixed-`lambda` iid + bad-event theorem.~~ ✓ + (`PACBayesMargin.finitePACBayesClassifierMarginBernstein_iid_badEventMass_le_delta`) + - ~~Step 7: add finite-grid `lambda` optimization wrappers for Bernstein + posterior-dependent penalties.~~ ✓ + (`PACBayesBernstein.finitePACBayesBernsteinGridOptimized_badEventMass_le_delta`, + `PACBayesMargin.finitePACBayesClassifierMarginBernstein_iid_gridOptimized_badEventMass_le_delta`) + - Remaining PAC-Bayes Bernstein extensions: exact all-real `lambda` + optimization, continuous posteriors, and infinite hypothesis classes. ### Long-term diff --git a/docs/theorem-chain.svg b/docs/theorem-chain.svg index 2c97431..97d4a03 100644 --- a/docs/theorem-chain.svg +++ b/docs/theorem-chain.svg @@ -1,6 +1,6 @@ - + FormalSLT theorem chain - A dependency diagram for the finite statistical learning theory spine in Lean. + A dependency diagram for the finite statistical learning theory spine in Lean, with the recent conditional sub-gamma concentration toolkit, iid PAC-Bayes Bernstein classifier-margin theorem, and total-bounded Dudley boundary adapters. @@ -8,6 +8,9 @@ + + + - + FormalSLT theorem chain - - 60 FormalSLT Lean files · 85 checked Lean files · zero sorry · zero admit · axioms: propext, Classical.choice, Quot.sound - + +61 Lean modules · 32,558 lines · zero sorry · zero admit · axioms: propext, Classical.choice, Quot.sound + Core finite Rademacher and VC spine @@ -56,8 +60,8 @@ - Sharp McDiarmid genGap tail - exp(-ε²n / 2B²) + Azuma tail + exp(-ε²n / 8B²) Massart finite-class @@ -117,8 +121,8 @@ dyadic finite entropy wrappers - PAC-Bayes finite confidence - Catoni, McAllester, Bernstein + PAC-Bayes finite layers + Catoni, McAllester, iid Bernstein @@ -126,32 +130,40 @@ - Finite chaining is a finite entropy-budget layer. Continuous Dudley is a separate theorem target. - - Sharp McDiarmid bounded-differences (homogeneous product measure) - - - One-sided sharp tail - exp(-2ε² / Σ cₖ²) - bounded differences, product μ - - - Lower-tail wrapper - apply to -f, same widths - exp(-2ε² / Σ cₖ²) - - - Two-sided wrapper - 2 · exp(-2ε² / Σ cₖ²) - union of upper and lower - - - - - PAC-Bayes now includes a finite Bernstein margin-proxy shell with supplied variance proxy. + + + Recent additions — conditional concentration toolkit and Dudley boundary lane + + + Sub-gamma MGF extractor + conditional, from bounded + + cond. variance, on b·λ < 3 + + + Total-bounded finite-net + dyadic finite-net schedules, + projected-sup wrappers + + + PAC-Bayes finite confidence + finite-grid peeling plus + iid Bernstein margin theorem + + + Dudley boundary adapters + separable-terminal · supplied-sup + · pathwise-modulus · ε-elimination + + + + + + + Recent lane (since 2026-05-09): conditional sub-gamma toolkit, iid PAC-Bayes Bernstein classifier-margin theorem, and total-bounded Dudley boundary adapters, all verified and axiom-clean. + Sharp McDiarmid now includes the iid product-kernel route, lower tail, and two-sided bounded-differences theorem. diff --git a/docs/theorem-map.md b/docs/theorem-map.md index 9129dfe..a7a6693 100644 --- a/docs/theorem-map.md +++ b/docs/theorem-map.md @@ -442,6 +442,7 @@ declarations; modules are relative to `FormalSLT`. | `finiteEmpiricalRisk` | `PACBayesFiniteProductMGF` | Finite empirical risk for a real-valued loss | | `finiteProduct_mgf_empiricalRiskDeviation_eq_pow` | `PACBayesFiniteProductMGF` | Exact iid product factorization of `E exp(lam * (R_i - Rhat_i))` | | `finiteProduct_mgf_empiricalRiskDeviation_le_of_single` | `PACBayesFiniteProductMGF` | Single-coordinate MGF budget lifts to the finite sample-average MGF | +| `finiteProduct_mgf_empiricalRiskDeviation_le_exp_of_single` | `PACBayesFiniteProductMGF` | Single-coordinate exponential budget lifts to an explicit finite product MGF budget | | `finitePriorAveraged_mgf_empiricalRiskDeviation_le` | `PACBayesFiniteProductMGF` | Prior-averaged finite iid empirical-risk-deviation MGF bound | | `oneCoordinate_boundedLoss_mgf` | `PACBayesBoundedLoss` | `[0,1]` bounded-loss one-coordinate MGF instantiation | | `sampleAverage_boundedLoss_mgf` | `PACBayesBoundedLoss` | Finite sample-average bounded-loss MGF bound | @@ -457,10 +458,23 @@ declarations; modules are relative to `FormalSLT`. | `pac_bayes_generalization` | `PACBayesBoundedLoss` | Closed PAC-Bayes good-event theorem: with product-sample mass at least `1 - delta`, every posterior satisfies the Catoni-form risk bound | | `posteriorMarginVarianceProxy` | `PACBayesBernstein` | Posterior average of a supplied per-hypothesis margin-variance proxy | | `priorBernsteinExpMoment` | `PACBayesBernstein` | Normalized Bernstein prior exponential moment with variance and scale terms | +| `expectedPriorBernsteinExpMoment_le_one_of_mgf_bound` | `PACBayesBernstein` | Expected normalized prior moment is at most one when each hypothesis has the corresponding Bernstein MGF budget | | `posteriorGeneralizationGap_le_bernstein_of_priorBernsteinExpMoment_le` | `PACBayesBernstein` | Deterministic fixed-sample PAC-Bayes Bernstein adapter from a prior-moment certificate | | `finitePACBayesBernstein_fixedLambda_badEventMass_le_delta` | `PACBayesBernstein` | Finite fixed-`lambda` PAC-Bayes Bernstein bad-event bound | | `finitePACBayesBernsteinPenalty_badEventMass_le_delta` | `PACBayesBernstein` | Posterior-dependent finite Bernstein bad-event wrapper under complexity and penalty certificates | | `finitePACBayesBernsteinMargin_badEventMass_le_delta` | `PACBayesBernstein` | Finite supplied margin-proxy wrapper with `sqrt(2 * Vρ * Cρ) + scale * Cρ` penalty form | +| `classifierMarginLoss` | `PACBayesMargin` | Thresholded finite classifier-margin loss in `{0,1}` | +| `classifierMarginPopulationRisk` | `PACBayesMargin` | Concrete population risk for thresholded classifier-margin loss | +| `classifierMarginEmpiricalRisk` | `PACBayesMargin` | Concrete empirical risk for thresholded classifier-margin loss | +| `classifierMarginVarianceProxy` | `PACBayesMargin` | Concrete risk-as-variance proxy for indicator classifier-margin loss | +| `classifierMarginSampleVarianceProxy` | `PACBayesMargin` | Sample-size-scaled variance proxy for iid classifier-margin empirical-risk deviations | +| `classifierMarginVariance_le_risk` | `PACBayesMargin` | Indicator margin-loss centered second moment is bounded by its risk proxy | +| `oneCoordinate_classifierMarginLoss_mgf_subgamma` | `PACBayesMargin` | One-coordinate finite Bennett/sub-Gamma MGF bound for thresholded classifier-margin loss | +| `finiteProduct_classifierMarginLoss_mgf_subgamma` | `PACBayesMargin` | Iid product MGF bound for classifier-margin empirical-risk deviations | +| `expectedPriorBernsteinExpMoment_classifierMargin_iid_le_one` | `PACBayesMargin` | Iid classifier-margin losses discharge the normalized Bernstein prior-moment certificate | +| `finiteProductSampleWeight_isPMF` | `PACBayesMargin` | Iid finite product sample weights form a PMF when the base mass is a PMF | +| `finitePACBayesClassifierMarginBernstein_badEventMass_le_delta` | `PACBayesMargin` | PAC-Bayes Bernstein bad-event theorem specialized to finite classifier-margin losses under the supplied prior-moment certificate | +| `finitePACBayesClassifierMarginBernstein_iid_badEventMass_le_delta` | `PACBayesMargin` | End-to-end fixed-`lambda` iid PAC-Bayes Bernstein bad-event theorem for finite classifier-margin losses | ## Conditional sub-Gamma extractor diff --git a/examples/CheckPACBayesBernstein.lean b/examples/CheckPACBayesBernstein.lean index a2ab608..4bc8a8a 100644 --- a/examples/CheckPACBayesBernstein.lean +++ b/examples/CheckPACBayesBernstein.lean @@ -24,3 +24,13 @@ posterior-dependent margin-style wrapper. #check FormalSLT.PACBayesBernstein.finitePACBayesBernsteinMargin_badEventMass_le_delta #print axioms FormalSLT.PACBayesBernstein.finitePACBayesBernsteinMargin_badEventMass_le_delta + +#check FormalSLT.PACBayesBernstein.finitePACBayesBernsteinGridBadSamples +#check FormalSLT.PACBayesBernstein.finitePACBayesBernsteinGrid_badEventMass_le_sum_delta +#print axioms FormalSLT.PACBayesBernstein.finitePACBayesBernsteinGrid_badEventMass_le_sum_delta +#check FormalSLT.PACBayesBernstein.finitePACBayesBernsteinGrid_badEventMass_le_delta +#print axioms FormalSLT.PACBayesBernstein.finitePACBayesBernsteinGrid_badEventMass_le_delta +#check FormalSLT.PACBayesBernstein.finitePACBayesBernsteinGridOptimized_badEventMass_le_sum_delta +#print axioms FormalSLT.PACBayesBernstein.finitePACBayesBernsteinGridOptimized_badEventMass_le_sum_delta +#check FormalSLT.PACBayesBernstein.finitePACBayesBernsteinGridOptimized_badEventMass_le_delta +#print axioms FormalSLT.PACBayesBernstein.finitePACBayesBernsteinGridOptimized_badEventMass_le_delta diff --git a/examples/CheckPACBayesMargin.lean b/examples/CheckPACBayesMargin.lean new file mode 100644 index 0000000..7c35583 --- /dev/null +++ b/examples/CheckPACBayesMargin.lean @@ -0,0 +1,74 @@ +import FormalSLT.PACBayesMargin + +/-! +# PAC-Bayes classifier-margin adapter audit + +Checks the concrete classifier-margin surface that specializes the finite +PAC-Bayes Bernstein shell. +-/ + +namespace FormalSLT.PACBayesMargin.Check + +inductive ToyHyp + | left + | right + deriving DecidableEq, Fintype + +instance : Nonempty ToyHyp := ⟨ToyHyp.left⟩ + +inductive ToyPoint + | neg + | pos + deriving DecidableEq, Fintype + +def toyLabel : ToyPoint → Bool + | ToyPoint.neg => false + | ToyPoint.pos => true + +def toyScore : ToyHyp → ToyPoint → ℝ + | ToyHyp.left, ToyPoint.neg => -1 + | ToyHyp.left, ToyPoint.pos => 1 + | ToyHyp.right, ToyPoint.neg => 1 + | ToyHyp.right, ToyPoint.pos => -1 + +#check FormalSLT.PACBayesMargin.signedBool +#check FormalSLT.PACBayesMargin.classifierMargin +#check FormalSLT.PACBayesMargin.classifierMarginLoss +#check FormalSLT.PACBayesMargin.classifierMarginLoss_mem_Icc_zero_one +#check FormalSLT.PACBayesMargin.classifierMarginPopulationRisk +#check FormalSLT.PACBayesMargin.classifierMarginEmpiricalRisk +#check FormalSLT.PACBayesMargin.classifierMarginEmpiricalRiskFn +#check FormalSLT.PACBayesMargin.classifierMarginVarianceProxy +#check FormalSLT.PACBayesMargin.classifierMarginSampleVarianceProxy +#check FormalSLT.PACBayesMargin.posteriorClassifierMarginVarianceProxy +#check FormalSLT.PACBayesMargin.posteriorClassifierMarginSampleVarianceProxy +#check FormalSLT.PACBayesMargin.classifierMarginPopulationRisk_mem_Icc_zero_one +#check FormalSLT.PACBayesMargin.classifierMarginLoss_sq +#check FormalSLT.PACBayesMargin.classifierMarginVariance_le_risk +#check FormalSLT.PACBayesMargin.oneCoordinate_classifierMarginLoss_mgf_subgamma +#check FormalSLT.PACBayesMargin.finiteProduct_classifierMarginLoss_mgf_subgamma +#check FormalSLT.PACBayesMargin.expectedPriorBernsteinExpMoment_classifierMargin_iid_le_one +#check FormalSLT.PACBayesMargin.finiteProductSampleWeight_isPMF +#check FormalSLT.PACBayesMargin.finitePACBayesClassifierMarginBernsteinBadSamples + +#check (FormalSLT.PACBayesMargin.classifierMarginLoss + (ι := ToyHyp) (Z := ToyPoint) 0 toyScore toyLabel ToyHyp.left ToyPoint.pos) + +#check (FormalSLT.PACBayesMargin.finitePACBayesClassifierMarginBernstein_badEventMass_le_delta + (n := 2) (Z := ToyPoint) (ι := ToyHyp)) +#print axioms FormalSLT.PACBayesMargin.finitePACBayesClassifierMarginBernstein_badEventMass_le_delta + +#check (FormalSLT.PACBayesMargin.finitePACBayesClassifierMarginBernstein_iid_badEventMass_le_delta + (n := 2) (Z := ToyPoint) (ι := ToyHyp)) +#print axioms FormalSLT.PACBayesMargin.expectedPriorBernsteinExpMoment_classifierMargin_iid_le_one +#print axioms FormalSLT.PACBayesMargin.finitePACBayesClassifierMarginBernstein_iid_badEventMass_le_delta + +#check (FormalSLT.PACBayesMargin.finitePACBayesClassifierMarginBernstein_iid_gridOptimized_badEventMass_le_sum_delta + (n := 2) (Z := ToyPoint) (ι := ToyHyp) (γ := Fin 2)) +#print axioms FormalSLT.PACBayesMargin.finitePACBayesClassifierMarginBernstein_iid_gridOptimized_badEventMass_le_sum_delta + +#check (FormalSLT.PACBayesMargin.finitePACBayesClassifierMarginBernstein_iid_gridOptimized_badEventMass_le_delta + (n := 2) (Z := ToyPoint) (ι := ToyHyp) (γ := Fin 2)) +#print axioms FormalSLT.PACBayesMargin.finitePACBayesClassifierMarginBernstein_iid_gridOptimized_badEventMass_le_delta + +end FormalSLT.PACBayesMargin.Check diff --git a/scripts/repo-stats.sh b/scripts/repo-stats.sh new file mode 100755 index 0000000..f777c1a --- /dev/null +++ b/scripts/repo-stats.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash +# +# repo-stats.sh — single source of truth for the repo size figures the README, +# the theorem-chain SVG, and any reviewer quote against the GitHub page. +# +# It recomputes the Lean module count and total Lean line count from disk using +# exactly the commands a skeptical reader would run, writes them to +# `.stats.json`, and stamps them into the marker-delimited STATS regions of +# `README.md` and `docs/theorem-chain.svg` so the numbers cannot silently drift. +# +# Usage: +# scripts/repo-stats.sh # recompute, write .stats.json, stamp files +# scripts/repo-stats.sh --check # recompute and FAIL (exit 1) if anything is +# # stale; writes nothing. Use this in CI / the +# # pre-PR checklist to catch drift. +# +# The canonical count commands (kept identical to the audit instructions): +# modules: find FormalSLT -name "*.lean" | wc -l +# lines: find FormalSLT -name "*.lean" -exec cat {} \; | wc -l +set -euo pipefail + +# Resolve repo root from this script's location so it works from any cwd. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +cd "${ROOT}" + +CHECK=0 +if [[ "${1:-}" == "--check" ]]; then + CHECK=1 +fi + +MODULES="$(find FormalSLT -name '*.lean' | wc -l | tr -d ' ')" +LINES="$(find FormalSLT -name '*.lean' -exec cat {} \; | wc -l | tr -d ' ')" + +README="README.md" +SVG="docs/theorem-chain.svg" +STATS_JSON=".stats.json" + +MODULES="${MODULES}" LINES="${LINES}" \ +README="${README}" SVG="${SVG}" STATS_JSON="${STATS_JSON}" CHECK="${CHECK}" \ +python3 - <<'PY' +import os, re, sys, json + +modules = os.environ["MODULES"] +lines = os.environ["LINES"] +check = os.environ["CHECK"] == "1" +readme = os.environ["README"] +svg = os.environ["SVG"] +stats_json = os.environ["STATS_JSON"] + +# Thousands-separated line count for prose (e.g. 20789 -> 20,789). +lines_human = f"{int(lines):,}" + +# The exact strings stamped between the STATS markers in each file. +README_STATS = f"**{modules} Lean modules. {lines_human} lines of Lean. Zero `sorry`. Zero `admit`. Zero custom axioms.**" +SVG_STATS = f"{modules} Lean modules · {lines_human} lines · zero sorry · zero admit · axioms: propext, Classical.choice, Quot.sound" + +drift = [] + +def restamp(path, replacement): + """Replace the text between STATS:BEGIN and STATS:END markers.""" + with open(path, encoding="utf-8") as fh: + text = fh.read() + pat = re.compile(r"()(.*?)()", re.DOTALL) + m = pat.search(text) + if not m: + sys.stderr.write(f"error: STATS markers not found in {path}\n") + sys.exit(2) + current = m.group(2).strip() + want = replacement.strip() + if current != want: + drift.append(f"{path}: stale STATS region\n have: {current}\n want: {want}") + new = text[:m.start(2)] + "\n" + replacement + "\n" + text[m.end(2):] + if not check and new != text: + with open(path, "w", encoding="utf-8") as fh: + fh.write(new) + +restamp(readme, README_STATS) +restamp(svg, SVG_STATS) + +# .stats.json is fully regenerated; compare semantically for --check. +payload = { + "lean_modules": int(modules), + "lean_lines": int(lines), + "_note": "Generated by scripts/repo-stats.sh. Do not edit by hand.", +} +existing = None +if os.path.exists(stats_json): + try: + with open(stats_json, encoding="utf-8") as fh: + existing = json.load(fh) + except Exception: + existing = None +if existing != payload: + if existing is None: + drift.append(f"{stats_json}: missing or unreadable") + else: + drift.append(f"{stats_json}: stale (have {existing.get('lean_modules')}/{existing.get('lean_lines')}, want {modules}/{lines})") + if not check: + with open(stats_json, "w", encoding="utf-8") as fh: + json.dump(payload, fh, indent=2) + fh.write("\n") + +if check: + if drift: + sys.stderr.write("repo-stats: DRIFT DETECTED — run scripts/repo-stats.sh to fix:\n") + for d in drift: + sys.stderr.write(" - " + d + "\n") + sys.exit(1) + print(f"repo-stats: in sync ({modules} modules, {lines} lines)") +else: + print(f"repo-stats: stamped {modules} modules, {lines} lines into {readme}, {svg}, {stats_json}") +PY