diff --git a/.stats.json b/.stats.json
new file mode 100644
index 0000000..81c65d5
--- /dev/null
+++ b/.stats.json
@@ -0,0 +1,5 @@
+{
+ "lean_modules": 61,
+ "lean_lines": 32558,
+ "_note": "Generated by scripts/repo-stats.sh. Do not edit by hand."
+}
diff --git a/FormalSLT.lean b/FormalSLT.lean
index 6e71de2..36bb137 100644
--- a/FormalSLT.lean
+++ b/FormalSLT.lean
@@ -67,3 +67,4 @@ import FormalSLT.PACBayesFiniteProductMGF
import FormalSLT.PACBayesBoundedLoss
import FormalSLT.PACBayesMcAllester
import FormalSLT.PACBayesBernstein
+import FormalSLT.PACBayesMargin
diff --git a/FormalSLT/PACBayesBernstein.lean b/FormalSLT/PACBayesBernstein.lean
index 2f9ffec..d074f28 100644
--- a/FormalSLT/PACBayesBernstein.lean
+++ b/FormalSLT/PACBayesBernstein.lean
@@ -4,6 +4,7 @@ Released under MIT license as described in the file LICENSE.
Authors: Robby Sneiderman
-/
import FormalSLT.PACBayesKL
+import FormalSLT.Probability.FiniteUnionBound
import Mathlib.Algebra.BigOperators.Field
import Mathlib.Algebra.Order.BigOperators.Group.Finset
import Mathlib.Analysis.SpecialFunctions.Sqrt
@@ -37,6 +38,8 @@ classifier margin loss.
Markov/confidence layer for a fixed Bernstein parameter.
* `finitePACBayesBernsteinMargin_badEventMass_le_delta` — posterior-dependent
margin-style wrapper under explicit complexity and penalty certificates.
+* `finitePACBayesBernsteinGridOptimized_badEventMass_le_delta` — finite-grid
+ optimization wrapper over supplied Bernstein parameter buckets.
## Current boundaries
@@ -94,6 +97,114 @@ def expectedPriorBernsteinExpMoment [Fintype Ω] [Fintype ι]
∑ ω, ν ω *
priorBernsteinExpMoment π lambda scale riskFn empiricalRiskFn varianceProxy ω
+/--
+Expected normalized PAC-Bayes Bernstein prior moment from per-hypothesis MGF
+budgets.
+
+This is the algebraic bridge used before plugging in a concrete iid
+concentration lemma: each hypothesis supplies the Bernstein exponential budget,
+then the prior-weighted normalized moment has expectation at most one.
+-/
+theorem expectedPriorBernsteinExpMoment_le_one_of_mgf_bound
+ [Fintype Ω] [Fintype ι]
+ {π : ι → ℝ} (hπ : IsPMF π)
+ (ν : Ω → ℝ) (lambda scale : ℝ)
+ (riskFn : ι → ℝ) (empiricalRiskFn : Ω → ι → ℝ)
+ (varianceProxy : ι → ℝ)
+ (hmgf :
+ ∀ i : ι,
+ (∑ ω : Ω, ν ω *
+ Real.exp (lambda * (riskFn i - empiricalRiskFn ω i))) ≤
+ Real.exp
+ (lambda ^ 2 * varianceProxy i /
+ (2 * (1 - scale * lambda)))) :
+ expectedPriorBernsteinExpMoment ν π lambda scale riskFn empiricalRiskFn
+ varianceProxy ≤ 1 := by
+ classical
+ let budget : ι → ℝ :=
+ fun i =>
+ lambda ^ 2 * varianceProxy i /
+ (2 * (1 - scale * lambda))
+ have hexp_split : ∀ a b : ℝ, Real.exp (a - b) = Real.exp a * Real.exp (-b) := by
+ intro a b
+ rw [← Real.exp_add]
+ ring_nf
+ have hswap :
+ expectedPriorBernsteinExpMoment ν π lambda scale riskFn empiricalRiskFn
+ varianceProxy =
+ ∑ i : ι,
+ π i * Real.exp (-(budget i)) *
+ (∑ ω : Ω, ν ω *
+ Real.exp (lambda * (riskFn i - empiricalRiskFn ω i))) := by
+ unfold expectedPriorBernsteinExpMoment priorBernsteinExpMoment
+ calc
+ (∑ ω : Ω,
+ ν ω *
+ ∑ i : ι,
+ π i *
+ Real.exp
+ (lambda * (riskFn i - empiricalRiskFn ω i) -
+ lambda ^ 2 * varianceProxy i /
+ (2 * (1 - scale * lambda))))
+ =
+ ∑ ω : Ω, ∑ i : ι,
+ ν ω *
+ (π i *
+ Real.exp
+ (lambda * (riskFn i - empiricalRiskFn ω i) -
+ budget i)) := by
+ apply Finset.sum_congr rfl
+ intro ω _hω
+ rw [Finset.mul_sum]
+ _ =
+ ∑ i : ι, ∑ ω : Ω,
+ ν ω *
+ (π i *
+ Real.exp
+ (lambda * (riskFn i - empiricalRiskFn ω i) -
+ budget i)) := by
+ rw [Finset.sum_comm]
+ _ =
+ ∑ i : ι,
+ π i * Real.exp (-(budget i)) *
+ (∑ ω : Ω, ν ω *
+ Real.exp (lambda * (riskFn i - empiricalRiskFn ω i))) := by
+ apply Finset.sum_congr rfl
+ intro i _hi
+ rw [Finset.mul_sum]
+ apply Finset.sum_congr rfl
+ intro ω _hω
+ rw [hexp_split]
+ ring
+ rw [hswap]
+ calc
+ (∑ i : ι,
+ π i * Real.exp (-(budget i)) *
+ (∑ ω : Ω, ν ω *
+ Real.exp (lambda * (riskFn i - empiricalRiskFn ω i))))
+ ≤
+ ∑ i : ι,
+ π i * Real.exp (-(budget i)) *
+ Real.exp (budget i) := by
+ apply Finset.sum_le_sum
+ intro i _hi
+ exact mul_le_mul_of_nonneg_left
+ (hmgf i)
+ (mul_nonneg (hπ.nonneg i) (le_of_lt (Real.exp_pos _)))
+ _ = ∑ i : ι, π i := by
+ apply Finset.sum_congr rfl
+ intro i _hi
+ calc
+ π i * Real.exp (-(budget i)) * Real.exp (budget i)
+ = π i * (Real.exp (-(budget i)) * Real.exp (budget i)) := by ring
+ _ = π i * 1 := by
+ congr 1
+ rw [← Real.exp_add]
+ ring_nf
+ simp
+ _ = π i := by ring
+ _ = 1 := hπ.sum_one
+
/-- Finite sample mass of outcomes whose Bernstein prior moment exceeds a threshold. -/
def priorBernsteinExpMomentTailMass [Fintype Ω] [Fintype ι]
(ν : Ω → ℝ) (π : ι → ℝ)
@@ -556,6 +667,238 @@ theorem finitePACBayesBernsteinMargin_badEventMass_le_delta
scale * complexityOf ρ)
hcomplexity hpenalty hExpected
+/-! ### Finite-grid peeling over Bernstein parameters -/
+
+/--
+Samples where some posterior violates at least one finite Bernstein parameter
+bucket.
+
+Each grid index supplies a parameter `lambdaOf g`, a scale `scaleOf g`, and a
+confidence allocation `confidenceOf g`. This is a finite-grid statement, not an
+optimization over all real parameters.
+-/
+def finitePACBayesBernsteinGridBadSamples
+ [Fintype Ω] [Fintype ι] [Fintype γ]
+ (π : ι → ℝ) (lambdaOf scaleOf confidenceOf : γ → ℝ)
+ (riskFn : ι → ℝ) (empiricalRiskFn : Ω → ι → ℝ)
+ (varianceProxy : ι → ℝ) : Finset Ω :=
+ Finset.univ.filter fun ω =>
+ ∃ g : γ, ∃ ρ : ι → ℝ,
+ IsPMF ρ ∧
+ posteriorGeneralizationGap ρ riskFn (empiricalRiskFn ω) >
+ (klDiv ρ π + Real.log (1 / confidenceOf g)) / lambdaOf g +
+ lambdaOf g * posteriorMarginVarianceProxy ρ varianceProxy /
+ (2 * (1 - scaleOf g * lambdaOf g))
+
+/--
+Finite-grid Bernstein bad-event bound.
+
+For each finite grid bucket `g`, the fixed-`lambda` Bernstein theorem is applied
+with its own confidence allocation. A finite weighted union bound controls the
+mass of samples on which any bucket fails.
+-/
+theorem finitePACBayesBernsteinGrid_badEventMass_le_sum_delta
+ [Fintype Ω] [DecidableEq Ω] [Fintype ι] [Nonempty ι] [Fintype γ]
+ {ν : Ω → ℝ} (hν : IsPMF ν)
+ {π : ι → ℝ} (hπ : IsFullSupportPMF π)
+ (lambdaOf scaleOf confidenceOf : γ → ℝ)
+ (hlambda : ∀ g : γ, 0 < lambdaOf g)
+ (hscale : ∀ g : γ, scaleOf g * lambdaOf g < 1)
+ (hconfidenceOf : ∀ g : γ, 0 < confidenceOf g)
+ (riskFn : ι → ℝ) (empiricalRiskFn : Ω → ι → ℝ)
+ (varianceProxy : ι → ℝ)
+ (hExpected :
+ ∀ g : γ,
+ expectedPriorBernsteinExpMoment ν π (lambdaOf g) (scaleOf g)
+ riskFn empiricalRiskFn varianceProxy ≤ 1) :
+ (∑ ω ∈
+ finitePACBayesBernsteinGridBadSamples π lambdaOf scaleOf confidenceOf
+ riskFn empiricalRiskFn varianceProxy,
+ ν ω) ≤
+ ∑ g : γ, confidenceOf g := by
+ classical
+ let bucketEvent : γ → Finset Ω := fun g =>
+ finitePACBayesBernsteinFixedLambdaBadSamples π (lambdaOf g) (scaleOf g)
+ (confidenceOf g) riskFn empiricalRiskFn varianceProxy
+ let gridUnionEvent : Finset Ω :=
+ Finset.univ.filter fun ω : Ω => ∃ g : γ, ω ∈ bucketEvent g
+ have hsubset :
+ finitePACBayesBernsteinGridBadSamples π lambdaOf scaleOf confidenceOf
+ riskFn empiricalRiskFn varianceProxy
+ ⊆ gridUnionEvent := by
+ intro ω hω
+ rw [finitePACBayesBernsteinGridBadSamples, Finset.mem_filter] at hω
+ rcases hω.2 with ⟨g, ρ, hρ, hbad⟩
+ change ω ∈ (Finset.univ.filter fun ω : Ω => ∃ g : γ, ω ∈ bucketEvent g)
+ rw [Finset.mem_filter]
+ refine ⟨Finset.mem_univ ω, g, ?_⟩
+ change ω ∈ finitePACBayesBernsteinFixedLambdaBadSamples π (lambdaOf g) (scaleOf g)
+ (confidenceOf g) riskFn empiricalRiskFn varianceProxy
+ rw [finitePACBayesBernsteinFixedLambdaBadSamples, Finset.mem_filter]
+ exact ⟨Finset.mem_univ ω, ρ, hρ, hbad⟩
+ have hmass_subset :
+ (∑ ω ∈
+ finitePACBayesBernsteinGridBadSamples π lambdaOf scaleOf confidenceOf
+ riskFn empiricalRiskFn varianceProxy,
+ ν ω)
+ ≤ ∑ ω ∈ gridUnionEvent, ν ω := by
+ exact Finset.sum_le_sum_of_subset_of_nonneg hsubset (by
+ intro ω _hω _hnot
+ exact hν.nonneg ω)
+ have hunion :=
+ FormalSLT.Probability.FiniteUnionBound.finiteProbabilityUnionBound_proof
+ (support := (Finset.univ : Finset Ω))
+ (w := ν)
+ (events := bucketEvent)
+ (s := (Finset.univ : Finset γ))
+ (fun ω : Ω => hν.nonneg ω)
+ have hunion_mass :
+ (∑ ω ∈ gridUnionEvent, ν ω) ≤
+ ∑ g : γ, ∑ ω ∈ bucketEvent g, ν ω := by
+ change
+ (∑ ω ∈ (Finset.univ.filter fun ω : Ω => ∃ g : γ, ω ∈ bucketEvent g), ν ω) ≤
+ ∑ g : γ, ∑ ω ∈ bucketEvent g, ν ω
+ unfold FormalSLT.Probability.FiniteUnionBound.finiteUnionEventMass at hunion
+ unfold FormalSLT.Probability.FiniteUnionBound.finiteEventMassSum at hunion
+ unfold FormalSLT.Probability.FiniteUnionBound.finiteEventMass at hunion
+ simp_rw [← Finset.sum_filter] at hunion
+ simpa using hunion
+ have hbucket :
+ ∀ g : γ, (∑ ω ∈ bucketEvent g, ν ω) ≤ confidenceOf g := by
+ intro g
+ simpa [bucketEvent] using
+ finitePACBayesBernstein_fixedLambda_badEventMass_le_delta
+ (ν := ν) hν hπ (lambdaOf g) (scaleOf g) (confidenceOf g)
+ (hlambda g) (hscale g) (hconfidenceOf g) riskFn empiricalRiskFn
+ varianceProxy (hExpected g)
+ have hbucket_sum :
+ (∑ g : γ, ∑ ω ∈ bucketEvent g, ν ω) ≤ ∑ g : γ, confidenceOf g := by
+ exact Finset.sum_le_sum (fun g _hg => hbucket g)
+ exact hmass_subset.trans (hunion_mass.trans hbucket_sum)
+
+/-- Finite-grid Bernstein bad-event bound with an explicit confidence budget. -/
+theorem finitePACBayesBernsteinGrid_badEventMass_le_delta
+ [Fintype Ω] [DecidableEq Ω] [Fintype ι] [Nonempty ι] [Fintype γ]
+ {ν : Ω → ℝ} (hν : IsPMF ν)
+ {π : ι → ℝ} (hπ : IsFullSupportPMF π)
+ (lambdaOf scaleOf confidenceOf : γ → ℝ)
+ (hlambda : ∀ g : γ, 0 < lambdaOf g)
+ (hscale : ∀ g : γ, scaleOf g * lambdaOf g < 1)
+ (hconfidenceOf : ∀ g : γ, 0 < confidenceOf g)
+ (riskFn : ι → ℝ) (empiricalRiskFn : Ω → ι → ℝ)
+ (varianceProxy : ι → ℝ)
+ (hExpected :
+ ∀ g : γ,
+ expectedPriorBernsteinExpMoment ν π (lambdaOf g) (scaleOf g)
+ riskFn empiricalRiskFn varianceProxy ≤ 1)
+ {delta : ℝ} (hgridConfidence : (∑ g : γ, confidenceOf g) ≤ delta) :
+ (∑ ω ∈
+ finitePACBayesBernsteinGridBadSamples π lambdaOf scaleOf confidenceOf
+ riskFn empiricalRiskFn varianceProxy,
+ ν ω) ≤ delta :=
+ (finitePACBayesBernsteinGrid_badEventMass_le_sum_delta
+ (Ω := Ω) (ι := ι) (γ := γ) hν hπ lambdaOf scaleOf confidenceOf
+ hlambda hscale hconfidenceOf riskFn empiricalRiskFn varianceProxy hExpected).trans
+ hgridConfidence
+
+/--
+Finite-grid optimized Bernstein bad-event bound.
+
+The user-supplied `posteriorPenalty` may depend on the posterior. The theorem
+requires a finite grid certificate: each posterior is assigned to a bucket whose
+fixed-parameter Bernstein penalty is no larger than the supplied penalty.
+-/
+theorem finitePACBayesBernsteinGridOptimized_badEventMass_le_sum_delta
+ [Fintype Ω] [DecidableEq Ω] [Fintype ι] [Nonempty ι] [Fintype γ]
+ {ν : Ω → ℝ} (hν : IsPMF ν)
+ {π : ι → ℝ} (hπ : IsFullSupportPMF π)
+ (lambdaOf scaleOf confidenceOf : γ → ℝ)
+ (posteriorPenalty : (ι → ℝ) → ℝ)
+ (hlambda : ∀ g : γ, 0 < lambdaOf g)
+ (hscale : ∀ g : γ, scaleOf g * lambdaOf g < 1)
+ (hconfidenceOf : ∀ g : γ, 0 < confidenceOf g)
+ (riskFn : ι → ℝ) (empiricalRiskFn : Ω → ι → ℝ)
+ (varianceProxy : ι → ℝ)
+ (hgridCovers :
+ ∀ ρ : ι → ℝ, IsPMF ρ →
+ ∃ g : γ,
+ (klDiv ρ π + Real.log (1 / confidenceOf g)) / lambdaOf g +
+ lambdaOf g * posteriorMarginVarianceProxy ρ varianceProxy /
+ (2 * (1 - scaleOf g * lambdaOf g))
+ ≤ posteriorPenalty ρ)
+ (hExpected :
+ ∀ g : γ,
+ expectedPriorBernsteinExpMoment ν π (lambdaOf g) (scaleOf g)
+ riskFn empiricalRiskFn varianceProxy ≤ 1) :
+ (∑ ω ∈
+ finitePACBayesBernsteinPenaltyBadSamples riskFn empiricalRiskFn posteriorPenalty,
+ ν ω) ≤
+ ∑ g : γ, confidenceOf g := by
+ classical
+ have hsubset :
+ finitePACBayesBernsteinPenaltyBadSamples riskFn empiricalRiskFn posteriorPenalty
+ ⊆
+ finitePACBayesBernsteinGridBadSamples π lambdaOf scaleOf confidenceOf
+ riskFn empiricalRiskFn varianceProxy := by
+ intro ω hω
+ rw [finitePACBayesBernsteinPenaltyBadSamples, Finset.mem_filter] at hω
+ rcases hω.2 with ⟨ρ, hρ, hbad⟩
+ rcases hgridCovers ρ hρ with ⟨g, hpenalty⟩
+ rw [finitePACBayesBernsteinGridBadSamples, Finset.mem_filter]
+ refine ⟨Finset.mem_univ ω, g, ρ, hρ, ?_⟩
+ linarith
+ have hmass_subset :
+ (∑ ω ∈
+ finitePACBayesBernsteinPenaltyBadSamples riskFn empiricalRiskFn posteriorPenalty,
+ ν ω)
+ ≤
+ ∑ ω ∈
+ finitePACBayesBernsteinGridBadSamples π lambdaOf scaleOf confidenceOf
+ riskFn empiricalRiskFn varianceProxy,
+ ν ω := by
+ exact Finset.sum_le_sum_of_subset_of_nonneg hsubset (by
+ intro ω _hω _hnot
+ exact hν.nonneg ω)
+ exact hmass_subset.trans
+ (finitePACBayesBernsteinGrid_badEventMass_le_sum_delta
+ (Ω := Ω) (ι := ι) (γ := γ) hν hπ lambdaOf scaleOf confidenceOf
+ hlambda hscale hconfidenceOf riskFn empiricalRiskFn varianceProxy hExpected)
+
+/--
+Finite-grid optimized Bernstein bad-event bound with an explicit total
+confidence budget.
+-/
+theorem finitePACBayesBernsteinGridOptimized_badEventMass_le_delta
+ [Fintype Ω] [DecidableEq Ω] [Fintype ι] [Nonempty ι] [Fintype γ]
+ {ν : Ω → ℝ} (hν : IsPMF ν)
+ {π : ι → ℝ} (hπ : IsFullSupportPMF π)
+ (lambdaOf scaleOf confidenceOf : γ → ℝ)
+ (posteriorPenalty : (ι → ℝ) → ℝ)
+ (hlambda : ∀ g : γ, 0 < lambdaOf g)
+ (hscale : ∀ g : γ, scaleOf g * lambdaOf g < 1)
+ (hconfidenceOf : ∀ g : γ, 0 < confidenceOf g)
+ (riskFn : ι → ℝ) (empiricalRiskFn : Ω → ι → ℝ)
+ (varianceProxy : ι → ℝ)
+ (hgridCovers :
+ ∀ ρ : ι → ℝ, IsPMF ρ →
+ ∃ g : γ,
+ (klDiv ρ π + Real.log (1 / confidenceOf g)) / lambdaOf g +
+ lambdaOf g * posteriorMarginVarianceProxy ρ varianceProxy /
+ (2 * (1 - scaleOf g * lambdaOf g))
+ ≤ posteriorPenalty ρ)
+ (hExpected :
+ ∀ g : γ,
+ expectedPriorBernsteinExpMoment ν π (lambdaOf g) (scaleOf g)
+ riskFn empiricalRiskFn varianceProxy ≤ 1)
+ {delta : ℝ} (hgridConfidence : (∑ g : γ, confidenceOf g) ≤ delta) :
+ (∑ ω ∈
+ finitePACBayesBernsteinPenaltyBadSamples riskFn empiricalRiskFn posteriorPenalty,
+ ν ω) ≤ delta :=
+ (finitePACBayesBernsteinGridOptimized_badEventMass_le_sum_delta
+ (Ω := Ω) (ι := ι) (γ := γ) hν hπ lambdaOf scaleOf confidenceOf
+ posteriorPenalty hlambda hscale hconfidenceOf riskFn empiricalRiskFn
+ varianceProxy hgridCovers hExpected).trans hgridConfidence
+
end
end FormalSLT.PACBayesBernstein
diff --git a/FormalSLT/PACBayesFiniteProductMGF.lean b/FormalSLT/PACBayesFiniteProductMGF.lean
index 9dfb491..6947403 100644
--- a/FormalSLT/PACBayesFiniteProductMGF.lean
+++ b/FormalSLT/PACBayesFiniteProductMGF.lean
@@ -163,6 +163,43 @@ theorem finiteProduct_mgf_empiricalRiskDeviation_le_of_single
field_simp [hn_ne]
exact h_pow.trans_eq h_exp_pow
+/--
+Finite product MGF bound from an arbitrary one-coordinate exponential budget.
+
+This is the same product-factorization step as
+`finiteProduct_mgf_empiricalRiskDeviation_le_of_single`, but keeps the
+one-coordinate budget explicit. It is useful for Bernstein/sub-Gamma bounds,
+whose denominator depends on the one-coordinate scale.
+-/
+theorem finiteProduct_mgf_empiricalRiskDeviation_le_exp_of_single
+ {n : ℕ} [Fintype Z] (hn : 0 < n)
+ (p : Z → ℝ) (hp : IsPMF p)
+ (ℓ : ι → Z → ℝ) (i : ι) (lam singleBudget : ℝ)
+ (hsingle :
+ oneCoordinateDeviationMGF (n := n) p ℓ i lam ≤
+ Real.exp singleBudget) :
+ (∑ S : Fin n → Z,
+ finiteProductSampleWeight p S *
+ Real.exp (lam * (finitePopulationRisk p ℓ i - finiteEmpiricalRisk ℓ i S))) ≤
+ Real.exp ((n : ℝ) * singleBudget) := by
+ classical
+ have h_eq :=
+ finiteProduct_mgf_empiricalRiskDeviation_eq_pow
+ (ι := ι) (Z := Z) hn p ℓ i lam
+ rw [h_eq]
+ have hsingle_nonneg : 0 ≤ oneCoordinateDeviationMGF (n := n) p ℓ i lam := by
+ unfold oneCoordinateDeviationMGF
+ exact Finset.sum_nonneg
+ (fun z _hz => mul_nonneg (hp.nonneg z) (le_of_lt (Real.exp_pos _)))
+ have h_pow :
+ (oneCoordinateDeviationMGF (n := n) p ℓ i lam) ^ n ≤
+ (Real.exp singleBudget) ^ n :=
+ pow_le_pow_left₀ hsingle_nonneg hsingle n
+ have h_exp_pow :
+ (Real.exp singleBudget) ^ n = Real.exp ((n : ℝ) * singleBudget) := by
+ rw [← Real.exp_nat_mul]
+ exact h_pow.trans_eq h_exp_pow
+
/--
Prior-averaged finite product MGF bound.
diff --git a/FormalSLT/PACBayesMargin.lean b/FormalSLT/PACBayesMargin.lean
new file mode 100644
index 0000000..27ab9af
--- /dev/null
+++ b/FormalSLT/PACBayesMargin.lean
@@ -0,0 +1,642 @@
+/-
+Copyright (c) 2026 Robby Sneiderman. All rights reserved.
+Released under MIT license as described in the file LICENSE.
+Authors: Robby Sneiderman
+-/
+import FormalSLT.PACBayesBernstein
+import FormalSLT.PACBayesFiniteProductMGF
+import FormalSLT.Probability.BernsteinMGF
+
+/-!
+# Finite PAC-Bayes classifier-margin adapter
+
+This module specializes the finite PAC-Bayes Bernstein shell to classifier
+margin losses on a finite data domain.
+
+The supplied-certificate adapter remains available for arbitrary normalized
+Bernstein prior-moment certificates. For iid finite classifier-margin losses,
+this file also derives that certificate internally from the finite product law
+and the Bennett/sub-Gamma MGF layer.
+
+## Main declarations
+
+* `classifierMarginLoss` — thresholded margin loss in `{0,1}`.
+* `classifierMarginPopulationRisk` and `classifierMarginEmpiricalRisk` —
+ concrete finite risk functions for margin loss.
+* `classifierMarginVarianceProxy` — the margin-risk proxy used by the
+ Bernstein shell.
+* `expectedPriorBernsteinExpMoment_classifierMargin_iid_le_one` — iid finite
+ classifier-margin losses discharge the normalized Bernstein prior-moment
+ certificate.
+* `finitePACBayesClassifierMarginBernstein_iid_badEventMass_le_delta` — the
+ end-to-end finite iid PAC-Bayes Bernstein theorem for classifier-margin
+ losses and a fixed admissible `lambda`.
+* `finitePACBayesClassifierMarginBernstein_iid_gridOptimized_badEventMass_le_delta` —
+ the finite-grid version, with a caller-supplied grid certificate over
+ admissible Bernstein parameters.
+-/
+
+namespace FormalSLT.PACBayesMargin
+
+open Finset Real BigOperators
+open FormalSLT.PACBayesKL
+open FormalSLT.PACBayesFiniteProductMGF
+open FormalSLT.PACBayesBernstein
+open FormalSLT.Probability.BernsteinMGF
+
+noncomputable section
+
+variable {ι Z : Type*}
+
+local instance (p : Prop) : Decidable p := Classical.propDecidable p
+
+/-! ### Classifier margin losses -/
+
+/-- Encode a Boolean label as `+1` for `true` and `-1` for `false`. -/
+def signedBool (y : Bool) : ℝ :=
+ if y then 1 else -1
+
+/-- Signed margin of hypothesis `i` on example `z`. -/
+def classifierMargin
+ (score : ι → Z → ℝ) (label : Z → Bool) (i : ι) (z : Z) : ℝ :=
+ signedBool (label z) * score i z
+
+/--
+Thresholded classifier margin loss.
+
+The loss is `1` when the signed margin is at most `threshold`, and `0`
+otherwise. For `threshold = 0`, this is the usual error-or-no-positive-margin
+indicator for real-valued scores with Boolean labels.
+-/
+def classifierMarginLoss
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool)
+ (i : ι) (z : Z) : ℝ :=
+ if classifierMargin score label i z ≤ threshold then 1 else 0
+
+/-- Classifier margin loss is always in `[0,1]`. -/
+lemma classifierMarginLoss_mem_Icc_zero_one
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool)
+ (i : ι) (z : Z) :
+ classifierMarginLoss threshold score label i z ∈ Set.Icc (0 : ℝ) 1 := by
+ unfold classifierMarginLoss
+ by_cases h : classifierMargin score label i z ≤ threshold
+ · simp [h]
+ · simp [h]
+
+/-- Finite population risk of a thresholded classifier margin loss. -/
+def classifierMarginPopulationRisk [Fintype Z]
+ (p : Z → ℝ) (threshold : ℝ)
+ (score : ι → Z → ℝ) (label : Z → Bool) (i : ι) : ℝ :=
+ finitePopulationRisk p (classifierMarginLoss threshold score label) i
+
+/-- Finite empirical risk of a thresholded classifier margin loss. -/
+def classifierMarginEmpiricalRisk {n : ℕ}
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool)
+ (i : ι) (S : Fin n → Z) : ℝ :=
+ finiteEmpiricalRisk (classifierMarginLoss threshold score label) i S
+
+/-- Sample-indexed empirical-risk function for the PAC-Bayes Bernstein shell. -/
+def classifierMarginEmpiricalRiskFn {n : ℕ}
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool)
+ (S : Fin n → Z) (i : ι) : ℝ :=
+ classifierMarginEmpiricalRisk threshold score label i S
+
+/--
+Concrete margin-risk variance proxy.
+
+For an indicator margin loss, the second moment equals the population risk, so
+this is the natural Bernstein variance proxy exposed to the PAC-Bayes layer.
+The equality is kept as a definition here; deriving the normalized prior MGF
+from this proxy is a separate concentration step.
+-/
+def classifierMarginVarianceProxy [Fintype Z]
+ (p : Z → ℝ) (threshold : ℝ)
+ (score : ι → Z → ℝ) (label : Z → Bool) (i : ι) : ℝ :=
+ classifierMarginPopulationRisk p threshold score label i
+
+/-- The margin variance proxy is definitionally the margin population risk. -/
+lemma classifierMarginVarianceProxy_eq_populationRisk [Fintype Z]
+ (p : Z → ℝ) (threshold : ℝ)
+ (score : ι → Z → ℝ) (label : Z → Bool) (i : ι) :
+ classifierMarginVarianceProxy p threshold score label i =
+ classifierMarginPopulationRisk p threshold score label i := rfl
+
+/-- Sample-average variance proxy for iid classifier-margin empirical risks. -/
+def classifierMarginSampleVarianceProxy {n : ℕ} [Fintype Z]
+ (p : Z → ℝ) (threshold : ℝ)
+ (score : ι → Z → ℝ) (label : Z → Bool) (i : ι) : ℝ :=
+ classifierMarginVarianceProxy p threshold score label i / (n : ℝ)
+
+/-- Posterior-averaged classifier-margin variance proxy. -/
+def posteriorClassifierMarginVarianceProxy [Fintype Z] [Fintype ι]
+ (ρ : ι → ℝ) (p : Z → ℝ) (threshold : ℝ)
+ (score : ι → Z → ℝ) (label : Z → Bool) : ℝ :=
+ posteriorMarginVarianceProxy ρ
+ (classifierMarginVarianceProxy p threshold score label)
+
+/-- Posterior-averaged iid sample classifier-margin variance proxy. -/
+def posteriorClassifierMarginSampleVarianceProxy {n : ℕ} [Fintype Z] [Fintype ι]
+ (ρ : ι → ℝ) (p : Z → ℝ) (threshold : ℝ)
+ (score : ι → Z → ℝ) (label : Z → Bool) : ℝ :=
+ posteriorMarginVarianceProxy ρ
+ (classifierMarginSampleVarianceProxy (n := n) p threshold score label)
+
+lemma classifierMarginPopulationRisk_mem_Icc_zero_one [Fintype Z]
+ (p : Z → ℝ) (hp : IsPMF p)
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) (i : ι) :
+ classifierMarginPopulationRisk p threshold score label i ∈ Set.Icc (0 : ℝ) 1 := by
+ constructor
+ · unfold classifierMarginPopulationRisk finitePopulationRisk
+ exact Finset.sum_nonneg
+ (fun z _hz =>
+ mul_nonneg (hp.nonneg z)
+ (classifierMarginLoss_mem_Icc_zero_one threshold score label i z).1)
+ · unfold classifierMarginPopulationRisk finitePopulationRisk
+ calc
+ (∑ z : Z, p z * classifierMarginLoss threshold score label i z)
+ ≤ ∑ z : Z, p z * 1 := by
+ apply Finset.sum_le_sum
+ intro z _hz
+ exact mul_le_mul_of_nonneg_left
+ (classifierMarginLoss_mem_Icc_zero_one threshold score label i z).2
+ (hp.nonneg z)
+ _ = 1 := by simp [hp.sum_one]
+
+lemma classifierMarginLoss_sq
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool)
+ (i : ι) (z : Z) :
+ classifierMarginLoss threshold score label i z ^ 2 =
+ classifierMarginLoss threshold score label i z := by
+ unfold classifierMarginLoss
+ by_cases h : classifierMargin score label i z ≤ threshold
+ · simp [h]
+ · simp [h]
+
+lemma classifierMarginVariance_le_risk [Fintype Z]
+ (p : Z → ℝ) (hp : IsPMF p)
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) (i : ι) :
+ (∑ z : Z,
+ p z *
+ (classifierMarginPopulationRisk p threshold score label i -
+ classifierMarginLoss threshold score label i z) ^ 2)
+ ≤ classifierMarginVarianceProxy p threshold score label i := by
+ classical
+ let R := classifierMarginPopulationRisk p threshold score label i
+ let loss : Z → ℝ := classifierMarginLoss threshold score label i
+ have hsum_loss : (∑ z : Z, p z * loss z) = R := by
+ simp [R, loss, classifierMarginPopulationRisk, finitePopulationRisk]
+ have hsum_loss_sq : (∑ z : Z, p z * loss z ^ 2) = R := by
+ calc
+ (∑ z : Z, p z * loss z ^ 2)
+ = ∑ z : Z, p z * loss z := by
+ apply Finset.sum_congr rfl
+ intro z _hz
+ rw [classifierMarginLoss_sq]
+ _ = R := hsum_loss
+ have hvariance_eq :
+ (∑ z : Z, p z * (R - loss z) ^ 2) = R - R ^ 2 := by
+ calc
+ (∑ z : Z, p z * (R - loss z) ^ 2)
+ =
+ ∑ z : Z, (p z * R ^ 2 - 2 * R * (p z * loss z) +
+ p z * loss z ^ 2) := by
+ apply Finset.sum_congr rfl
+ intro z _hz
+ ring
+ _ =
+ (∑ z : Z, p z * R ^ 2) -
+ (∑ z : Z, 2 * R * (p z * loss z)) +
+ ∑ z : Z, p z * loss z ^ 2 := by
+ rw [Finset.sum_add_distrib, Finset.sum_sub_distrib]
+ _ =
+ (∑ z : Z, p z * R ^ 2) -
+ 2 * R * (∑ z : Z, p z * loss z) +
+ ∑ z : Z, p z * loss z ^ 2 := by
+ congr 2
+ rw [Finset.mul_sum]
+ _ =
+ R ^ 2 * (∑ z : Z, p z) -
+ 2 * R * (∑ z : Z, p z * loss z) +
+ ∑ z : Z, p z * loss z ^ 2 := by
+ congr 2
+ calc
+ (∑ z : Z, p z * R ^ 2)
+ = ∑ z : Z, R ^ 2 * p z := by
+ apply Finset.sum_congr rfl
+ intro z _hz
+ ring
+ _ = R ^ 2 * (∑ z : Z, p z) := by
+ rw [Finset.mul_sum]
+ _ = R - R ^ 2 := by
+ rw [hp.sum_one, hsum_loss, hsum_loss_sq]
+ ring
+ have hR_nonneg : 0 ≤ R :=
+ (classifierMarginPopulationRisk_mem_Icc_zero_one
+ p hp threshold score label i).1
+ calc
+ (∑ z : Z,
+ p z *
+ (classifierMarginPopulationRisk p threshold score label i -
+ classifierMarginLoss threshold score label i z) ^ 2)
+ = ∑ z : Z, p z * (R - loss z) ^ 2 := by
+ simp [R, loss]
+ _ = R - R ^ 2 := hvariance_eq
+ _ ≤ R := by nlinarith [sq_nonneg R]
+ _ = classifierMarginVarianceProxy p threshold score label i := by
+ simp [classifierMarginVarianceProxy, R]
+
+theorem oneCoordinate_classifierMarginLoss_mgf_subgamma
+ {n : ℕ} [Fintype Z] (hn : 0 < n)
+ (p : Z → ℝ) (hp : IsPMF p)
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool)
+ (i : ι) (lambda : ℝ)
+ (hlambda : 0 ≤ lambda) (hlambda_bound : lambda * (n : ℝ)⁻¹ < 3) :
+ oneCoordinateDeviationMGF (n := n) p
+ (classifierMarginLoss threshold score label) i lambda
+ ≤
+ Real.exp
+ ((lambda * (n : ℝ)⁻¹) ^ 2 *
+ classifierMarginVarianceProxy p threshold score label i /
+ (2 * (1 - (lambda * (n : ℝ)⁻¹) / 3))) := by
+ classical
+ let R := classifierMarginPopulationRisk p threshold score label i
+ let loss : Z → ℝ := classifierMarginLoss threshold score label i
+ have hcenter : (∑ z : Z, p z * (R - loss z)) = 0 := by
+ have hexp :
+ (∑ z : Z, p z * (R - loss z))
+ = R * (∑ z : Z, p z) - ∑ z : Z, p z * loss z := by
+ rw [Finset.mul_sum, ← Finset.sum_sub_distrib]
+ exact Finset.sum_congr rfl (fun z _hz => by ring)
+ rw [hexp, hp.sum_one]
+ simp [R, loss, classifierMarginPopulationRisk, finitePopulationRisk]
+ have hbound : ∀ z : Z, R - loss z ≤ 1 := by
+ intro z
+ have hR_le :
+ R ≤ 1 :=
+ (classifierMarginPopulationRisk_mem_Icc_zero_one
+ p hp threshold score label i).2
+ have hloss_nonneg :
+ 0 ≤ loss z :=
+ (classifierMarginLoss_mem_Icc_zero_one threshold score label i z).1
+ linarith
+ have hvar :
+ (∑ z : Z, p z * (R - loss z) ^ 2)
+ ≤ classifierMarginVarianceProxy p threshold score label i := by
+ simpa [R, loss] using
+ (classifierMarginVariance_le_risk p hp threshold score label i)
+ have h :=
+ bennett_mgf_subgamma p (fun z : Z => R - loss z)
+ (b := 1)
+ (v := classifierMarginVarianceProxy p threshold score label i)
+ (lam := lambda * (n : ℝ)⁻¹)
+ (by norm_num) (by positivity) (by simpa using hlambda_bound)
+ hp.nonneg hp.sum_one hcenter hbound hvar
+ simpa [oneCoordinateDeviationMGF, R, loss, classifierMarginPopulationRisk] using h
+
+theorem finiteProduct_classifierMarginLoss_mgf_subgamma
+ {n : ℕ} [Fintype Z] (hn : 0 < n)
+ (p : Z → ℝ) (hp : IsPMF p)
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool)
+ (i : ι) (lambda : ℝ)
+ (hlambda : 0 ≤ lambda) (hlambda_bound : lambda * (n : ℝ)⁻¹ < 3) :
+ (∑ S : Fin n → Z,
+ finiteProductSampleWeight p S *
+ Real.exp
+ (lambda *
+ (classifierMarginPopulationRisk p threshold score label i -
+ classifierMarginEmpiricalRisk threshold score label i S)))
+ ≤
+ Real.exp
+ (lambda ^ 2 *
+ classifierMarginSampleVarianceProxy (n := n) p threshold score label i /
+ (2 * (1 - (3 * (n : ℝ))⁻¹ * lambda))) := by
+ classical
+ let singleBudget : ℝ :=
+ (lambda * (n : ℝ)⁻¹) ^ 2 *
+ classifierMarginVarianceProxy p threshold score label i /
+ (2 * (1 - (lambda * (n : ℝ)⁻¹) / 3))
+ have hsingle :
+ oneCoordinateDeviationMGF (n := n) p
+ (classifierMarginLoss threshold score label) i lambda ≤
+ Real.exp singleBudget := by
+ simpa [singleBudget] using
+ (oneCoordinate_classifierMarginLoss_mgf_subgamma
+ (n := n) hn p hp threshold score label i lambda hlambda hlambda_bound)
+ have hprod :=
+ finiteProduct_mgf_empiricalRiskDeviation_le_exp_of_single
+ (ι := ι) (Z := Z) hn p hp
+ (classifierMarginLoss threshold score label) i lambda singleBudget hsingle
+ refine hprod.trans_eq ?_
+ congr 1
+ have hn_ne : (n : ℝ) ≠ 0 := by exact_mod_cast (Nat.ne_of_gt hn)
+ have hden₁ : 2 * (1 - (lambda * (n : ℝ)⁻¹) / 3) ≠ 0 := by
+ have hpos : 0 < 1 - (lambda * (n : ℝ)⁻¹) / 3 := by linarith
+ nlinarith
+ have hden₂ : 2 * (1 - (3 * (n : ℝ))⁻¹ * lambda) ≠ 0 := by
+ have hn_pos : 0 < (n : ℝ) := by exact_mod_cast hn
+ have hlt : lambda < 3 * (n : ℝ) := by
+ rw [← div_lt_iff₀ hn_pos]
+ simpa [div_eq_mul_inv] using hlambda_bound
+ have hpos : 0 < 1 - (3 * (n : ℝ))⁻¹ * lambda := by
+ rw [sub_pos]
+ rw [inv_mul_lt_iff₀ (by positivity : 0 < 3 * (n : ℝ))]
+ nlinarith
+ nlinarith
+ simp [singleBudget, classifierMarginSampleVarianceProxy]
+ field_simp [hn_ne, hden₁, hden₂]
+
+theorem expectedPriorBernsteinExpMoment_classifierMargin_iid_le_one
+ {n : ℕ} [Fintype Z] [Fintype ι]
+ (hn : 0 < n)
+ (p : Z → ℝ) (hp : IsPMF p)
+ {π : ι → ℝ} (hπ : IsPMF π)
+ (lambda : ℝ)
+ (hlambda : 0 ≤ lambda) (hlambda_bound : lambda * (n : ℝ)⁻¹ < 3)
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool) :
+ expectedPriorBernsteinExpMoment
+ (finiteProductSampleWeight (n := n) p) π lambda ((3 * (n : ℝ))⁻¹)
+ (classifierMarginPopulationRisk p threshold score label)
+ (classifierMarginEmpiricalRiskFn threshold score label)
+ (classifierMarginSampleVarianceProxy (n := n) p threshold score label)
+ ≤ 1 := by
+ classical
+ apply expectedPriorBernsteinExpMoment_le_one_of_mgf_bound hπ
+ intro i
+ simpa [classifierMarginEmpiricalRiskFn] using
+ (finiteProduct_classifierMarginLoss_mgf_subgamma
+ (n := n) hn p hp threshold score label i lambda hlambda hlambda_bound)
+
+/-! ### Product sample law -/
+
+/-- Finite iid product sample weights form a PMF when the base mass is a PMF. -/
+theorem finiteProductSampleWeight_isPMF
+ {n : ℕ} [Fintype Z] {p : Z → ℝ} (hp : IsPMF p) :
+ IsPMF (finiteProductSampleWeight (n := n) p) where
+ nonneg := by
+ intro S
+ unfold finiteProductSampleWeight
+ exact Finset.prod_nonneg (fun k _hk => hp.nonneg (S k))
+ sum_one := by
+ unfold finiteProductSampleWeight
+ calc
+ (∑ S : Fin n → Z, ∏ k : Fin n, p (S k))
+ = ∏ _k : Fin n, ∑ z : Z, p z := by
+ exact (Fintype.prod_sum (f := fun _k : Fin n => p)).symm
+ _ = 1 := by simp [hp.sum_one]
+
+/-! ### PAC-Bayes Bernstein specialization -/
+
+/-- Bad samples for the concrete classifier-margin PAC-Bayes Bernstein event. -/
+def finitePACBayesClassifierMarginBernsteinBadSamples
+ {n : ℕ} [Fintype Z] [Fintype ι]
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool)
+ (p : Z → ℝ) (posteriorPenalty : (ι → ℝ) → ℝ) :
+ Finset (Fin n → Z) :=
+ finitePACBayesBernsteinPenaltyBadSamples
+ (classifierMarginPopulationRisk p threshold score label)
+ (classifierMarginEmpiricalRiskFn threshold score label)
+ posteriorPenalty
+
+/--
+Finite PAC-Bayes Bernstein theorem specialized to classifier margin losses.
+
+The theorem states that, under the same explicit finite-parameter certificates
+as `finitePACBayesBernsteinMargin_badEventMass_le_delta`, the iid sample mass
+of classifier-margin bad samples is at most `delta`.
+-/
+theorem finitePACBayesClassifierMarginBernstein_badEventMass_le_delta
+ {n : ℕ} [Fintype Z] [Fintype ι] [Nonempty ι]
+ (p : Z → ℝ) (hp : IsPMF p)
+ {π : ι → ℝ} (hπ : IsFullSupportPMF π)
+ (lambda scale delta : ℝ)
+ (hlambda : 0 < lambda) (hscale : scale * lambda < 1)
+ (hdelta : 0 < delta)
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool)
+ (complexityOf : (ι → ℝ) → ℝ)
+ (hcomplexity :
+ ∀ ρ : ι → ℝ, IsPMF ρ →
+ klDiv ρ π + Real.log (1 / delta) ≤ complexityOf ρ)
+ (hpenalty :
+ ∀ ρ : ι → ℝ, IsPMF ρ →
+ complexityOf ρ / lambda +
+ lambda *
+ posteriorClassifierMarginVarianceProxy ρ p threshold score label /
+ (2 * (1 - scale * lambda))
+ ≤
+ Real.sqrt
+ (2 * posteriorClassifierMarginVarianceProxy ρ p threshold score label *
+ complexityOf ρ) +
+ scale * complexityOf ρ)
+ (hExpected :
+ expectedPriorBernsteinExpMoment
+ (finiteProductSampleWeight (n := n) p) π lambda scale
+ (classifierMarginPopulationRisk p threshold score label)
+ (classifierMarginEmpiricalRiskFn threshold score label)
+ (classifierMarginVarianceProxy p threshold score label) ≤ 1) :
+ (∑ S ∈
+ finitePACBayesClassifierMarginBernsteinBadSamples
+ (n := n) threshold score label p
+ (fun ρ =>
+ Real.sqrt
+ (2 * posteriorClassifierMarginVarianceProxy ρ p threshold score label *
+ complexityOf ρ) +
+ scale * complexityOf ρ),
+ finiteProductSampleWeight p S) ≤ delta := by
+ classical
+ simpa [finitePACBayesClassifierMarginBernsteinBadSamples,
+ posteriorClassifierMarginVarianceProxy]
+ using
+ (finitePACBayesBernsteinMargin_badEventMass_le_delta
+ (Ω := Fin n → Z) (ι := ι)
+ (ν := finiteProductSampleWeight (n := n) p)
+ (hν := finiteProductSampleWeight_isPMF (n := n) hp)
+ (π := π) hπ lambda scale delta hlambda hscale hdelta
+ (classifierMarginPopulationRisk p threshold score label)
+ (classifierMarginEmpiricalRiskFn threshold score label)
+ (classifierMarginVarianceProxy p threshold score label)
+ complexityOf hcomplexity hpenalty hExpected)
+
+/--
+Finite iid PAC-Bayes Bernstein theorem for classifier-margin losses.
+
+This version derives the normalized Bernstein prior-moment certificate from the
+iid product sample law and the finite Bennett/sub-Gamma bound for thresholded
+margin losses. The variance proxy is scaled by the sample size.
+-/
+theorem finitePACBayesClassifierMarginBernstein_iid_badEventMass_le_delta
+ {n : ℕ} [Fintype Z] [Fintype ι] [Nonempty ι]
+ (hn : 0 < n)
+ (p : Z → ℝ) (hp : IsPMF p)
+ {π : ι → ℝ} (hπ : IsFullSupportPMF π)
+ (lambda delta : ℝ)
+ (hlambda : 0 < lambda)
+ (hlambda_bound : lambda * (n : ℝ)⁻¹ < 3)
+ (hdelta : 0 < delta)
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool)
+ (complexityOf : (ι → ℝ) → ℝ)
+ (hcomplexity :
+ ∀ ρ : ι → ℝ, IsPMF ρ →
+ klDiv ρ π + Real.log (1 / delta) ≤ complexityOf ρ)
+ (hpenalty :
+ ∀ ρ : ι → ℝ, IsPMF ρ →
+ complexityOf ρ / lambda +
+ lambda *
+ posteriorClassifierMarginSampleVarianceProxy
+ (n := n) ρ p threshold score label /
+ (2 * (1 - (3 * (n : ℝ))⁻¹ * lambda))
+ ≤
+ Real.sqrt
+ (2 *
+ posteriorClassifierMarginSampleVarianceProxy
+ (n := n) ρ p threshold score label *
+ complexityOf ρ) +
+ (3 * (n : ℝ))⁻¹ * complexityOf ρ) :
+ (∑ S ∈
+ finitePACBayesClassifierMarginBernsteinBadSamples
+ (n := n) threshold score label p
+ (fun ρ =>
+ Real.sqrt
+ (2 *
+ posteriorClassifierMarginSampleVarianceProxy
+ (n := n) ρ p threshold score label *
+ complexityOf ρ) +
+ (3 * (n : ℝ))⁻¹ * complexityOf ρ),
+ finiteProductSampleWeight p S) ≤ delta := by
+ classical
+ have hExpected :
+ expectedPriorBernsteinExpMoment
+ (finiteProductSampleWeight (n := n) p) π lambda ((3 * (n : ℝ))⁻¹)
+ (classifierMarginPopulationRisk p threshold score label)
+ (classifierMarginEmpiricalRiskFn threshold score label)
+ (classifierMarginSampleVarianceProxy (n := n) p threshold score label) ≤ 1 := by
+ exact
+ expectedPriorBernsteinExpMoment_classifierMargin_iid_le_one
+ (n := n) hn p hp hπ.toIsPMF lambda hlambda.le hlambda_bound
+ threshold score label
+ have hscale : (3 * (n : ℝ))⁻¹ * lambda < 1 := by
+ have hn_pos : 0 < (n : ℝ) := by exact_mod_cast hn
+ have hlt : lambda < 3 * (n : ℝ) := by
+ rw [← div_lt_iff₀ hn_pos]
+ simpa [div_eq_mul_inv] using hlambda_bound
+ rw [inv_mul_lt_iff₀ (by positivity : 0 < 3 * (n : ℝ))]
+ nlinarith
+ simpa [finitePACBayesClassifierMarginBernsteinBadSamples,
+ posteriorClassifierMarginSampleVarianceProxy]
+ using
+ (finitePACBayesBernsteinMargin_badEventMass_le_delta
+ (Ω := Fin n → Z) (ι := ι)
+ (ν := finiteProductSampleWeight (n := n) p)
+ (hν := finiteProductSampleWeight_isPMF (n := n) hp)
+ (π := π) hπ lambda ((3 * (n : ℝ))⁻¹) delta hlambda
+ hscale hdelta
+ (classifierMarginPopulationRisk p threshold score label)
+ (classifierMarginEmpiricalRiskFn threshold score label)
+ (classifierMarginSampleVarianceProxy (n := n) p threshold score label)
+ complexityOf hcomplexity hpenalty hExpected)
+
+/--
+Finite-grid iid PAC-Bayes Bernstein theorem for classifier-margin losses.
+
+Each grid index supplies an admissible Bernstein parameter and confidence
+allocation. A caller supplies the finite grid certificate assigning each
+posterior to a bucket whose fixed-parameter Bernstein penalty is no larger than
+`posteriorPenalty`.
+-/
+theorem finitePACBayesClassifierMarginBernstein_iid_gridOptimized_badEventMass_le_sum_delta
+ {n : ℕ} [Fintype Z] [Fintype ι] [Nonempty ι] [Fintype γ]
+ (hn : 0 < n)
+ (p : Z → ℝ) (hp : IsPMF p)
+ {π : ι → ℝ} (hπ : IsFullSupportPMF π)
+ (lambdaOf confidenceOf : γ → ℝ)
+ (hlambda : ∀ g : γ, 0 < lambdaOf g)
+ (hlambda_bound : ∀ g : γ, lambdaOf g * (n : ℝ)⁻¹ < 3)
+ (hconfidenceOf : ∀ g : γ, 0 < confidenceOf g)
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool)
+ (posteriorPenalty : (ι → ℝ) → ℝ)
+ (hgridCovers :
+ ∀ ρ : ι → ℝ, IsPMF ρ →
+ ∃ g : γ,
+ (klDiv ρ π + Real.log (1 / confidenceOf g)) / lambdaOf g +
+ lambdaOf g *
+ posteriorClassifierMarginSampleVarianceProxy
+ (n := n) ρ p threshold score label /
+ (2 * (1 - (3 * (n : ℝ))⁻¹ * lambdaOf g))
+ ≤ posteriorPenalty ρ) :
+ (∑ S ∈
+ finitePACBayesClassifierMarginBernsteinBadSamples
+ (n := n) threshold score label p posteriorPenalty,
+ finiteProductSampleWeight p S) ≤
+ ∑ g : γ, confidenceOf g := by
+ classical
+ have hscale : ∀ g : γ, (3 * (n : ℝ))⁻¹ * lambdaOf g < 1 := by
+ intro g
+ have hn_pos : 0 < (n : ℝ) := by exact_mod_cast hn
+ have hlt : lambdaOf g < 3 * (n : ℝ) := by
+ rw [← div_lt_iff₀ hn_pos]
+ simpa [div_eq_mul_inv] using hlambda_bound g
+ rw [inv_mul_lt_iff₀ (by positivity : 0 < 3 * (n : ℝ))]
+ nlinarith
+ have hExpected :
+ ∀ g : γ,
+ expectedPriorBernsteinExpMoment
+ (finiteProductSampleWeight (n := n) p) π (lambdaOf g)
+ ((3 * (n : ℝ))⁻¹)
+ (classifierMarginPopulationRisk p threshold score label)
+ (classifierMarginEmpiricalRiskFn threshold score label)
+ (classifierMarginSampleVarianceProxy (n := n) p threshold score label) ≤ 1 := by
+ intro g
+ exact
+ expectedPriorBernsteinExpMoment_classifierMargin_iid_le_one
+ (n := n) hn p hp hπ.toIsPMF (lambdaOf g) (hlambda g).le
+ (hlambda_bound g) threshold score label
+ simpa [finitePACBayesClassifierMarginBernsteinBadSamples,
+ posteriorClassifierMarginSampleVarianceProxy]
+ using
+ (finitePACBayesBernsteinGridOptimized_badEventMass_le_sum_delta
+ (Ω := Fin n → Z) (ι := ι) (γ := γ)
+ (ν := finiteProductSampleWeight (n := n) p)
+ (hν := finiteProductSampleWeight_isPMF (n := n) hp)
+ (π := π) hπ lambdaOf (fun _g : γ => (3 * (n : ℝ))⁻¹) confidenceOf
+ posteriorPenalty hlambda hscale hconfidenceOf
+ (classifierMarginPopulationRisk p threshold score label)
+ (classifierMarginEmpiricalRiskFn threshold score label)
+ (classifierMarginSampleVarianceProxy (n := n) p threshold score label)
+ hgridCovers hExpected)
+
+/--
+Finite-grid iid PAC-Bayes Bernstein theorem for classifier-margin losses with
+an explicit total confidence budget.
+-/
+theorem finitePACBayesClassifierMarginBernstein_iid_gridOptimized_badEventMass_le_delta
+ {n : ℕ} [Fintype Z] [Fintype ι] [Nonempty ι] [Fintype γ]
+ (hn : 0 < n)
+ (p : Z → ℝ) (hp : IsPMF p)
+ {π : ι → ℝ} (hπ : IsFullSupportPMF π)
+ (lambdaOf confidenceOf : γ → ℝ)
+ (hlambda : ∀ g : γ, 0 < lambdaOf g)
+ (hlambda_bound : ∀ g : γ, lambdaOf g * (n : ℝ)⁻¹ < 3)
+ (hconfidenceOf : ∀ g : γ, 0 < confidenceOf g)
+ (threshold : ℝ) (score : ι → Z → ℝ) (label : Z → Bool)
+ (posteriorPenalty : (ι → ℝ) → ℝ)
+ (hgridCovers :
+ ∀ ρ : ι → ℝ, IsPMF ρ →
+ ∃ g : γ,
+ (klDiv ρ π + Real.log (1 / confidenceOf g)) / lambdaOf g +
+ lambdaOf g *
+ posteriorClassifierMarginSampleVarianceProxy
+ (n := n) ρ p threshold score label /
+ (2 * (1 - (3 * (n : ℝ))⁻¹ * lambdaOf g))
+ ≤ posteriorPenalty ρ)
+ {delta : ℝ} (hgridConfidence : (∑ g : γ, confidenceOf g) ≤ delta) :
+ (∑ S ∈
+ finitePACBayesClassifierMarginBernsteinBadSamples
+ (n := n) threshold score label p posteriorPenalty,
+ finiteProductSampleWeight p S) ≤ delta :=
+ (finitePACBayesClassifierMarginBernstein_iid_gridOptimized_badEventMass_le_sum_delta
+ (n := n) (Z := Z) (ι := ι) (γ := γ)
+ hn p hp hπ lambdaOf confidenceOf hlambda hlambda_bound hconfidenceOf
+ threshold score label posteriorPenalty hgridCovers).trans hgridConfidence
+
+end
+
+end FormalSLT.PACBayesMargin
diff --git a/FormalSLT/Test/PACBayesBernsteinTest.lean b/FormalSLT/Test/PACBayesBernsteinTest.lean
index 5609928..347d8a9 100644
--- a/FormalSLT/Test/PACBayesBernsteinTest.lean
+++ b/FormalSLT/Test/PACBayesBernsteinTest.lean
@@ -19,6 +19,7 @@ namespace FormalSLT.PACBayesBernstein.Test
#check FormalSLT.PACBayesBernstein.posteriorMarginVarianceProxy
#check FormalSLT.PACBayesBernstein.priorBernsteinExpMoment
#check FormalSLT.PACBayesBernstein.expectedPriorBernsteinExpMoment
+#check FormalSLT.PACBayesBernstein.expectedPriorBernsteinExpMoment_le_one_of_mgf_bound
#check FormalSLT.PACBayesBernstein.priorBernsteinExpMomentTailMass
#check FormalSLT.PACBayesBernstein.priorBernsteinExpMoment_nonneg
#check FormalSLT.PACBayesBernstein.priorBernsteinExpMoment_tailMass_le_expected_div
@@ -30,4 +31,6 @@ namespace FormalSLT.PACBayesBernstein.Test
#check FormalSLT.PACBayesBernstein.finitePACBayesBernsteinPenalty_badEventMass_le_delta
#check FormalSLT.PACBayesBernstein.finitePACBayesBernsteinMargin_badEventMass_le_delta
+#print axioms FormalSLT.PACBayesBernstein.expectedPriorBernsteinExpMoment_le_one_of_mgf_bound
+
end FormalSLT.PACBayesBernstein.Test
diff --git a/README.md b/README.md
index 8f9ef74..272ccfc 100644
--- a/README.md
+++ b/README.md
@@ -7,154 +7,91 @@
[](#audit-commands)
[](./LICENSE)
-FormalSLT is a Lean 4 library for checking finite-sample statistical learning
-theory proof chains. The current v0.1 surface is organized around two checked
-endpoints:
-
-1. a finite-class, countable-time Hoeffding confidence sequence for `[0,1]`
- losses; and
-2. a finite-net Dudley bridge for a concrete process indexed by the non-finite
- unit interval `[0,1]`.
-
-The repository also packages reusable dyadic Dudley API instances for a
-two-point metric space and for finite discrete spaces `Fin n`. Those examples
-are API checks: they show that the finite-net wrapper is reusable outside the
-unit-interval file.
-
-The concentration layer now carries the sharp McDiarmid bounded-differences
-inequality over a homogeneous product measure. For a function whose value
-changes by at most `c k` when coordinate `k` is altered, the library proves the
-one-sided tail `exp(-2ε²/∑ₖcₖ²)` (`mcdiarmid_of_hasBoundedDifferences_sharp`),
-its lower-tail form (`mcdiarmid_of_hasBoundedDifferences_sharp_lower`), and the
-two-sided bound `2·exp(-2ε²/∑ₖcₖ²)`
-(`mcdiarmid_twoSided_of_hasBoundedDifferences_sharp`). The sharp `2B²` exponent
-is propagated into the high-probability Rademacher, finite-class, VC, and
-algorithmic-stability wrappers listed below, replacing the looser Azuma `8B²`
-constant on those paths. The localized-Rademacher wrapper still uses the older
-Azuma constant. The bounded-differences theorem is stated for the same product
-law in each coordinate, not for arbitrary non-iid product spaces.
-
-The PAC-Bayes layer now includes a finite Bernstein margin-proxy shell. It
-adds a supplied per-hypothesis variance proxy, a normalized Bernstein
-prior-moment certificate, fixed-`λ` finite bad-event bounds, and a
-posterior-dependent square-root-plus-linear wrapper. This is a supplied-proxy
-finite shell: it is not yet a concrete classifier-margin extractor, an
-all-real-`λ` optimization theorem, or a continuous hypothesis-space theorem.
-
-**60 `FormalSLT/` Lean files. 85 checked Lean files under `FormalSLT/` and
-`examples/`. 33,427 lines. Zero `sorry`. Zero `admit`. Zero custom axioms.**
-
-Counts are generated with `find FormalSLT -name '*.lean'`, `find FormalSLT
-examples -name '*.lean'`, and `find FormalSLT examples -name '*.lean' -print0 |
-xargs -0 wc -l`.
-
-The printed axiom profile for the v0.1 headline surface stays inside:
+FormalSLT is a compact Lean 4 library for the finite-sample statistical
+learning theory route from empirical risk minimization to VC-style
+generalization bounds, with recent extensions for contraction, linear
+predictors, finite sub-Gaussian chaining, algorithmic stability, and
+finite PAC-Bayes confidence bounds, a conditional sub-gamma MGF extractor,
+sharp bounded-differences/McDiarmid tails, and a first total-bounded finite-net
+bridge for the Dudley lane. The PAC-Bayes layer now also includes a finite iid
+Bernstein classifier-margin theorem for thresholded margin losses.
+
+> **Repository names.** The public home of this project is
+> [`Robby955/FormalSLT`](https://github.com/Robby955/FormalSLT); the same code
+> also lives in the development repo `Robby955/lean-statistical-learning`. Clone
+> instructions and badges below use the public `FormalSLT` name.
+
+
+**61 Lean modules. 32,558 lines of Lean. Zero `sorry`. Zero `admit`. Zero custom axioms.**
+
+
+Axioms used by the public theorem spine:
`[propext, Classical.choice, Quot.sound]`.

-## What v0.1 Proves
-
-| Surface | Main checked endpoint | Checker |
-|---|---|---|
-| Finite-class Hoeffding confidence sequence | `FiniteClassConfidenceSequence.failure_probability_le` | `examples/CheckUniformConvergence.lean` |
-| Unit-interval finite-net Dudley bridge | `unitIntervalRademacherLinearSup_roundedDyadicGrid_dudley_m_bound_prefixFree` | `examples/CheckUnitIntervalDudley.lean` |
-| Packaged finite dyadic Dudley API | `FiniteDyadicDudleyInstance.suppliedSup_dudley_bound` | `examples/CheckV01Usability.lean` |
-| Two-point dyadic Dudley instance | `twoPointDudleyInstance` | `examples/CheckTwoPointDudley.lean` |
-| `Fin n` discrete dyadic Dudley instance | `finDiscreteDudleyInstance` | `examples/CheckFiniteDiscreteDudley.lean` |
-| Two-sided sharp McDiarmid over a homogeneous product measure | `mcdiarmid_twoSided_of_hasBoundedDifferences_sharp` | `FormalSLT/Test/SharpMcDiarmidTest.lean` |
-| PAC-Bayes Bernstein supplied margin-proxy shell | `finitePACBayesBernsteinMargin_badEventMass_le_delta` | `examples/CheckPACBayesBernstein.lean` |
-
-The confidence-sequence chain makes the finite-class and countable-time union
-bound explicit. The Dudley chain connects finite sub-Gaussian chaining to a
-non-finite metric index space by constructing rounded dyadic finite nets and
-checking the supplied supremum for the concrete process
-`X(b,t) = signOfBool b * t`.
-
-## Fast Verification
-
-From the repository root:
-
-```bash
-lake exe cache get
-lake build FormalSLT
-lake env lean examples/CheckV01Usability.lean
-lake env lean examples/CheckUniformConvergence.lean
-lake env lean examples/CheckUnitIntervalDudley.lean
-lake env lean examples/CheckTwoPointDudley.lean
-lake env lean examples/CheckFiniteDiscreteDudley.lean
-lake env lean examples/CheckPACBayesBernstein.lean
-python3 scripts/generate_proof_frontier_manifest.py --check
-```
-
-If `lake` is not on the shell path, use `~/.elan/bin/lake`.
-
-## What v0.1 Does Not Prove
-
-- No full continuous Dudley entropy-integral theorem.
-- No arbitrary measurable-supremum construction over non-finite classes.
-- No general separability theorem.
-- No infinite-class confidence sequence.
-- No martingale or e-process stopping theorem in the v0.1 confidence-sequence
- surface.
-- No neural-network generalization theorem.
+The core path runs from ERM through Rademacher symmetrization,
+high-probability Rademacher bounds, Massart, Sauer-Shelah, the binary VC
+bridge, finite contraction, linear predictors, finite sub-Gaussian chaining,
+finite Dudley entropy-budget wrappers, finite algorithmic stability, finite
+localized-Rademacher scaffolding, sharp McDiarmid product-measure concentration,
+finite PAC-Bayes KL/DV/MGF, bounded-loss confidence bounds, finite iid
+Bernstein classifier-margin adapters, and total-bounded finite-net adapters for
+the next Dudley steps.
## Where to start
-- **For the v0.1 proof surface:** start with
- [FormalSLT v0.1 quickstart](./docs/formalslt-v0.1-quickstart.md).
-- **For the v0.1 technical note:** read
- [FormalSLT v0.1 technical note](./docs/formalslt-v0.1-technical-note.md).
-- **For TheoremPath packaging:** read
- [TheoremPath FormalSLT v0.1 draft](./docs/theorempath-formalslt-v0.1-page-draft.mdx).
-- **For the live walkthrough:** see
- [theorempath.com/theorems/formalslt-v0-1](https://theorempath.com/theorems/formalslt-v0-1).
- **For ML readers:** start with [How to read the proofs](./docs/how-to-read-the-proofs.md),
then [Intuition](./docs/intuition.md).
- **For proof structure:** see [Diagrams](./docs/diagrams.md).
- **For exact theorem names:** use [Theorem map](./docs/theorem-map.md).
-- **For the conditional sub-Gamma extractor:** see
- [Conditional Sub-Gamma Extractor](./docs/subgamma-extractor.md).
-- **For the non-finite unit-interval Dudley example:** see
- [Unit-Interval Dudley Example](./docs/unit-interval-dudley.md).
-- **For reusable dyadic Dudley API packaging:** see
- [FormalSLT v0.1 quickstart](./docs/formalslt-v0.1-quickstart.md).
-- **For the finite discrete dyadic-net family:** see
- [FormalSLT v0.1 quickstart](./docs/formalslt-v0.1-quickstart.md).
-- **For a generated proof-surface index:** see
- [Proof frontier manifest](./docs/proof-frontier.md).
- **For scope and assumptions:** read
- [Scope and assumptions](./docs/assumptions-and-nonclaims.md).
+ [Assumptions and current boundaries](./docs/assumptions-and-nonclaims.md).
- **For contributors:** read [Contributing](./CONTRIBUTING.md), then
[Good first issues](./docs/good-first-issues.md).
- **For related Lean projects:** see [Related work](./docs/related-work.md).
- FormalSLT is scoped as a finite-class theorem spine and is complementary to
- existing empirical-process and Rademacher-generalization formalizations.
-
-## Library Map
-
-FormalSLT records finite-sample statistical learning bounds as Lean theorems
-with explicit hypotheses and constants. Beyond the v0.1 headline chains, the
-library contains checked finite routes for:
-
-- finite-class ERM, Rademacher symmetrization, Massart, Sauer-Shelah, and
- VC-style sample-complexity wrappers;
-- high-probability Rademacher and VC sample-complexity bounds carrying the
- sharp McDiarmid `2B²` exponent;
-- the sharp bounded-differences/McDiarmid concentration module
- (`Concentration.SharpMcDiarmid`): one-sided, lower-tail, and two-sided
- homogeneous product-measure tails, plus the additive independent special case;
-- finite contraction and linear-predictor Rademacher bounds;
-- finite Bennett/Bernstein and localized-Rademacher scaffolding;
-- conditional sub-Gamma MGF extraction for bounded, conditionally centered
- increments with an explicit conditional second-moment proxy;
-- finite covering, finite sub-Gaussian chaining, and finite Dudley
- entropy-budget wrappers;
-- finite algorithmic stability expected-gap and high-probability wrappers;
-- finite PAC-Bayes KL/DV/MGF, bounded-loss confidence bounds, and finite-grid
- peeling wrappers;
-- finite PAC-Bayes Bernstein margin-proxy bounds with a supplied
- per-hypothesis variance proxy.
+ FormalSLT is complementary to
+ [YuanheZ/lean-stat-learning-theory](https://github.com/YuanheZ/lean-stat-learning-theory),
+ which builds deeper empirical-process infrastructure; this repo focuses on a
+ readable finite-class theorem spine.
+
+## Why this matters
+
+Finite-sample proofs in statistical learning theory carry assumptions that are
+easy to blur in prose. FormalSLT turns a readable finite-sample route into
+machine-checked Lean statements, so each bound carries its hypotheses,
+constants, and finite-class scope in the theorem signature. The motivations:
+
+- **Reproducibility.** Every constant — the sharp McDiarmid exponent, the
+ `2 * E[Rad]` factor, the `(en/d)^d` Sauer-Shelah closed form — is a Lean
+ term that the kernel re-checks on every build. There is no implicit "without
+ loss of generality" or "up to constants".
+- **Curriculum.** A graduate student reading the README can click through to
+ the exact Lean statement for any bound they have seen on a chalkboard, with
+ the assumptions made explicit in the type signature.
+- **Frontier ML.** As learning theory increasingly feeds back into modern
+ deep-learning analyses (PAC-Bayes generalization, stability, chaining),
+ having a checked finite scaffold is the first step toward formal guarantees
+ on the methods themselves.
+
+## Theorem families
+
+| Family | Main modules | Representative result | Status |
+|---|---|---|---|
+| Finite-class ERM | `Risk`, `ERM`, `Rademacher.ERMGeneralization` | Excess risk controlled by uniform deviation | Verified |
+| Rademacher symmetrization | `GhostSample`, `Rademacher.Symmetrization` | `E[genGap] ≤ 2 * E[Rad]` | Verified |
+| High-probability Rademacher | `Azuma.*`, `Rademacher.HighProbability` | `P(genGap ≥ 2 * E[Rad] + ε) ≤ exp(-ε² n / (2B²))` | Verified |
+| Conditional sub-gamma concentration | `Concentration.SubGamma.Extractor`, `Concentration.SubGamma.BennettBound` | bounded + conditionally-centered + conditional second moment `≤ σ²` ⇒ conditional MGF `≤ exp(σ²λ²/(2(1−bλ/3)))` on `b·λ < 3` | Verified |
+| Massart finite-class bound | `Rademacher.Massart` | `Rad(H,S) ≤ B * sqrt(2 * log card(H) / n)` | Verified |
+| Binary VC route | `VC.SauerShelah`, `VC.BinaryVCBridge`, `VC.SampleComplexity` | VC-style ERM excess-risk tail and closed sample-complexity form via effective classes | Verified |
+| Finite contraction | `Rademacher.Contraction` | `Rad_S(φ ∘ F) ≤ L * Rad_S(F)` for scalar finite samples/classes | Verified |
+| Linear predictors | `Rademacher.LinearPredictor` | `Rad ≤ R * n⁻¹ * sqrt(∑ k, ‖z k‖²)` and `Rad ≤ R * B / sqrt n` | Verified |
+| Localized Rademacher scaffold | `Rademacher.Localized` | Bernstein excess-risk localization embeds into second-moment localized empirical Rademacher complexity | Verified finite scaffold |
+| Finite covering and two-scale chaining | `Covering.Rademacher`, `Covering.DudleyChaining` | ε-net peeling and two-scale finite chaining | Verified |
+| Finite sub-Gaussian chaining foundation | `Covering.FiniteSubGaussianChaining` | finite-max entropy bounds and finite Dudley-style entropy-budget sums | Verified finite infrastructure |
+| Total-bounded Dudley bridge | `Covering.TotalBoundedDudley` | totally bounded metric spaces yield dyadic finite-net schedules, projected finite-net wrappers, truncated interval-integral entropy comparisons, and supplied-supremum / finite-skeleton / pathwise-modulus / epsilonized boundary adapters | Verified bridge |
+| Algorithmic stability | `AlgorithmicStability`, `Stability.BousquetElisseeff` | bounded-differences constants, finite and product-measure expected-gap wrappers with bound `β`, bounded-loss measurability adapters, and bounded-loss Azuma-constant concentration wrappers | Verified finite scaffold |
+| PAC-Bayes finite confidence layer | `PACBayesKL`, `PACBayesMcAllester`, `PACBayesFiniteProductMGF`, `PACBayesBoundedLoss`, `PACBayesBernstein`, `PACBayesMargin` | finite KL/DV change-of-measure, bounded-loss Catoni-style bound, closed PAC-Bayes good-event payoff, fixed-budget McAllester corollary, finite-grid McAllester peeling wrapper, and finite iid Bernstein classifier-margin bad-event bounds | Verified finite layer |
## Scope and assumptions
@@ -165,45 +102,36 @@ The main generalization theorems are intentionally finite and explicit.
| Hypothesis classes | Finite index types unless a theorem states a separate finite net/family |
| Samples | Finite iid samples through product measures |
| Losses/processes | Scalar real-valued, with boundedness or finite sub-Gaussian MGF assumptions |
-| Conditional MGF layer | Bounded, conditionally centered real increments with an explicit conditional second-moment proxy |
-| Constants | High-probability Rademacher and VC sample-complexity bounds use the sharp McDiarmid `2B²` exponent; the localized-Rademacher wrapper still cites the Azuma `8B²` constant |
-| PAC-Bayes Bernstein layer | Finite posterior/prior setting with supplied variance proxy and normalized prior-moment certificate |
-| Chaining | Finite nets/images, finite support/outcome spaces, finite entropy sums; the unit-interval example instantiates the bridge on a non-finite metric index space with explicit finite meshes |
+| Constants | Main Rademacher, VC, stability, and bounded-differences wrappers use the sharp McDiarmid route where stated; legacy Azuma lemmas remain as named infrastructure |
+| Chaining | Finite nets/images, finite support/outcome spaces, finite entropy sums |
| Public axiom target | `[propext, Classical.choice, Quot.sound]` only |
-## Open work
+## Current boundaries
Short version:
-- The main Rademacher and VC results are finite-class and finite-sample
- theorems.
-- The sharp McDiarmid constant `exp(-2t²/∑cᵢ²)` is proved for the additive
- independent case (`mcdiarmid_additive_independent`), for Doob martingale
- increments with conditional sub-Gaussian MGF control
- (`sharp_mcdiarmid_of_doob_increments`), and for a general bounded-differences
- function over a homogeneous product measure, one-sided
- (`mcdiarmid_of_hasBoundedDifferences_sharp`) and two-sided
- (`mcdiarmid_twoSided_of_hasBoundedDifferences_sharp`). The high-probability
- Rademacher and VC sample-complexity wrappers now use this sharp `2B²`
- exponent; the localized-Rademacher wrapper still uses the older Azuma `8B²`
- constant. The bounded-differences theorem is stated for the same product law
- in each coordinate, not for arbitrary non-iid product spaces.
-- The chaining layer proves finite entropy-budget infrastructure and an initial
+- The main Rademacher and VC results are finite-class / finite-sample theorems.
+- The main Rademacher, VC, stability, and bounded-differences wrappers have
+ been rewired to the sharp McDiarmid route where the theorem name states that
+ sharp constant.
+- The chaining layer proves finite entropy-budget infrastructure and a first
total-bounded finite-net extraction bridge, not the continuous Dudley
integral.
- PAC-Bayes includes a finite `[0,1]` bounded-loss Catoni-style confidence
bound, a closed high-confidence good-event theorem, a fixed-budget
- McAllester-style square-root corollary, a finite-grid peeling wrapper for
- posterior-dependent penalties, and a finite Bernstein margin-proxy shell with
- a supplied per-hypothesis variance proxy. Exact all-real-`λ`, concrete
- classifier-margin extractors, infinite-hypothesis, and continuous-posterior
- variants are not yet implemented.
+ McAllester-style square-root corollary, a finite-grid peeling wrapper for
+ posterior-dependent penalties, and a finite iid Bernstein classifier-margin
+ theorem for thresholded margin losses with the prior-moment certificate
+ derived from the iid finite product law. The Bernstein lane now also has a
+ finite-grid `λ` wrapper for posterior-dependent penalties. Exact all-real
+ `λ`, infinite-hypothesis, and continuous-posterior variants remain future
+ work.
- Algorithmic stability includes finite iid and measure-theoretic iid
expected-gap wrappers, plus bounded-loss high-probability wrappers for
finite measurable hypothesis interfaces.
For the full scope statement, see
-[Scope and assumptions](./docs/assumptions-and-nonclaims.md).
+[Assumptions and current boundaries](./docs/assumptions-and-nonclaims.md).
## Installation
@@ -235,18 +163,7 @@ Run these before treating a branch as a release candidate:
```bash
lake exe cache get
lake build FormalSLT
-lake env lean examples/CheckV01Usability.lean
-lake env lean examples/CheckSubGammaExtractor.lean
-lake env lean examples/CheckUnitIntervalDudley.lean
-lake env lean examples/CheckTwoPointDudley.lean
-lake env lean examples/CheckFiniteDiscreteDudley.lean
-lake env lean examples/CheckPACBayesBernstein.lean
-python3 scripts/generate_proof_frontier_manifest.py --check
-python3 scripts/check_doc_anchors.py \
- docs/formalslt-v0.1-technical-note.md \
- docs/formalslt-v0.1-artifact-map-2026-06-01.md \
- docs/formalslt-v0.1-release-review-2026-06-01.md \
- docs/theorempath-formalslt-v0.1-page-draft.mdx
+lake env lean examples/CheckShowcaseTheorems.lean
```
## Audit commands
@@ -257,35 +174,16 @@ audits:
```bash
rg -n --pcre2 '^\s*(?:by\s+)?(?:sorry|admit)\b|:=\s*(?:by\s+)?(?:sorry|admit)\b' FormalSLT examples
rg -n --pcre2 '^\s*(?:axiom|constant)\s+[A-Za-z_]' FormalSLT examples
-python3 scripts/generate_proof_frontier_manifest.py --check
-python3 scripts/check_doc_anchors.py \
- docs/formalslt-v0.1-technical-note.md \
- docs/formalslt-v0.1-artifact-map-2026-06-01.md \
- docs/formalslt-v0.1-release-review-2026-06-01.md \
- docs/theorempath-formalslt-v0.1-page-draft.mdx
git diff --check
```
The expected result is:
- `lake build FormalSLT` exits successfully;
-- `examples/CheckV01Usability.lean` resolves the v0.1 citation targets and
- prints standard Lean/Mathlib axioms for the bundled confidence-sequence API,
- the dyadic-net sequence API, and the concrete dyadic-net instances;
- `examples/CheckShowcaseTheorems.lean` prints standard Lean/Mathlib axioms
for selected public theorems;
-- `examples/CheckSubGammaExtractor.lean` prints standard Lean/Mathlib axioms
- for the conditional sub-Gamma extractor and its helper lemmas;
-- `examples/CheckUnitIntervalDudley.lean` prints standard Lean/Mathlib axioms
- for the concrete unit-interval Dudley example;
-- `examples/CheckTwoPointDudley.lean` prints standard Lean/Mathlib axioms
- for the second dyadic-net example;
-- `examples/CheckFiniteDiscreteDudley.lean` prints standard Lean/Mathlib
- axioms for the finite discrete embedded Rademacher dyadic-net family;
- the `rg` commands find no executable `sorry`, no executable `admit`, and no
custom axioms/constants in `FormalSLT` or `examples`;
-- the proof-frontier manifest is in sync with the theorem map and source counts;
-- documented `FormalSLT/...lean:line` anchors point at the named declarations;
- `git diff --check` reports no whitespace errors.
## Module map
@@ -294,11 +192,12 @@ The expected result is:
|---|---|
| Core definitions | `Risk`, `ERM`, `UniformConvergence`, `GhostSample` |
| Probability utilities | `Probability.Concentration`, `Probability.FiniteUnionBound`, `Probability.FiniteExpectation` |
-| Conditional sub-Gamma infrastructure | `Concentration.SubGamma.BennettBound`, `Concentration.SubGamma.BoundedExpIntegrable`, `Concentration.SubGamma.CondExpProduct`, `Concentration.SubGamma.CondJensen`, `Concentration.SubGamma.CondMarkov`, `Concentration.SubGamma.CondVarianceFromSquare`, `Concentration.SubGamma.Extractor` |
+| Conditional sub-gamma concentration | `Concentration.SubGamma.BennettBound`, `Concentration.SubGamma.BoundedExpIntegrable`, `Concentration.SubGamma.CondExpProduct`, `Concentration.SubGamma.CondJensen`, `Concentration.SubGamma.CondMarkov`, `Concentration.SubGamma.CondVarianceFromSquare`, `Concentration.SubGamma.Extractor` |
+| Sharp bounded-differences concentration | `Concentration.SharpMcDiarmid` |
| Rademacher route | `Rademacher.FiniteSample`, `Rademacher.FiniteSampleSymmetrization`, `Rademacher.ProbabilityBridge`, `Rademacher.Decoupling`, `Rademacher.Symmetrization`, `Rademacher.Massart`, `Rademacher.HighProbability`, `Rademacher.FiniteClassHighProb`, `Rademacher.UniformDeviation`, `Rademacher.ERMGeneralization`, `Rademacher.Contraction`, `Rademacher.LinearPredictor`, `Rademacher.Localized` |
| Azuma infrastructure | `Azuma.ExposureMartingale`, `Azuma.BoundedDifferences`, `Azuma.BoundedDiffMartingale`, `Azuma.BoundedDiffsAzumaInput`, `Azuma.BoundedIncrementBound`, `Azuma.HasBoundedDifferences`, `Azuma.ExposureIncrementHoeffding`, `Azuma.ExposureIncrementCondMGF`, `Azuma.GenGapTail` |
| VC route | `VC.Dimension`, `VC.PACBridge`, `VC.SauerShelah`, `VC.Rademacher`, `VC.SampleComplexity`, `VC.BinaryVCBridge` |
-| Covering and chaining | `Covering.Rademacher`, `Covering.DudleyChaining`, `Covering.FiniteSubGaussianChaining`, `Covering.TotalBoundedDudley`, `Covering.UnitIntervalDudley`, `Covering.TwoPointDudley`, `Covering.FiniteDiscreteDudley` |
+| Covering and chaining | `Covering.Rademacher`, `Covering.DudleyChaining`, `Covering.FiniteSubGaussianChaining`, `Covering.TotalBoundedDudley` |
| Stability and PAC-Bayes foundations | `AlgorithmicStability`, `Stability.BousquetElisseeff`, `PACBayesKL`, `PACBayesMcAllester`, `PACBayesFiniteProductMGF`, `PACBayesBoundedLoss`, `PACBayesBernstein` |
## Roadmap
@@ -308,6 +207,8 @@ The expected result is:
- [x] Massart finite-class bound
- [x] Azuma-Hoeffding genGap tail
- [x] High-probability Rademacher bound
+- [x] Conditional sub-gamma MGF extractor (bounded + conditionally-centered +
+ conditional second moment ⇒ sub-gamma conditional MGF on `b·λ < 3`)
- [x] Sauer-Shelah polynomial bound
- [x] VC-style pointwise Rademacher
- [x] VC uniform deviation and ERM excess-risk tail
@@ -316,8 +217,6 @@ The expected result is:
- [x] Finite-sample scalar contraction
- [x] Finite-dimensional linear predictor Rademacher bound
- [x] Finite localized Rademacher/Bernstein variance-localization scaffold
-- [x] Conditional sub-Gamma MGF extractor from boundedness, conditional
- centering, and a conditional second-moment proxy
- [x] Covering number peeling and two-scale chaining
- [x] Finite sub-Gaussian max and finite chaining entropy budgets
- [x] Algorithmic stability bounded-differences scaffold
@@ -330,8 +229,10 @@ The expected result is:
- [x] PAC-Bayes closed high-confidence generalization payoff theorem
- [x] PAC-Bayes fixed-budget McAllester-style square-root corollary
- [x] PAC-Bayes finite-grid McAllester peeling and optimized finite-grid wrapper
-- [x] PAC-Bayes finite Bernstein margin-proxy wrapper with supplied variance
- proxy and normalized prior-moment certificate
+- [x] PAC-Bayes finite iid Bernstein classifier-margin confidence wrapper with
+ concrete margin losses and an iid-derived normalized prior-moment certificate
+- [x] PAC-Bayes finite-grid Bernstein `λ` wrapper for posterior-dependent
+ classifier-margin penalties
- [x] Finite Dudley discrete entropy-bound refinements: annulus, integral-budget,
and prefix-envelope wrappers
- [x] Total-bounded finite-net extraction bridge for the continuous Dudley lane
@@ -360,8 +261,6 @@ The expected result is:
- [x] Finite-cover/pathwise-modulus bridge into the separable-terminal Dudley
boundary interface
- [x] Finite-terminal total-bounded dyadic Dudley wrapper
-- [x] Concrete unit-interval Dudley example with explicit half/quarter meshes
- and a supplied-supremum projected-mesh bound
- [ ] Continuous Dudley entropy-integral theorem over total-bounded classes
- [x] Measure-theoretic iid algorithmic stability expected bound, with explicit
integrability assumptions
@@ -369,16 +268,13 @@ The expected result is:
expected-gap theorem
- [x] Bounded-loss high-probability stability wrappers for finite measurable
hypothesis interfaces
-- [ ] PAC-Bayes concrete margin-loss Bernstein extractor, all-real-`λ`, or
- continuous-posterior extensions
-- [x] Sharp McDiarmid constant for the additive independent case and for Doob
- martingale increments with conditional sub-Gaussian MGF control
-- [x] Sharp McDiarmid for a general bounded-differences function over a
- homogeneous product measure: one-sided, lower-tail, and two-sided
- (`mcdiarmid_twoSided_of_hasBoundedDifferences_sharp`), with the sharp `2B²`
- exponent propagated into the high-probability Rademacher and VC wrappers
-- [ ] Sharp McDiarmid over arbitrary non-iid product spaces (different law per
- coordinate)
+- [x] PAC-Bayes iid margin-loss Bernstein prior-moment derivation
+- [x] PAC-Bayes Bernstein finite-grid `λ` extension
+- [ ] PAC-Bayes Bernstein all-real-`λ` or continuous-posterior extensions
+- [x] Sharp McDiarmid/product-kernel assembly. The sharp exposure-increment
+ conditional MGF, one-sided iid product bounded-differences tail, lower-tail
+ wrapper, and two-sided `mcdiarmid_twoSided_of_hasBoundedDifferences_sharp`
+ theorem are on the verified spine
- [ ] Continuous Dudley-style entropy integral
## Dependencies
@@ -406,7 +302,7 @@ If you use FormalSLT in academic work, please cite:
```bibtex
@software{formal_slt,
title = {FormalSLT: Formal Statistical Learning Theory in Lean 4},
- author = {Sneiderman, Robert},
+ author = {Sneiderman, Robby},
year = {2026},
url = {https://github.com/Robby955/FormalSLT},
note = {Lean 4 formalization of finite-sample SLT bounds.}
diff --git a/docs/assumptions-and-nonclaims.md b/docs/assumptions-and-nonclaims.md
index d0f821e..17af217 100644
--- a/docs/assumptions-and-nonclaims.md
+++ b/docs/assumptions-and-nonclaims.md
@@ -1,6 +1,7 @@
-# Scope and Assumptions
+# Assumptions and Current Boundaries
-This page records the assumptions behind the current theorem spine.
+This page records the assumptions behind the current theorem spine and the
+current boundaries for public summaries.
## Scope
@@ -75,38 +76,10 @@ generic VC bridge is not yet part of the library.
- finite multiscale chains;
- finite entropy sums and entropy-budget wrappers.
-It is finite infrastructure for Dudley-style arguments. It is not the
-continuous Dudley entropy integral.
+It is a foundation for Dudley-style arguments. It is not the continuous
+Dudley entropy integral.
-### Unit-interval Dudley example
-
-`Covering.UnitIntervalDudley` instantiates the total-bounded finite-net bridge
-on the non-finite metric index space `[0,1]`. It constructs explicit half and
-quarter meshes, packages the Rademacher process
-`X(b,t) = signOfBool b * t`, and routes a nonzero supplied supremum through a
-projected finite-net Dudley bound with a concrete `sqrt (log 15)` entropy
-envelope.
-
-This example still uses the supplied-supremum interface. It does not construct
-arbitrary measurable suprema, prove a general separability theorem, or state
-the full continuous Dudley entropy integral.
-
-### Conditional sub-Gamma extraction
-
-`Concentration.SubGamma.Extractor` proves a conditional MGF bound for a
-single bounded real increment. The headline theorem
-`condSubGammaMGF_of_bounded_centered_condVariance` assumes:
-
-- an a.s. bound `|X| ≤ b`;
-- conditional centering `μ[X | m] = 0`;
-- a conditional second-moment proxy `μ[X² | m] ≤ σ²`;
-- the parameter regime `0 ≤ λ` and `b * λ < 3`.
-
-The theorem converts those assumptions into a conditional sub-Gamma MGF bound.
-It does not by itself construct a filtration, prove independence, or state a
-full sequential concentration inequality.
-
-## Open Work
+## Current Boundaries
### VC-to-PAC equivalence
@@ -121,11 +94,6 @@ general infinite classes requires covering-number APIs, measurable suprema,
separability assumptions, and approximation arguments that are outside the
current theorem spine.
-The unit-interval Dudley example is the current concrete bridge beyond a
-finite ambient index type. It verifies finite-net machinery on `[0,1]`, but it
-does not remove the remaining measurable-supremum and separability obligations
-for arbitrary non-finite classes.
-
### Downstream sharp-tail propagation
The product-measure sharp McDiarmid theorem is checked, and the main
@@ -154,10 +122,13 @@ posterior-dependent penalties certified by that finite grid. The closed
total iid product mass to state the finite high-confidence good event directly.
`PACBayesBernstein` adds a finite Bernstein margin-proxy shell: the variance
proxy is supplied per hypothesis, and the theorem consumes a normalized
-Bernstein prior-moment certificate. It does not yet derive that variance proxy
-from a concrete classifier-margin loss. Exact all-real `λ`, finite-grid
-Bernstein optimization, infinite-hypothesis, and continuous-posterior
-PAC-Bayes theorems are not yet implemented.
+Bernstein prior-moment certificate. `PACBayesMargin` specializes that shell to
+thresholded classifier-margin losses, proves the iid finite classifier-margin
+MGF budget, derives the normalized Bernstein prior-moment certificate, and
+packages the fixed-`lambda` iid bad-event theorem with the sample-size-scaled
+variance proxy. Exact all-real `λ`, finite-grid Bernstein optimization,
+infinite-hypothesis, and continuous-posterior PAC-Bayes theorems remain future
+work.
### Algorithmic stability expected bound
diff --git a/docs/diagrams.md b/docs/diagrams.md
index 264c04b..79b4013 100644
--- a/docs/diagrams.md
+++ b/docs/diagrams.md
@@ -186,8 +186,8 @@ flowchart TD
The PAC-Bayes lane runs from the KL/Donsker-Varadhan change of measure through
a finite Catoni-style bound to a finite-grid McAllester peeling wrapper for
posterior-dependent penalties. The Bernstein lane adds a finite margin-proxy
-wrapper with a supplied per-hypothesis variance proxy and a normalized
-prior-moment certificate.
+wrapper, then discharges the normalized prior-moment certificate for iid finite
+classifier-margin losses, and then adds a finite-grid Bernstein `lambda` wrapper.
```mermaid
flowchart TD
@@ -198,18 +198,20 @@ flowchart TD
fixed["Fixed-budget McAllester square-root corollary"]
peeling["Finite-grid McAllester peeling
posterior-dependent penalty"]
optimized["Optimized finite-grid wrapper"]
- bernstein["Finite Bernstein margin-proxy shell
supplied variance proxy"]
- future["Next: all-real-λ / continuous-posterior extensions
(open)"]
-
- kl --> mgf --> catoni --> payoff --> fixed --> peeling --> optimized
- kl --> bernstein
- bernstein -.-> future
- optimized -.-> future
-
- classDef verified fill:#f0fdf4,stroke:#16a34a,color:#14532d;
- classDef future fill:#f8fafc,stroke:#94a3b8,color:#475569,stroke-dasharray: 5 4;
- class kl,mgf,catoni,payoff,fixed,peeling,optimized,bernstein verified;
- class future future;
+ bernstein["Finite Bernstein margin-proxy shell
supplied variance proxy"]
+ margin["Iid classifier-margin MGF
fixed-lambda bad-event theorem"]
+ bernsteinGrid["Finite-grid Bernstein λ wrapper
posterior-dependent penalty"]
+ future["Next: all-real-λ / continuous-posterior extensions
(open)"]
+
+ kl --> mgf --> catoni --> payoff --> fixed --> peeling --> optimized
+ kl --> bernstein --> margin --> bernsteinGrid
+ bernsteinGrid -.-> future
+ optimized -.-> future
+
+ classDef verified fill:#f0fdf4,stroke:#16a34a,color:#14532d;
+ classDef future fill:#f8fafc,stroke:#94a3b8,color:#475569,stroke-dasharray: 5 4;
+ class kl,mgf,catoni,payoff,fixed,peeling,optimized,bernstein,margin,bernsteinGrid verified;
+ class future future;
```
## Where each definition first appears
diff --git a/docs/proof-frontier-manifest.json b/docs/proof-frontier-manifest.json
index 0785093..a6e04e1 100644
--- a/docs/proof-frontier-manifest.json
+++ b/docs/proof-frontier-manifest.json
@@ -67,11 +67,11 @@
},
"schema": "FormalSLT.proof_frontier.v1",
"source_counts": {
- "example_lean_files": 25,
- "library_and_example_lean_lines": 33427,
- "library_lean_lines": 31533,
- "library_modules": 60,
- "theorem_map_entries": 412
+ "example_lean_files": 26,
+ "library_and_example_lean_lines": 34536,
+ "library_lean_lines": 32558,
+ "library_modules": 61,
+ "theorem_map_entries": 426
},
"theorem_families": [
{
@@ -2437,6 +2437,12 @@
"name": "finiteProduct_mgf_empiricalRiskDeviation_le_of_single",
"summary": "Single-coordinate MGF budget lifts to the finite sample-average MGF"
},
+ {
+ "kind": "theorem",
+ "module": "PACBayesFiniteProductMGF",
+ "name": "finiteProduct_mgf_empiricalRiskDeviation_le_exp_of_single",
+ "summary": "Single-coordinate exponential budget lifts to an explicit finite product MGF budget"
+ },
{
"kind": "theorem",
"module": "PACBayesFiniteProductMGF",
@@ -2527,6 +2533,12 @@
"name": "priorBernsteinExpMoment",
"summary": "Normalized Bernstein prior exponential moment with variance and scale terms"
},
+ {
+ "kind": "theorem",
+ "module": "PACBayesBernstein",
+ "name": "expectedPriorBernsteinExpMoment_le_one_of_mgf_bound",
+ "summary": "Expected normalized prior moment is at most one when each hypothesis has the corresponding Bernstein MGF budget"
+ },
{
"kind": "theorem",
"module": "PACBayesBernstein",
@@ -2550,6 +2562,78 @@
"module": "PACBayesBernstein",
"name": "finitePACBayesBernsteinMargin_badEventMass_le_delta",
"summary": "Finite supplied margin-proxy wrapper with sqrt(2 * V\u03c1 * C\u03c1) + scale * C\u03c1 penalty form"
+ },
+ {
+ "kind": "theorem",
+ "module": "PACBayesMargin",
+ "name": "classifierMarginLoss",
+ "summary": "Thresholded finite classifier-margin loss in {0,1}"
+ },
+ {
+ "kind": "theorem",
+ "module": "PACBayesMargin",
+ "name": "classifierMarginPopulationRisk",
+ "summary": "Concrete population risk for thresholded classifier-margin loss"
+ },
+ {
+ "kind": "theorem",
+ "module": "PACBayesMargin",
+ "name": "classifierMarginEmpiricalRisk",
+ "summary": "Concrete empirical risk for thresholded classifier-margin loss"
+ },
+ {
+ "kind": "theorem",
+ "module": "PACBayesMargin",
+ "name": "classifierMarginVarianceProxy",
+ "summary": "Concrete risk-as-variance proxy for indicator classifier-margin loss"
+ },
+ {
+ "kind": "theorem",
+ "module": "PACBayesMargin",
+ "name": "classifierMarginSampleVarianceProxy",
+ "summary": "Sample-size-scaled variance proxy for iid classifier-margin empirical-risk deviations"
+ },
+ {
+ "kind": "theorem",
+ "module": "PACBayesMargin",
+ "name": "classifierMarginVariance_le_risk",
+ "summary": "Indicator margin-loss centered second moment is bounded by its risk proxy"
+ },
+ {
+ "kind": "theorem",
+ "module": "PACBayesMargin",
+ "name": "oneCoordinate_classifierMarginLoss_mgf_subgamma",
+ "summary": "One-coordinate finite Bennett/sub-Gamma MGF bound for thresholded classifier-margin loss"
+ },
+ {
+ "kind": "theorem",
+ "module": "PACBayesMargin",
+ "name": "finiteProduct_classifierMarginLoss_mgf_subgamma",
+ "summary": "Iid product MGF bound for classifier-margin empirical-risk deviations"
+ },
+ {
+ "kind": "theorem",
+ "module": "PACBayesMargin",
+ "name": "expectedPriorBernsteinExpMoment_classifierMargin_iid_le_one",
+ "summary": "Iid classifier-margin losses discharge the normalized Bernstein prior-moment certificate"
+ },
+ {
+ "kind": "theorem",
+ "module": "PACBayesMargin",
+ "name": "finiteProductSampleWeight_isPMF",
+ "summary": "Iid finite product sample weights form a PMF when the base mass is a PMF"
+ },
+ {
+ "kind": "theorem",
+ "module": "PACBayesMargin",
+ "name": "finitePACBayesClassifierMarginBernstein_badEventMass_le_delta",
+ "summary": "PAC-Bayes Bernstein bad-event theorem specialized to finite classifier-margin losses under the supplied prior-moment certificate"
+ },
+ {
+ "kind": "theorem",
+ "module": "PACBayesMargin",
+ "name": "finitePACBayesClassifierMarginBernstein_iid_badEventMass_le_delta",
+ "summary": "End-to-end fixed-lambda iid PAC-Bayes Bernstein bad-event theorem for finite classifier-margin losses"
}
],
"name": "Stability and PAC-Bayes foundations"
diff --git a/docs/roadmap.md b/docs/roadmap.md
index f1dc628..4a0616f 100644
--- a/docs/roadmap.md
+++ b/docs/roadmap.md
@@ -231,6 +231,9 @@
proxy.~~ ✓ (`PACBayesBernstein.posteriorMarginVarianceProxy`)
- ~~Step 2: define the normalized Bernstein prior exponential moment with
variance and scale terms.~~ ✓ (`PACBayesBernstein.priorBernsteinExpMoment`)
+ - ~~Step 2.5: lift per-hypothesis Bernstein MGF budgets to the expected
+ normalized prior moment.~~ ✓
+ (`PACBayesBernstein.expectedPriorBernsteinExpMoment_le_one_of_mgf_bound`)
- ~~Step 3: prove the deterministic fixed-sample PAC-Bayes Bernstein
adapter from a prior-moment certificate.~~ ✓
(`PACBayesBernstein.posteriorGeneralizationGap_le_bernstein_of_priorBernsteinExpMoment_le`)
@@ -238,10 +241,19 @@
posterior-dependent margin-style penalties under explicit complexity and
penalty certificates.~~ ✓
(`PACBayesBernstein.finitePACBayesBernsteinMargin_badEventMass_le_delta`)
- - Remaining PAC-Bayes Bernstein extensions: concrete classifier-margin
- variance extractors, finite-grid `lambda` optimization, exact all-real
- `lambda` optimization, continuous posteriors, and infinite hypothesis
- classes.
+ - ~~Step 5: specialize the finite Bernstein shell to concrete thresholded
+ classifier-margin losses and the risk-as-variance proxy.~~ ✓
+ (`PACBayesMargin.finitePACBayesClassifierMarginBernstein_badEventMass_le_delta`)
+ - ~~Step 6: derive the normalized Bernstein prior-moment certificate from iid
+ finite classifier-margin losses and package the fixed-`lambda` iid
+ bad-event theorem.~~ ✓
+ (`PACBayesMargin.finitePACBayesClassifierMarginBernstein_iid_badEventMass_le_delta`)
+ - ~~Step 7: add finite-grid `lambda` optimization wrappers for Bernstein
+ posterior-dependent penalties.~~ ✓
+ (`PACBayesBernstein.finitePACBayesBernsteinGridOptimized_badEventMass_le_delta`,
+ `PACBayesMargin.finitePACBayesClassifierMarginBernstein_iid_gridOptimized_badEventMass_le_delta`)
+ - Remaining PAC-Bayes Bernstein extensions: exact all-real `lambda`
+ optimization, continuous posteriors, and infinite hypothesis classes.
### Long-term
diff --git a/docs/theorem-chain.svg b/docs/theorem-chain.svg
index 2c97431..97d4a03 100644
--- a/docs/theorem-chain.svg
+++ b/docs/theorem-chain.svg
@@ -1,6 +1,6 @@
-