diff --git a/.Rbuildignore b/.Rbuildignore
deleted file mode 100644
index dff09a7..0000000
--- a/.Rbuildignore
+++ /dev/null
@@ -1,4 +0,0 @@
-^requirements\.txt$
-^renv$
-^renv\.lock$
-^\.github$
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 3047166..1c02bb2 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -31,6 +31,13 @@ jobs:
       - name: Render to html
         run: quarto render --to html --profile html --no-clean
 
+      - name: Ensure PDF is in output
+        run: |
+          # Ship the root-level PDF with the site; warn (don't fail) if absent.
+          pdf="Machine-Learning-from-Human-Preferences.pdf"
+          if [ -f "$pdf" ]; then cp "$pdf" "_book/"
+          else echo "::warning::$pdf not found at repo root; site published without PDF"; fi
+
       - name: Publish to GitHub Pages
         uses: quarto-dev/quarto-actions/publish@v2
         with:
diff --git a/.gitignore b/.gitignore
index c66a6a8..ff231f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,7 +54,7 @@ images/management/analysis_pipeline.pdf
 *.tex
 *.pdf
 *.idx
-*.ild
+*.ilg
 *.ind
 *.lock
 *.ipynb
diff --git a/Machine-Learning-from-Human-Preferences.idx b/Machine-Learning-from-Human-Preferences.idx
deleted file mode 100644
index 2384389..0000000
--- a/Machine-Learning-from-Human-Preferences.idx
+++ /dev/null
@@ -1,10 +0,0 @@
-\indexentry{de-identification|hyperindexformat{\seealso{anonymization}}}{232}
-\indexentry{anonymization|hyperindexformat{\seealso{de-identification}}}{232}
-\indexentry{analytic flexibility|hyperindexformat{\seealso{p-hacking}}}{232}
-\indexentry{p-hacking|hyperindexformat{\seealso{analytic flexibility}}}{232}
-\indexentry{Cohen's d|hyperindexformat{\seealso{standardized mean difference (SMD)}}}{232}
-\indexentry{standardized mean difference (SMD)|hyperindexformat{\seealso{Cohen's d}}}{232}
-\indexentry{APA|hyperindexformat{\see{American Psychological Association (APA)}}}{232}
-\indexentry{CDI|hyperindexformat{\see{Communicative Development Inventory}}}{232}
-\indexentry{DAG|hyperindexformat{\see{directed acyclic graph (DAG)}}}{232}
-\indexentry{blinding|hyperindexformat{\see{masking}}}{232}
diff --git a/Machine-Learning-from-Human-Preferences.ilg b/Machine-Learning-from-Human-Preferences.ilg
deleted file mode 100644
index f80a623..0000000
--- a/Machine-Learning-from-Human-Preferences.ilg
+++ /dev/null
@@ -1,6 +0,0 @@
-This is makeindex, version 2.17 [TeX Live 2024] (kpathsea + Thai support).
-Scanning input file Machine-Learning-from-Human-Preferences.idx....done (10 entries accepted, 0 rejected).
-Sorting entries....done (34 comparisons).
-Generating output file Machine-Learning-from-Human-Preferences.ind....done (37 lines written, 0 warnings).
-Output written in Machine-Learning-from-Human-Preferences.ind.
-Transcript written in Machine-Learning-from-Human-Preferences.ilg.
diff --git a/Machine-Learning-from-Human-Preferences.ind b/Machine-Learning-from-Human-Preferences.ind
deleted file mode 100644
index 2b1b4f7..0000000
--- a/Machine-Learning-from-Human-Preferences.ind
+++ /dev/null
@@ -1,37 +0,0 @@
-\begin{theindex}
-
-  \item analytic flexibility, 
-		\hyperindexformat{\seealso{p-hacking}}{232}
-  \item anonymization, 
-		\hyperindexformat{\seealso{de-identification}}{232}
-  \item APA, 
-		\hyperindexformat{\see{American Psychological Association (APA)}}{232}
-
-  \indexspace
-
-  \item blinding, \hyperindexformat{\see{masking}}{232}
-
-  \indexspace
-
-  \item CDI, 
-		\hyperindexformat{\see{Communicative Development Inventory}}{232}
-  \item Cohen's d, 
-		\hyperindexformat{\seealso{standardized mean difference (SMD)}}{232}
-
-  \indexspace
-
-  \item DAG, \hyperindexformat{\see{directed acyclic graph (DAG)}}{232}
-  \item de-identification, 
-		\hyperindexformat{\seealso{anonymization}}{232}
-
-  \indexspace
-
-  \item p-hacking, 
-		\hyperindexformat{\seealso{analytic flexibility}}{232}
-
-  \indexspace
-
-  \item standardized mean difference (SMD), 
-		\hyperindexformat{\seealso{Cohen's d}}{232}
-
-\end{theindex}
diff --git a/Machine-Learning-from-Human-Preferences.tex b/Machine-Learning-from-Human-Preferences.tex
deleted file mode 100644
index c1169e5..0000000
--- a/Machine-Learning-from-Human-Preferences.tex
+++ /dev/null
@@ -1,15404 +0,0 @@
-% Options for packages loaded elsewhere
-\PassOptionsToPackage{unicode}{hyperref}
-\PassOptionsToPackage{hyphens}{url}
-\PassOptionsToPackage{dvipsnames,svgnames,x11names}{xcolor}
-%
-\documentclass[
-  letterpaper,
-  numbers=noenddot,
-  DIV=11]{scrreprt}
-
-\usepackage{amsmath,amssymb}
-\usepackage{iftex}
-\ifPDFTeX
-  \usepackage[T1]{fontenc}
-  \usepackage[utf8]{inputenc}
-  \usepackage{textcomp} % provide euro and other symbols
-\else % if luatex or xetex
-  \usepackage{unicode-math}
-  \defaultfontfeatures{Scale=MatchLowercase}
-  \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
-\fi
-\usepackage{lmodern}
-\ifPDFTeX\else  
-    % xetex/luatex font selection
-\fi
-% Use upquote if available, for straight quotes in verbatim environments
-\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
-\IfFileExists{microtype.sty}{% use microtype if available
-  \usepackage[]{microtype}
-  \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
-}{}
-\makeatletter
-\@ifundefined{KOMAClassName}{% if non-KOMA class
-  \IfFileExists{parskip.sty}{%
-    \usepackage{parskip}
-  }{% else
-    \setlength{\parindent}{0pt}
-    \setlength{\parskip}{6pt plus 2pt minus 1pt}}
-}{% if KOMA class
-  \KOMAoptions{parskip=half}}
-\makeatother
-\usepackage{xcolor}
-\setlength{\emergencystretch}{3em} % prevent overfull lines
-\setcounter{secnumdepth}{5}
-% Make \paragraph and \subparagraph free-standing
-\makeatletter
-\ifx\paragraph\undefined\else
-  \let\oldparagraph\paragraph
-  \renewcommand{\paragraph}{
-    \@ifstar
-      \xxxParagraphStar
-      \xxxParagraphNoStar
-  }
-  \newcommand{\xxxParagraphStar}[1]{\oldparagraph*{#1}\mbox{}}
-  \newcommand{\xxxParagraphNoStar}[1]{\oldparagraph{#1}\mbox{}}
-\fi
-\ifx\subparagraph\undefined\else
-  \let\oldsubparagraph\subparagraph
-  \renewcommand{\subparagraph}{
-    \@ifstar
-      \xxxSubParagraphStar
-      \xxxSubParagraphNoStar
-  }
-  \newcommand{\xxxSubParagraphStar}[1]{\oldsubparagraph*{#1}\mbox{}}
-  \newcommand{\xxxSubParagraphNoStar}[1]{\oldsubparagraph{#1}\mbox{}}
-\fi
-\makeatother
-
-\usepackage{color}
-\usepackage{fancyvrb}
-\newcommand{\VerbBar}{|}
-\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
-\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
-% Add ',fontsize=\small' for more characters per line
-\usepackage{framed}
-\definecolor{shadecolor}{RGB}{241,243,245}
-\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
-\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
-\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
-\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.40,0.45,0.13}{#1}}
-\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
-\newcommand{\BuiltInTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
-\newcommand{\CharTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
-\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
-\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
-\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
-\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{\textbf{#1}}}
-\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
-\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
-\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
-\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
-\newcommand{\ExtensionTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
-\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
-\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.28,0.35,0.67}{#1}}
-\newcommand{\ImportTok}[1]{\textcolor[rgb]{0.00,0.46,0.62}{#1}}
-\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
-\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{\textbf{#1}}}
-\newcommand{\NormalTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
-\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
-\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
-\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
-\newcommand{\RegionMarkerTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
-\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
-\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
-\newcommand{\StringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
-\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.07,0.07,0.07}{#1}}
-\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
-\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
-
-\providecommand{\tightlist}{%
-  \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}\usepackage{longtable,booktabs,array}
-\usepackage{calc} % for calculating minipage widths
-% Correct order of tables after \paragraph or \subparagraph
-\usepackage{etoolbox}
-\makeatletter
-\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
-\makeatother
-% Allow footnotes in longtable head/foot
-\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
-\makesavenoteenv{longtable}
-\usepackage{graphicx}
-\makeatletter
-\newsavebox\pandoc@box
-\newcommand*\pandocbounded[1]{% scales image to fit in text height/width
-  \sbox\pandoc@box{#1}%
-  \Gscale@div\@tempa{\textheight}{\dimexpr\ht\pandoc@box+\dp\pandoc@box\relax}%
-  \Gscale@div\@tempb{\linewidth}{\wd\pandoc@box}%
-  \ifdim\@tempb\p@<\@tempa\p@\let\@tempa\@tempb\fi% select the smaller of both
-  \ifdim\@tempa\p@<\p@\scalebox{\@tempa}{\usebox\pandoc@box}%
-  \else\usebox{\pandoc@box}%
-  \fi%
-}
-% Set default figure placement to htbp
-\def\fps@figure{htbp}
-\makeatother
-% definitions for citeproc citations
-\NewDocumentCommand\citeproctext{}{}
-\NewDocumentCommand\citeproc{mm}{%
-  \begingroup\def\citeproctext{#2}\cite{#1}\endgroup}
-\makeatletter
- % allow citations to break across lines
- \let\@cite@ofmt\@firstofone
- % avoid brackets around text for \cite:
- \def\@biblabel#1{}
- \def\@cite#1#2{{#1\if@tempswa , #2\fi}}
-\makeatother
-\newlength{\cslhangindent}
-\setlength{\cslhangindent}{1.5em}
-\newlength{\csllabelwidth}
-\setlength{\csllabelwidth}{3em}
-\newenvironment{CSLReferences}[2] % #1 hanging-indent, #2 entry-spacing
- {\begin{list}{}{%
-  \setlength{\itemindent}{0pt}
-  \setlength{\leftmargin}{0pt}
-  \setlength{\parsep}{0pt}
-  % turn on hanging indent if param 1 is 1
-  \ifodd #1
-   \setlength{\leftmargin}{\cslhangindent}
-   \setlength{\itemindent}{-1\cslhangindent}
-  \fi
-  % set entry spacing
-  \setlength{\itemsep}{#2\baselineskip}}}
- {\end{list}}
-\usepackage{calc}
-\newcommand{\CSLBlock}[1]{\hfill\break\parbox[t]{\linewidth}{\strut\ignorespaces#1\strut}}
-\newcommand{\CSLLeftMargin}[1]{\parbox[t]{\csllabelwidth}{\strut#1\strut}}
-\newcommand{\CSLRightInline}[1]{\parbox[t]{\linewidth - \csllabelwidth}{\strut#1\strut}}
-\newcommand{\CSLIndent}[1]{\hspace{\cslhangindent}#1}
-
-%%%%%%%%%%%%%%%%%%%%
-% start preamble.tex
-%%%%%%%%%%%%%%%%%%%%
-
-\input{resources/tex/boxes.tex}
-
-% page layout
-\usepackage{geometry}
-\geometry{
-  dvips=false, pdftex=false, vtex=false, % drivers can have unexpected behaviors
-  papersize={8in,10in},                  % size specified by MIT Press
-  centering,                             % split margins equally
-  margin=.6in,                           % margins (must all be at least .5in)
-  includemp, includehead                % include sidenotes & header in body
-  % showframe                              % show page structure (for debugging)
-}
-
-% set fonts
-% \setmainfont[]{ETbb}
-\setmainfont{ETbb}[
-  UprightFont = {*-Regular},
-  BoldFont = {*-Bold},
-  ItalicFont = {*-Italic},
-  BoldItalicFont = {*-BoldItalic},
-  Path = {./resources/fonts/ETbb/},
-  Extension = {.otf}
-]
-
-\setsansfont{SourceSansPro}[
-  UprightFont = {*-Regular},
-  % BoldFont = {*-Bold},
-  % ItalicFont = {*-Italic},
-  Path = {./resources/fonts/},
-  Extension = {.ttf}
-]
-
-% set font specifications
-\setkomafont{disposition}{\rmfamily\itshape}
-\addtokomafont{part}{\sffamily\scshape}
-\addtokomafont{partnumber}{\sffamily\scshape}
-\addtokomafont{chapter}{\sffamily\scshape}
-\setkomafont{partentry}{\sffamily\scshape}
-\setkomafont{chapterentry}{\sffamily\scshape}
-\addtokomafont{title}{\sffamily}%\scshape}
-\addtokomafont{subtitle}{\sffamily}%\scshape}
-% \addtokomafont{author}{\sffamily}
-\addtokomafont{pagehead}{\sffamily\scshape}
-\addtokomafont{pagenumber}{\sffamily\scshape}
-
-\usepackage{amsmath}
-\usepackage{unicode-math}
-
-% adjust spacing around section headers
-\RedeclareSectionCommand[
-  runin=false,
-  afterskip=0pt % remove extra space after for section
-]{section}
-\RedeclareSectionCommand[
-  runin=false,
-  afterskip=0pt % remove extra space after for subsection
-]{subsection}
-
-% only part number on part title pages
-\renewcommand{\partformat}{\thepart}
-
-% headers/footers
-\usepackage{scrlayer-scrpage}
-\KOMAoptions{headwidth=textwithmarginpar} % make header full width
-\automark{chapter}
-\clearpairofpagestyles
-\renewcommand{\chaptermark}[1]{\markboth{#1}{}} % prevent chaptermark from uppercasing
-\ihead{%
-  \ifnum\value{chapter}>0 \thechapter\hspace{3pt} \fi % include chapter number if not 0
-  \textsc{\leftmark} % then chapter name
-}
-\ohead{\pagemark}
-\pagestyle{scrheadings}
-
-% table of contents
-\usepackage[titles]{tocloft}
-\renewcommand{\cftpartfont}{\sffamily\scshape\Large}     % part title
-\renewcommand{\cftpartpagefont}{\sffamily\scshape\large} % part page number
-\setlength{\cftbeforepartskip}{1.25em}                   % part vspace before
-\renewcommand{\cftchapfont}{\sffamily\scshape\large}     % chapter title
-\renewcommand{\cftchappagefont}{\sffamily\scshape\large} % chapter page number
-\setlength{\cftbeforechapskip}{.05em}                    % chapter vspace before
-
-% set chapter numbers flushright
-\newcommand{\chapnumlen}{.5em}
-\renewcommand{\cftchappresnum}{\hfill}
-\renewcommand{\cftchapaftersnum}{\hspace*{\chapnumlen}}
-\addtolength{\cftchapnumwidth}{\chapnumlen}
-% \renewcommand{\cftchapnumwidth}{\chapnumlen}
-% \addtolength{\cftchapindent}{2em}
-
-% \setlength{\cftbeforechapskip}{.25em}
-% \setlength{\cftbeforepartskip}{1.5em}
-
-\newcommand{\partnumlen}{.75em}
-\renewcommand{\cftpartpresnum}{\hfill}
-\renewcommand{\cftpartaftersnum}{\hspace*{\partnumlen}}
-% \addtolength{\cftpartnumwidth}{\partnumlen}
-\setlength{\cftpartindent}{0em}
-% \renewcommand{\cftpartnumwidth}{\partnumlen}
-
-% \renewcommand{\cftpartnumwidth}{\cftpartpagewidth}
-% \renewcommand{\cftpartnumformat}[1]{\hfill{\bfseries #1}} % Adjust font weight/style if necessary
-% \renewcommand{\cftpartnumwidth}{\numlen}  % Adjust this width as needed
-% \renewcommand{\cftpartleader}{\hfill} % Use this to add the space before the number
-
-% lists
-\usepackage{enumitem}
-\setlist[itemize]{
-  label={--} % en-dash as bullet symbol
-}
-
-\usepackage{threeparttable} % for papaja apa tables
-\setlength{\tabcolsep}{4pt} % horizontal space between table columns
-
-% styling for captions
-\usepackage[format=plain]{caption}
-\usepackage{marginfix} % load before sidenotes to improve sidenote positioning
-\usepackage{sidenotes}
-\usepackage{marginnote}
-\DeclareCaptionFont{caps}{\footnotesize}
-
-\captionsetup{
-  labelfont=caps,
-  textfont=caps,
-  skip=0pt,
-  belowskip=-6pt,
-  labelsep=newline
-}
-\DeclareCaptionStyle{sidecaption}{labelfont=caps,textfont=caps,skip=6pt,belowskip=0pt,labelsep=newline}
-\DeclareCaptionStyle{marginfigure}{labelfont=caps,textfont=caps,skip=6pt,belowskip=0pt,labelsep=newline}
-\DeclareCaptionStyle{margintable}{labelfont=caps,textfont=caps,skip=6pt,labelsep=newline}
-\DeclareCaptionStyle{longtable}{labelfont=caps,textfont=caps,skip=6pt,labelsep=newline}
-
-% reset sidenote counter at start of each chapter
-\let\oldchapter\chapter
-\def\chapter{%
-  \setcounter{sidenote}{1}%
-  \oldchapter
-}
-
-\usepackage{bbm}
-\usepackage{unicode-math}
-
-\usepackage{fvextra}
-\DefineVerbatimEnvironment{Highlighting}{Verbatim}{breaklines,commandchars=\\\{\}}
-
-% space above and below equations
-% \setlength{\abovedisplayskip}{0pt}
-% \setlength{\belowdisplayskip}{0pt}
-\usepackage[nodisplayskipstretch]{setspace}
-
- % override quarto box settings
-\ifdefined\Shaded\renewenvironment{Shaded}{\begin{tcolorbox}[enhanced, borderline west={3pt}{0pt}{shadecolor}, breakable, interior hidden, frame hidden, boxrule=0pt, sharp corners]}{\end{tcolorbox}}\fi
-
-% index
-\usepackage{imakeidx}
-\makeindex[intoc=true] %, columns=3, columnseprule=true, options=-s latex/indexstyles.ist]
-
-% temporary settings for copyediting
-% \setstretch{2}
-% \usepackage{lineno}
-% \linenumbers
-
-%%%%%%%%%%%%%%%%%%
-% end preamble.tex
-%%%%%%%%%%%%%%%%%%
-\makeatletter
-\@ifpackageloaded{tcolorbox}{}{\usepackage[skins,breakable]{tcolorbox}}
-\@ifpackageloaded{fontawesome5}{}{\usepackage{fontawesome5}}
-\definecolor{quarto-callout-color}{HTML}{909090}
-\definecolor{quarto-callout-note-color}{HTML}{0758E5}
-\definecolor{quarto-callout-important-color}{HTML}{CC1914}
-\definecolor{quarto-callout-warning-color}{HTML}{EB9113}
-\definecolor{quarto-callout-tip-color}{HTML}{00A047}
-\definecolor{quarto-callout-caution-color}{HTML}{FC5300}
-\definecolor{quarto-callout-color-frame}{HTML}{acacac}
-\definecolor{quarto-callout-note-color-frame}{HTML}{4582ec}
-\definecolor{quarto-callout-important-color-frame}{HTML}{d9534f}
-\definecolor{quarto-callout-warning-color-frame}{HTML}{f0ad4e}
-\definecolor{quarto-callout-tip-color-frame}{HTML}{02b875}
-\definecolor{quarto-callout-caution-color-frame}{HTML}{fd7e14}
-\makeatother
-\makeatletter
-\@ifpackageloaded{bookmark}{}{\usepackage{bookmark}}
-\makeatother
-\makeatletter
-\@ifpackageloaded{caption}{}{\usepackage{caption}}
-\AtBeginDocument{%
-\ifdefined\contentsname
-  \renewcommand*\contentsname{Table of contents}
-\else
-  \newcommand\contentsname{Table of contents}
-\fi
-\ifdefined\listfigurename
-  \renewcommand*\listfigurename{List of Figures}
-\else
-  \newcommand\listfigurename{List of Figures}
-\fi
-\ifdefined\listtablename
-  \renewcommand*\listtablename{List of Tables}
-\else
-  \newcommand\listtablename{List of Tables}
-\fi
-\ifdefined\figurename
-  \renewcommand*\figurename{Figure}
-\else
-  \newcommand\figurename{Figure}
-\fi
-\ifdefined\tablename
-  \renewcommand*\tablename{Table}
-\else
-  \newcommand\tablename{Table}
-\fi
-}
-\@ifpackageloaded{float}{}{\usepackage{float}}
-\floatstyle{ruled}
-\@ifundefined{c@chapter}{\newfloat{codelisting}{h}{lop}}{\newfloat{codelisting}{h}{lop}[chapter]}
-\floatname{codelisting}{Listing}
-\newcommand*\listoflistings{\listof{codelisting}{List of Listings}}
-\usepackage{amsthm}
-\theoremstyle{definition}
-\newtheorem{definition}{Definition}[chapter]
-\theoremstyle{plain}
-\newtheorem{proposition}{Proposition}[chapter]
-\theoremstyle{remark}
-\AtBeginDocument{\renewcommand*{\proofname}{Proof}}
-\newtheorem*{remark}{Remark}
-\newtheorem*{solution}{Solution}
-\newtheorem{refremark}{Remark}[chapter]
-\newtheorem{refsolution}{Solution}[chapter]
-\makeatother
-\makeatletter
-\makeatother
-\makeatletter
-\@ifpackageloaded{caption}{}{\usepackage{caption}}
-\@ifpackageloaded{subcaption}{}{\usepackage{subcaption}}
-\makeatother
-\makeatletter
-\@ifpackageloaded{algorithm}{}{\usepackage{algorithm}}
-\makeatother
-\makeatletter
-\@ifpackageloaded{algpseudocode}{}{\usepackage{algpseudocode}}
-\makeatother
-\makeatletter
-\@ifpackageloaded{caption}{}{\usepackage{caption}}
-\makeatother
-
-\usepackage{bookmark}
-
-\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
-\urlstyle{same} % disable monospaced font for URLs
-% Make links footnotes instead of hotlinks:
-\DeclareRobustCommand{\href}[2]{#2\footnote{\url{#1}}}
-\hypersetup{
-  pdftitle={Machine Learning from Human Preferences},
-  pdfauthor={Sang T. Truong; Andreas Haupt; Sanmi Koyejo},
-  colorlinks=true,
-  linkcolor={DarkBlue},
-  filecolor={Maroon},
-  citecolor={DarkGreen},
-  urlcolor={DarkGreen},
-  pdfcreator={LaTeX via pandoc}}
-
-
-\title{Machine Learning from Human Preferences}
-\author{Sang T. Truong \and Andreas Haupt \and Sanmi Koyejo}
-\date{2025-05-20}
-
-\begin{document}
-\newgeometry{}
-
-\begin{titlepage}
-\end{titlepage}
-
-\begin{titlepage}
-  \centering
-  {\usekomafont{title}\scshape\Huge Machine Learning from Human
-Preferences\par}\clearpage
-\end{titlepage}
-
-\begin{titlepage}
-  \begin{center}
-    {\usekomafont{title}\scshape\Huge Machine Learning from Human
-Preferences\par}
-    \vskip 1em
-    {\usekomafont{subtitle}\LARGE \par}
-    \vskip 1em
-    \setstretch{1.5}
-    {\usekomafont{author} Sang T. Truong, Andreas Haupt,  and~Sanmi
-Koyejo \par}
-    \vfill
-    {\rmfamily\large Stanford University\\Stanford, CA}
-  \end{center}
-\end{titlepage}
-
-\begin{titlepage}
-  \vspace*{\fill}
-  {\rmfamily\scriptsize
-    © 2025 Stanford University\par
-    All rights reserved.
-  }
-  \vspace*{\fill}
-\end{titlepage}
-
-\restoregeometry{}
-\RecustomVerbatimEnvironment{verbatim}{Verbatim}{
-  showspaces = false,
-  showtabs = false,
-  breaksymbolleft={},
-  breaklines
-}
-\numberwithin{algorithm}{chapter}
-\algrenewcommand{\algorithmiccomment}[1]{\hskip3em$\rightarrow$ #1}
-
-\floatname{algorithm}{Algorithm}
-
-\numberwithin{algorithm}{chapter}
-
-\renewcommand*\contentsname{Table of Contents}
-{
-\hypersetup{linkcolor=}
-\setcounter{tocdepth}{1}
-\tableofcontents
-}
-
-\phantomsection\label{sec-intro}
-\bookmarksetup{startatroot}
-
-\chapter*{Introduction}
-\addcontentsline{toc}{chapter}{Introduction}
-
-\markboth{Introduction}{Introduction}
-
-Machine learning is increasingly shaping various aspects of our lives,
-from education and healthcare to scientific discovery. A key challenge
-in developing trustworthy intelligent systems is ensuring they align
-with human preferences. Learning from human feedback offers a promising
-solution to this challenge. This book introduces the foundations and
-practical applications of machine learning from human preferences.
-Instead of manually predefining the learning goal, the book presents
-preference-based learning that incorporates human feedback to guide the
-learning process, drawing insights from related fields such as
-economics, psychology, and human-computer interaction. By the end of
-this book, readers will be equipped with the key concepts and tools
-needed to design systems that effectively align with human preferences.
-
-The book is intended for researchers, practitioners, and students who
-are interested in intergrating machine learning with human-centered
-application. We assume some basic knowledge of probability and
-statistics, but provides sufficient background and references for the
-readers to follow the main ideas. The book also provides illustrative
-program examples and datasets. The field of machine learning from human
-preference is a vibrant area of research and practice with many open
-challenges and opportunities, and we hope that this book will inspire
-readers to further explore and advance this exciting field.
-
-We hope with the present book to both allow more use of human
-preferences in machine learning, and new data modalities as Artificial
-Intelligence systems become increasingly important.
-
-Stanford, May 2025, THK
-
-\section*{Structure of this book}\label{structure-of-this-book}
-\addcontentsline{toc}{section}{Structure of this book}
-
-\markright{Structure of this book}
-
-The book has three parts which introduce fundamental models, present
-learning paradigms, and discuss assumptions.
-
-\subsection*{Background}\label{background}
-\addcontentsline{toc}{subsection}{Background}
-
-We provide background on axioms underlying comparisons in
-\textbf{Chapter 1}. We discover key modeling assumptions It covers
-random preference models the Independence of Irrelevant Alternatives
-(IIA), and types of comparison data (binary rankings, accept-reject,
-lists). The chapter also discusses the main limitations of IIA based on
-heterogeneity.
-
-\subsection*{Learning}\label{learning}
-\addcontentsline{toc}{subsection}{Learning}
-
-The second part introduces several approaches to learning from
-comparisons.
-
-\begin{itemize}
-\item
-  \textbf{Chapter 2} considers a setting where comparison data is given
-  and studies both maximum likelihood and posterior-based learning of
-  comparison models. It uses case studies from language modeling and
-  robotics. We discuss the challenges in learning
-  multimodal/heterogenous rewards that fail to satisfy IIA.
-\item
-  \textbf{Chapter 3} considers active data collection of comparisons
-  with the goal of optimal inference on comparison models using Various
-  strategies are explored, including reducing the learner's variance,
-  exploiting ambiguity and domain knowledge in ranking, with a case
-  study from robotics.
-\item
-  \textbf{Chapter 4} studies processes where comparisons are used to
-  guide decisions. We first set up the bandit approach to recommending
-  maximal objects with respect to comparisons, and discuss dueling
-  bandits. We then consider as well as reinforcement learning from human
-  feedback (RLHF) to align language models that decide on which text to
-  generate. We highlight the role of uncertainty quantification and
-  exploration for decision-making.
-\item
-  \textbf{Chapter 5} considers decision-making in the presence of
-  heterogeneity. We first focus on dealing with heterogeneity to
-  maximize average utility using \textbf{personalization}. We then
-  discuss aggregation mechanisms that are voting-based and decisions
-  that are independent of some some features of the outcome.
-\end{itemize}
-
-\subsection*{Reflection}\label{reflection}
-\addcontentsline{toc}{subsection}{Reflection}
-
-The final part of the book discusses limitations of comparison data, and
-opportunities resulting from stated preference data.
-
-\begin{itemize}
-\item
-  \textbf{Chapter 6} critiques machine learning from comparisons. It
-  takes different disciplinary lenses, from social psychology,
-  philosophy, and critical studies, to highlight where comparisons are
-  limited in the expression of human preferences, and what are
-  alternatives.
-\item
-  \textbf{Chapter 7} considers models that are broader than comparisons
-  in our model, many of which we can think of as \textbf{stated
-  preferences}. These are models in which value judgments are given in
-  terms of Likert scales or textual descriptions. We propose ways in how
-  such feedback can be merged with comparison data to better express
-  preferences.
-\end{itemize}
-
-\section*{How to engage with this
-book}\label{how-to-engage-with-this-book}
-\addcontentsline{toc}{section}{How to engage with this book}
-
-\markright{How to engage with this book}
-
-Threre are three models of reading, and teaching with, this book.
-Chapter 1 is underlying all of the book, so is part of all of these
-pathways.
-
-\begin{itemize}
-\item
-  For practitioners and those teaching applied AI content, we recommend
-  a reading of Chapters 1, 2, 4, and 7, which can be used as a sequence
-  in an early graduate course on Machine Learning. It allows to
-  highlight human data sources in an introductory machine learning
-  course.
-\item
-  For people with background in discrete choice, we propose to skim
-  Chapter 1, and study Chapters 2 and 4. These studies allow readers to
-  integrate machine learning in their studies of discrete choice, demand
-  models, and Industrial Organization.
-\item
-  For those with deep background in machine learning, we propose to
-  study chapters 2-4 and 7. These chapters maximize the amount of
-  machine learning covered, and is suitable for a deep learning-based
-  course of machine learning.
-\item
-  For those interested in the methodological and theoretical foundations
-  of machine leraning from comparisons, we recommend a reading of
-  chapters 1, 5, 6, and 7. Chapter 1 and 5 study the underpinnings of
-  revealed preferences and aggregation, chapter 6 critiques these
-  assumptions, and chapter 7 looks at broader ways of eliciting
-  preferences. It is suitable for critical study in a course on
-  Computation and Society.
-\end{itemize}
-
-\section*{Prior knowledge}\label{prior-knowledge}
-\addcontentsline{toc}{section}{Prior knowledge}
-
-\markright{Prior knowledge}
-
-The book assumes knowledge of the fundamentals of statistics, linear
-algebra and machine learning. Many example code excerpts are written in
-\texttt{python}, and make experience in the \texttt{python} programming
-language valuable for readers.
-
-\section*{Additional Materials}\label{additional-materials}
-\addcontentsline{toc}{section}{Additional Materials}
-
-\markright{Additional Materials}
-
-Every chapter has problems for readers and slides for teaching of the
-material available. They are available on the
-\href{mlhp.stanford.edu}{book's website}.
-
-\bookmarksetup{startatroot}
-
-\chapter{Background}\label{background-1}
-
-Human preference modeling aims to capture humans' decision making
-processes in a probabilistic framework. Many problems would benefit from
-a quantitative perspective, enabling an understanding of how humans
-engage with the world. In this chapter, we will explore how one can
-model human preferences, including different formulations of such
-models, how one can optimize these models given data, and considerations
-one should understand to create such systems.
-
-\section{The Construction of Preference}\label{sec-foundations}
-
-\subsection{Axiom 1. Construction of Choices Set: Luce's Choice Axiom
-(Luce, 1959)}\label{axiom-1-preference-models-model-choice}
-
-Preference models model the preferred choices amongst a set of items.
-Preference models must enumerate the set of all possible choices
-included in a human decision. As such, we must ensure that the choices
-we enumerate capture the entire domain (collectively exhaustive) but are
-distinct (mutually exclusive) choices. A discrete set of choices is a
-constraint we canonically impose to ensure we can tractably model
-preferences and aptly estimate the parameters of preference models. We
-assume that if a new item is added to the choice set, the relative
-probabilities of choosing between the original items remain unchanged.
-This is known as the Independence of Irrelevant Alternatives (IIA)
-property from Luce's axiom of choices (\citeproc{ref-Luce1977}{Luce
-1977}).
-
-\subsection{Axiom 2. Preference Centers around Utility: Reciprocity
-(Block \& Marschak,
-1960)}\label{axiom-2-preference-centers-around-reward}
-
-Preference models are centered around the notion of reward, a scalar
-quantity representing the benefit or value an individual attains from
-selecting a given choice. We assume that the underlying reward mechanism
-of a human preference model captures the final decision output from a
-human. We use the notation \(u_{i,j}\) as the reward of person \(i\)
-choosing item \(j\). The reward is a random variable, decomposing into
-true reward \(u_{i,j}^*\) and a random noise \(\epsilon_{i,j}\):
-\(u_{i,j} = u_{i,j}^* + \epsilon_{i,j}\). McFadden
-(\citeproc{ref-mcfadden_conditional_1974}{1974}) posits that reward can
-further be decomposed into user-specific reward \(\theta_i\) and
-item-specific reward \(z_j\): \(u_{i,j}^* = \theta_i + z_j\). This
-decomposition indicates that for a single user, only the relative
-difference in reward matters to predict the choice among items, and the
-scale of rewards is important when comparing across users.
-
-\subsection{Axiom 3. Preference captures decision-making: Wins as a
-Sufficient Statistic (Bühlmann \& Huber,
-1963)}\label{axiom-3-preference-captures-decision-making}
-
-Human preferences are classified into two categories: revealed
-preferences and stated preferences. Revealed preferences are those one
-can observe retroactively from existing data. The implicit
-decision-making knowledge can be captured via learnable parameters and
-their usage in models that represent relationships between input
-decision attributes that may have little interpretability but enable
-powerful models of human preference. Such data may be easier to acquire
-and can reflect real-world outcomes (since they are, at least
-theoretically, inherently based on human preferences). However, if we
-fail to capture sufficient context in such data, human preference models
-may not sufficiently capture human preferences. Stated preferences are
-those individuals explicitly indicate in potentially experimental
-conditions. The explicit knowledge may be leveraged by including
-inductive biases during modeling (for example, the context used in a
-model), which are reasonable assumptions for how a human would consider
-a set of items. Stated preferences may be collected through controlled
-experiments or studies. They may be harder to obtain and somewhat
-biased, as they can be hypothetical or only accurately reflect a piece
-of the overall context of a decision.
-However, they enable greater control of the decision-making process.
-
-\subsection{Axiom 4. Rationality: The Transitivity of odds (Good,
-1955)}\label{human-rationality}
-
-The preference model assumes that humans are rational. Perfect
-rationality posits that individuals make decisions that maximize their
-reward, assuming they have complete information and the cognitive
-ability to process this information to make optimal choices. Numerous
-studies have shown that this assumption frequently fails to describe
-actual human behavior. Bounded rationality acknowledges that individuals
-operate within the limits of their information and cognitive
-capabilities (\citeproc{ref-simon1972theories}{Simon 1972}). Here,
-decisions are influenced by noise, resulting in probabilistic choice
-behavior: while individuals aim to maximize their reward, noise can lead
-to deviations from perfectly rational choices
-(\citeproc{ref-miljkovic2005rational}{Miljkovic 2005}). Instead of
-deterministic reward maximization, the decision maker will choose an
-item with probability proportional to the exponentiated reward they
-receive for that item. This probabilistic model can be operationalized
-with the Boltzmann distribution. The utility of person \(i\) on item
-\(j\) is computed by a
-function \(f_i: e_j \rightarrow \mathbb{R}\), where
-\(e_j \in \mathbb{R}^d\) is an embedding of item \(j\). The probability
-of item \(j\) being preferred by person \(i\) over all other
-alternatives in the choice set \(\mathcal{C}\) is
-
-\[
-p_{ij} =  p_i(j \succ j': j' \neq j \forall j' \in \mathcal{C}) = Z_i^{-1} \exp \circ f_i(e_j) \text{ where } Z_i = \sum_{j' \in \mathcal{C}} \exp \circ f_i(e_{j'})
-\]
-
-One can extend the above model in various ways. For example, the above
-model does not account for similar actions. Consider the following
-example when choosing a mode of transportation: car and train, with no
-particular preference for either choice. The preferred probability is
-50\% for either item. However, if we have 99 cars and one train in the
-choice set, we would have a 99\% probability of choosing a car. To
-address this issue, various extensions have been proposed. For example,
-we can introduce a similarity metric to cluster items. We want a metric
-that acts more as a distance in the feature space with the following
-properties: Identity (an item is most similar to itself), symmetric (the
-similarity of item \(j\) to \(j'\) is the same as that of \(j'\) to
-\(j\)), and positive semidefinite (similarity metric is non-negative).
-Under this extension, the probability of item \(j\) being preferred over
-all other alternatives by person \(i\) is
-\(p_{ij} / w_j, \text{ where } w_j = \sum_{j' \in \mathcal{C}} s(e_j, e_{j'})\).
-This de-weights similar items, which is the desired effect for human
-decision-making.
-
-\section{Models of Preferences and Decisions}\label{preference-model}
-
-Next, we explore ways humans can express their preferences, including
-accept-reject sampling, pairwise sampling, rank-order sampling,
-rating-scale sampling, best-worst scaling, and multiple-choice samples.
-We will understand the process of collecting data through simulation
-and, when appropriate, discuss the real-world application of these
-models. Each item \(i\) is represented by a \(d=2\) dimensional vector
-\(x^i\). There is only one user in the simulation, and they have a
-latent reward function \(f\) that they use to compute the latent reward
-of an item from the features. Here, the latent reward function is the
-Ackley function \cite{ackley1987}.
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{import numpy as np}
-\NormalTok{np.random.seed(0)}
-
-\NormalTok{def ackley(X, a=20, b=0.2, c=2*np.pi):}
-\NormalTok{    """}
-\NormalTok{    Compute the Ackley function.}
-\NormalTok{    Parameters:}
-\NormalTok{      X: A NumPy array of shape (n, d) where each row is a d{-}dimensional point.}
-\NormalTok{      a, b, c: Parameters of the Ackley function.}
-\NormalTok{    Returns:}
-\NormalTok{      A NumPy array of function values.}
-\NormalTok{    """}
-\NormalTok{    X = np.atleast\_2d(X)}
-\NormalTok{    d = X.shape[1]}
-\NormalTok{    sum\_sq = np.sum(X ** 2, axis=1)}
-\NormalTok{    term1 = {-}a * np.exp({-}b * np.sqrt(sum\_sq / d))}
-\NormalTok{    term2 = {-}np.exp(np.sum(np.cos(c * X), axis=1) / d)}
-\NormalTok{    return term1 + term2 + a + np.e}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-\NormalTok{np.random.seed(}\DecValTok{0}\NormalTok{)}
-
-\KeywordTok{def}\NormalTok{ ackley(X, a}\OperatorTok{=}\DecValTok{20}\NormalTok{, b}\OperatorTok{=}\FloatTok{0.2}\NormalTok{, c}\OperatorTok{=}\DecValTok{2}\OperatorTok{*}\NormalTok{np.pi):}
-    \CommentTok{"""}
-\CommentTok{    Compute the Ackley function.}
-\CommentTok{    Parameters:}
-\CommentTok{      X: A NumPy array of shape (n, d) where each row is a d{-}dimensional point.}
-\CommentTok{      a, b, c: Parameters of the Ackley function.}
-\CommentTok{    Returns:}
-\CommentTok{      A NumPy array of function values.}
-\CommentTok{    """}
-\NormalTok{    X }\OperatorTok{=}\NormalTok{ np.atleast\_2d(X)}
-\NormalTok{    d }\OperatorTok{=}\NormalTok{ X.shape[}\DecValTok{1}\NormalTok{]}
-\NormalTok{    sum\_sq }\OperatorTok{=}\NormalTok{ np.}\BuiltInTok{sum}\NormalTok{(X }\OperatorTok{**} \DecValTok{2}\NormalTok{, axis}\OperatorTok{=}\DecValTok{1}\NormalTok{)}
-\NormalTok{    term1 }\OperatorTok{=} \OperatorTok{{-}}\NormalTok{a }\OperatorTok{*}\NormalTok{ np.exp(}\OperatorTok{{-}}\NormalTok{b }\OperatorTok{*}\NormalTok{ np.sqrt(sum\_sq }\OperatorTok{/}\NormalTok{ d))}
-\NormalTok{    term2 }\OperatorTok{=} \OperatorTok{{-}}\NormalTok{np.exp(np.}\BuiltInTok{sum}\NormalTok{(np.cos(c }\OperatorTok{*}\NormalTok{ X), axis}\OperatorTok{=}\DecValTok{1}\NormalTok{) }\OperatorTok{/}\NormalTok{ d)}
-    \ControlFlowTok{return}\NormalTok{ term1 }\OperatorTok{+}\NormalTok{ term2 }\OperatorTok{+}\NormalTok{ a }\OperatorTok{+}\NormalTok{ np.e}
-\end{Highlighting}
-\end{Shaded}
-
-We next define a function to visualize the surface:
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{import matplotlib.pyplot as plt}
-\NormalTok{from matplotlib.colors import LinearSegmentedColormap}
-\NormalTok{ccmap = LinearSegmentedColormap.from\_list("ackley", ["\#f76a05", "\#FFF2C9"])}
-\NormalTok{plt.rcParams.update(\{}
-\NormalTok{    "font.size": 14,}
-\NormalTok{    "axes.labelsize": 16,}
-\NormalTok{    "xtick.labelsize": 14,}
-\NormalTok{    "ytick.labelsize": 14,}
-\NormalTok{    "legend.fontsize": 14,}
-\NormalTok{    "axes.titlesize": 16,}
-\NormalTok{\})}
-
-\NormalTok{def draw\_surface():}
-\NormalTok{    inps = np.linspace({-}2, 2, 100)}
-\NormalTok{    X, Y = np.meshgrid(inps, inps)}
-\NormalTok{    grid = np.column\_stack([X.ravel(), Y.ravel()])}
-\NormalTok{    Z = ackley(grid).reshape(X.shape)}
-    
-\NormalTok{    plt.figure(figsize=(6, 5))}
-\NormalTok{    contour = plt.contourf(X, Y, Z, 50, cmap=ccmap)}
-\NormalTok{    plt.contour(X, Y, Z, levels=15, colors=\textquotesingle{}black\textquotesingle{}, linewidths=0.5, alpha=0.6)}
-\NormalTok{    plt.colorbar(contour, label=r\textquotesingle{}$f(x)$\textquotesingle{}, ticks=[0, 3, 6])}
-\NormalTok{    plt.xlim({-}2, 2)}
-\NormalTok{    plt.ylim({-}2, 2)}
-\NormalTok{    plt.xticks([{-}2, 0, 2])}
-\NormalTok{    plt.yticks([{-}2, 0, 2])}
-\NormalTok{    plt.xlabel(r\textquotesingle{}$x\_1$\textquotesingle{})}
-\NormalTok{    plt.ylabel(r\textquotesingle{}$x\_2$\textquotesingle{})}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-\ImportTok{from}\NormalTok{ matplotlib.colors }\ImportTok{import}\NormalTok{ LinearSegmentedColormap}
-\NormalTok{ccmap }\OperatorTok{=}\NormalTok{ LinearSegmentedColormap.from\_list(}\StringTok{"ackley"}\NormalTok{, [}\StringTok{"\#f76a05"}\NormalTok{, }\StringTok{"\#FFF2C9"}\NormalTok{])}
-\NormalTok{plt.rcParams.update(\{}
-    \StringTok{"font.size"}\NormalTok{: }\DecValTok{14}\NormalTok{,}
-    \StringTok{"axes.labelsize"}\NormalTok{: }\DecValTok{16}\NormalTok{,}
-    \StringTok{"xtick.labelsize"}\NormalTok{: }\DecValTok{14}\NormalTok{,}
-    \StringTok{"ytick.labelsize"}\NormalTok{: }\DecValTok{14}\NormalTok{,}
-    \StringTok{"legend.fontsize"}\NormalTok{: }\DecValTok{14}\NormalTok{,}
-    \StringTok{"axes.titlesize"}\NormalTok{: }\DecValTok{16}\NormalTok{,}
-\NormalTok{\})}
-\NormalTok{plt.rcParams[}\StringTok{\textquotesingle{}text.usetex\textquotesingle{}}\NormalTok{] }\OperatorTok{=} \VariableTok{True}
-
-\KeywordTok{def}\NormalTok{ draw\_surface():}
-\NormalTok{    inps }\OperatorTok{=}\NormalTok{ np.linspace(}\OperatorTok{{-}}\DecValTok{2}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{100}\NormalTok{)}
-\NormalTok{    X, Y }\OperatorTok{=}\NormalTok{ np.meshgrid(inps, inps)}
-\NormalTok{    grid }\OperatorTok{=}\NormalTok{ np.column\_stack([X.ravel(), Y.ravel()])}
-\NormalTok{    Z }\OperatorTok{=}\NormalTok{ ackley(grid).reshape(X.shape)}
-    
-\NormalTok{    plt.figure(figsize}\OperatorTok{=}\NormalTok{(}\DecValTok{6}\NormalTok{, }\DecValTok{5}\NormalTok{))}
-\NormalTok{    contour }\OperatorTok{=}\NormalTok{ plt.contourf(X, Y, Z, }\DecValTok{50}\NormalTok{, cmap}\OperatorTok{=}\NormalTok{ccmap)}
-\NormalTok{    plt.contour(X, Y, Z, levels}\OperatorTok{=}\DecValTok{15}\NormalTok{, colors}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linewidths}\OperatorTok{=}\FloatTok{0.5}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.6}\NormalTok{)}
-\NormalTok{    plt.colorbar(contour, label}\OperatorTok{=}\VerbatimStringTok{r\textquotesingle{}$f(x)$\textquotesingle{}}\NormalTok{, ticks}\OperatorTok{=}\NormalTok{[}\DecValTok{0}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{6}\NormalTok{])}
-\NormalTok{    plt.xlim(}\OperatorTok{{-}}\DecValTok{2}\NormalTok{, }\DecValTok{2}\NormalTok{)}
-\NormalTok{    plt.ylim(}\OperatorTok{{-}}\DecValTok{2}\NormalTok{, }\DecValTok{2}\NormalTok{)}
-\NormalTok{    plt.xticks([}\OperatorTok{{-}}\DecValTok{2}\NormalTok{, }\DecValTok{0}\NormalTok{, }\DecValTok{2}\NormalTok{])}
-\NormalTok{    plt.yticks([}\OperatorTok{{-}}\DecValTok{2}\NormalTok{, }\DecValTok{0}\NormalTok{, }\DecValTok{2}\NormalTok{])}
-\NormalTok{    plt.xlabel(}\VerbatimStringTok{r\textquotesingle{}$x\_1$\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.ylabel(}\VerbatimStringTok{r\textquotesingle{}$x\_2$\textquotesingle{}}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\subsection{Item-wise Model}\label{item-wise-model}
-
-One method for data collection is accept-reject sampling, where the user
-considers one item at a time and decides if they like it. Below is an
-example survey using accept-reject sampling:
-
-We will use a simulation to familiarize ourselves with accept-reject
-sampling. On the surface below, blue and red points correspond to accept
-or reject points.
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{d = 2}
-\NormalTok{n\_items = 800}
-\NormalTok{items = np.random.randn(n\_items, d)*0.5 + np.ones((n\_items, d))*0.5}
-\NormalTok{rewards = ackley(items)}
-\NormalTok{y = (rewards \textgreater{} rewards.mean())}
-\NormalTok{draw\_surface()}
-\NormalTok{plt.scatter(items[:, 0], items[:, 1], c=y, cmap=\textquotesingle{}coolwarm\textquotesingle{}, alpha=0.5)}
-\NormalTok{plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{d }\OperatorTok{=} \DecValTok{2}
-\NormalTok{n\_items }\OperatorTok{=} \DecValTok{800}
-\NormalTok{items }\OperatorTok{=}\NormalTok{ np.random.randn(n\_items, d)}\OperatorTok{*}\FloatTok{0.5} \OperatorTok{+}\NormalTok{ np.ones((n\_items, d))}\OperatorTok{*}\FloatTok{0.5}
-\NormalTok{rewards }\OperatorTok{=}\NormalTok{ ackley(items)}
-\NormalTok{y }\OperatorTok{=}\NormalTok{ (rewards }\OperatorTok{\textgreater{}}\NormalTok{ rewards.mean())}
-\NormalTok{draw\_surface()}
-\NormalTok{plt.scatter(items[:, }\DecValTok{0}\NormalTok{], items[:, }\DecValTok{1}\NormalTok{], c}\OperatorTok{=}\NormalTok{y, cmap}\OperatorTok{=}\StringTok{\textquotesingle{}coolwarm\textquotesingle{}}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.5}\NormalTok{)}
-\NormalTok{plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\pandocbounded{\includegraphics[keepaspectratio]{src/chap2_files/figure-pdf/cell-4-output-1.pdf}}
-
-The binary choice model centers around one item. The model predicts, for
-that item, after observing user choices in the past, whether that item
-will be chosen. We use binary variable \(y \in \{0, 1\}\) to represent
-whether the user will pick that choice in the next selection phase. We
-denote \(P = p(y = 1)\). We can formally model \(y\) as a function of
-the reward of the positive choice: \(y = \mathbb{I}[u_{i,j}>0]\). We explore
-two cases based on the noise distribution. \(\psi\) is the logistic
-function or the standard normal cumulative distribution function if
-noise follows logistic distribution and the standard normal
-distribution, respectively: \[
-p(u_{i,j} > 0) = p(u_{i,j}^* + \epsilon > 0) = 1 - p( \epsilon < -u_{i,j}^*) = \psi(u_{i,j}^*).
-\]
-
-A generalization of accept-reject sampling is rating-scale sampling.
-Rating-scale sampling, such as the Likert scale, is a method in which
-participants rate items on a fixed-point scale (e.g., 1 to 5, ``Strongly
-Disagree'' to ``Strongly Agree'') to measure levels of preference
-towards items (\citeproc{ref-harpe2015}{Harpe 2015}). Participants can
-also mark a point on a continuous rating scale to indicate their
-preference or attitude. Commonly used in surveys, product reviews, and
-psychological assessments, this method provides a more nuanced measure
-than discrete scales. Rating-scale sampling is simple for participants
-to understand and use, provides rich data on the intensity of
-preferences, and is flexible enough for various measurements (e.g.,
-agreement, satisfaction). However, rating-scale sampling methods also
-have limitations. Ratings can be influenced by personal biases and
-interpretations of scales, leading to subjectivity. There is a central
-tendency bias, where participants may avoid extreme ratings, resulting
-in clustering responses around the middle. Different participants might
-interpret scale points differently, and fixed-point scales may not
-capture the full nuance of participants' preferences or attitudes.
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{from matplotlib.colors import LinearSegmentedColormap}
-\NormalTok{likert\_cmap = LinearSegmentedColormap.from\_list("likert\_scale", ["red", "blue"], N=5)}
-\NormalTok{normalized = (rewards {-} rewards.min()) / (rewards.max() {-} rewards.min())}
-\NormalTok{ratings = np.round(normalized * 4).squeeze()}
-
-\NormalTok{draw\_surface()}
-\NormalTok{scatter = plt.scatter(items[:, 0], items[:, 1], c=ratings, cmap=likert\_cmap, alpha=0.5)}
-\NormalTok{plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{from}\NormalTok{ matplotlib.colors }\ImportTok{import}\NormalTok{ LinearSegmentedColormap}
-\NormalTok{likert\_cmap }\OperatorTok{=}\NormalTok{ LinearSegmentedColormap.from\_list(}\StringTok{"likert\_scale"}\NormalTok{, [}\StringTok{"red"}\NormalTok{, }\StringTok{"blue"}\NormalTok{], N}\OperatorTok{=}\DecValTok{5}\NormalTok{)}
-\NormalTok{normalized }\OperatorTok{=}\NormalTok{ (rewards }\OperatorTok{{-}}\NormalTok{ rewards.}\BuiltInTok{min}\NormalTok{()) }\OperatorTok{/}\NormalTok{ (rewards.}\BuiltInTok{max}\NormalTok{() }\OperatorTok{{-}}\NormalTok{ rewards.}\BuiltInTok{min}\NormalTok{())}
-\NormalTok{ratings }\OperatorTok{=}\NormalTok{ np.}\BuiltInTok{round}\NormalTok{(normalized }\OperatorTok{*} \DecValTok{4}\NormalTok{).squeeze()}
-
-\NormalTok{draw\_surface()}
-\NormalTok{scatter }\OperatorTok{=}\NormalTok{ plt.scatter(items[:, }\DecValTok{0}\NormalTok{], items[:, }\DecValTok{1}\NormalTok{], c}\OperatorTok{=}\NormalTok{ratings, cmap}\OperatorTok{=}\NormalTok{likert\_cmap, alpha}\OperatorTok{=}\FloatTok{0.5}\NormalTok{)}
-\NormalTok{plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\pandocbounded{\includegraphics[keepaspectratio]{src/chap2_files/figure-pdf/cell-5-output-1.pdf}}
-
-Suppose we have a single example with attributes \(z_i\) and wish to
-know which of \(J\) rating scales an individual will choose from. We can
-define \(J - 1\) parameters, which act as thresholds on the reward
-computed by \(u_i = u_{i,j}^*\) to classify the predicted choice between
-these items. For example, if there are three predefined items, we can
-define parameters \(a, b \in \mathbb{R}\) such that \[
-y_i =
-\begin{cases} 
-    1 & u < a \\
-    2 & a \le u < b \\
-    3 & \text{else}
-\end{cases}
-\]
-
-By assuming the noise distribution to be either logistic or standard
-normal, we have \[
-\begin{split}
-    p(y_i = 1) & = p(u < a) = p(u_{i,j}^* + \epsilon < a) = \psi(a-u_{i,j}^*) \\
-    p(y_i = 2) & = p(a \le u < b) = p(a - u_{i,j}^* \le \epsilon < b - u_{i,j}^*) = \psi(b-u_{i,j}^*)  - \psi(a-u_{i,j}^*) \\
-    p(y_i = 3) & = p(u \ge b) = p(u_{i,j}^* + \epsilon \ge b ) = p( \epsilon \ge b - u_{i,j}^*) = 1 - \psi(b-u_{i,j}^*)
-\end{split}
-\]
-
-Having the model, we next explore the estimation of model parameters. A
-common approach for parameter estimation is maximum likelihood
-(\citeproc{ref-book_estimation_casella}{Casella and Berger 1990};
-\citeproc{ref-book_estimation_bock}{Bock et al. 2015}). The likelihood
-of a model is the probability of the observed data given the model
-parameters; intuitively, we wish to maximize this likelihood, as that
-would mean that our model associates observed human preferences with
-high probability. Assuming our data is independent and identically
-distributed (iid), the likelihood over the entire dataset is the joint
-probability of all observed data. Under the binary choice model with
-logistic noise, it is
-
-\[\mathcal{L}(z, Y; \beta) = \prod_{i = 1}^J p(y = y_i | z_i; \beta) = \prod_{i = 1}^J \psi(u_{i,j}^*)^{y_i} \left(1 - \psi(u_{i,j}^*)\right)^{1 - y_i}, \quad \psi(u) = \frac{1}{1 + \exp(-u)}\]
-
-This objective can be optimized with a gradient-based method, such as
-gradient descent (\citeproc{ref-gradient_descent}{Ruder 2016}). Gradient
-descent operates by computing the gradient of the objective with respect
-to the parameters of the model, which provides a signal of the direction
-in which the parameters must move to minimize the objective. Then,
-stochastic gradient descent (SGD) makes an update step by subtracting
-this gradient from the parameters
-(most often with a scale factor called a learning rate) to move the
-parameters in a direction that minimizes the objective. In the case of
-logistic and Gaussian models, SGD may yield a challenging optimization
-problem as its stochasticity can lead to noisy updates, for example, if
-certain examples or batches of examples are biased. Mitigations include
-batched SGD, in which multiple samples are randomly sampled from the
-dataset at each iteration; learning rates, which reduce the impact of
-noisy gradient updates, and momentum and higher-order optimizers, which
-reduce noise by using moving averages of gradients or provide better
-estimates of the best direction in which to update the parameters.
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{import numpy as np}
-\NormalTok{from scipy.optimize import minimize}
-\NormalTok{from sklearn.metrics import roc\_auc\_score}
-\NormalTok{from tqdm import tqdm}
-
-\NormalTok{\# Set random seed for reproducibility (optional)}
-\NormalTok{np.random.seed(42)}
-
-\NormalTok{\# Number of users and items}
-\NormalTok{num\_users = 50}
-\NormalTok{num\_items = 100}
-
-\NormalTok{\# Generate user{-}specific and item{-}specific rewards}
-\NormalTok{theta\_true = np.random.randn(num\_users)}
-\NormalTok{z\_true = np.random.randn(num\_items)}
-
-\NormalTok{\# Define the logistic (sigmoid) function}
-\NormalTok{def sigmoid(x):}
-\NormalTok{    return 1.0 / (1.0 + np.exp({-}x))}
-
-\NormalTok{\# Generate observed choices using the logistic function}
-\NormalTok{\# Compute probability matrix: shape (num\_users, num\_items)}
-\NormalTok{probs = sigmoid(theta\_true[:, None] {-} z\_true[None, :])}
-\NormalTok{\# Sample binary responses (0 or 1) from a Bernoulli distribution}
-\NormalTok{data = np.random.binomial(1, probs)}
-
-\NormalTok{\# Mask out a fraction of the response matrix (80\% observed, 20\% missing)}
-\NormalTok{mask = np.random.rand(num\_users, num\_items) \textgreater{} 0.2  \# boolean mask}
-\NormalTok{\# Create a version of the data with missing values (not needed for optimization, but for reference)}
-\NormalTok{data\_masked = data.copy().astype(float)}
-\NormalTok{data\_masked[\textasciitilde{}mask] = np.nan}
-
-\NormalTok{\# Count of observed entries (used for averaging)}
-\NormalTok{observed\_count = np.sum(mask)}
-
-\NormalTok{\# We will optimize over parameters theta and z.}
-\NormalTok{\# Initialize estimates (random starting points)}
-\NormalTok{theta\_init = np.random.randn(num\_users)}
-\NormalTok{z\_init = np.random.randn(num\_items)}
-
-\NormalTok{\# Pack parameters into a single vector for the optimizer.}
-\NormalTok{\# First num\_users elements are theta\_est, next num\_items are z\_est.}
-\NormalTok{params\_init = np.concatenate([theta\_init, z\_init])}
-
-\NormalTok{def objective(params):}
-\NormalTok{    """}
-\NormalTok{    Computes the loss and gradient for the current parameters.}
-\NormalTok{    Loss is defined as the negative log likelihood (averaged over observed entries).}
-\NormalTok{    """}
-\NormalTok{    \# Unpack parameters}
-\NormalTok{    theta = params[:num\_users]}
-\NormalTok{    z = params[num\_users:]}
-    
-\NormalTok{    \# Compute difference and estimated probabilities}
-\NormalTok{    diff = theta[:, None] {-} z[None, :]  \# shape: (num\_users, num\_items)}
-\NormalTok{    sigma = sigmoid(diff)}
-    
-\NormalTok{    \# To avoid log(0), clip probabilities a little bit}
-\NormalTok{    eps = 1e{-}8}
-\NormalTok{    sigma = np.clip(sigma, eps, 1 {-} eps)}
-    
-\NormalTok{    \# Compute negative log likelihood only on observed entries}
-\NormalTok{    \# For each observed entry: if data == 1 then {-}log(sigma) else {-}log(1{-}sigma)}
-\NormalTok{    log\_likelihood = data * np.log(sigma) + (1 {-} data) * np.log(1 {-} sigma)}
-\NormalTok{    loss = {-}np.sum(mask * log\_likelihood) / observed\_count}
-    
-\NormalTok{    \# Compute gradient with respect to the difference x = theta\_i {-} z\_j}
-\NormalTok{    \# d(loss)/d(x) = sigma {-} data  (for observed entries, zero otherwise)}
-\NormalTok{    diff\_grad = (sigma {-} data) * mask  \# shape: (num\_users, num\_items)}
-    
-\NormalTok{    \# Gradients for theta: sum over items (axis 1)}
-\NormalTok{    grad\_theta = np.sum(diff\_grad, axis=1) / observed\_count}
-\NormalTok{    \# Gradients for z: negative sum over users (axis 0)}
-\NormalTok{    grad\_z = {-}np.sum(diff\_grad, axis=0) / observed\_count}
-    
-\NormalTok{    \# Pack gradients back into a single vector}
-\NormalTok{    grad = np.concatenate([grad\_theta, grad\_z])}
-\NormalTok{    return loss, grad}
-
-\NormalTok{\# Callback to track progress (optional)}
-\NormalTok{iteration\_progress = tqdm()}
-
-\NormalTok{def callback(xk):}
-\NormalTok{    iteration\_progress.update(1)}
-
-\NormalTok{\# Optimize using L{-}BFGS{-}B}
-\NormalTok{result = minimize(}
-\NormalTok{    fun=lambda params: objective(params),}
-\NormalTok{    x0=params\_init,}
-\NormalTok{    method="L{-}BFGS{-}B",}
-\NormalTok{    jac=True,}
-\NormalTok{    callback=callback,}
-\NormalTok{    options=\{"maxiter": 100, "disp": True\}}
-\NormalTok{)}
-\NormalTok{iteration\_progress.close()}
-
-\NormalTok{\# Extract the estimated parameters}
-\NormalTok{theta\_est = result.x[:num\_users]}
-\NormalTok{z\_est = result.x[num\_users:]}
-
-\NormalTok{\# Compute final estimated probabilities}
-\NormalTok{probs\_final = sigmoid(theta\_est[:, None] {-} z\_est[None, :])}
-
-\NormalTok{\# Compute AUC ROC on observed (training) and missing (test) entries}
-\NormalTok{train\_probs = probs\_final[mask]}
-\NormalTok{test\_probs = probs\_final[\textasciitilde{}mask]}
-\NormalTok{train\_labels = data[mask]}
-\NormalTok{test\_labels = data[\textasciitilde{}mask]}
-
-\NormalTok{auc\_train = roc\_auc\_score(train\_labels, train\_probs)}
-\NormalTok{auc\_test = roc\_auc\_score(test\_labels, test\_probs)}
-
-\NormalTok{print(f"Train AUC: \{auc\_train:.4f\}")}
-\NormalTok{print(f"Test AUC: \{auc\_test:.4f\}")}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ torch}
-\ImportTok{import}\NormalTok{ torch.nn }\ImportTok{as}\NormalTok{ nn}
-\ImportTok{import}\NormalTok{ torch.optim }\ImportTok{as}\NormalTok{ optim}
-\ImportTok{from}\NormalTok{ torch.distributions }\ImportTok{import}\NormalTok{ Bernoulli}
-\ImportTok{from}\NormalTok{ tqdm }\ImportTok{import}\NormalTok{ tqdm}
-
-\CommentTok{\# Set device}
-\NormalTok{device }\OperatorTok{=}\NormalTok{ torch.device(}\StringTok{"cuda"} \ControlFlowTok{if}\NormalTok{ torch.cuda.is\_available() }\ControlFlowTok{else} \StringTok{"cpu"}\NormalTok{)}
-
-\CommentTok{\# Number of users and items}
-\NormalTok{num\_users }\OperatorTok{=} \DecValTok{50}
-\NormalTok{num\_items }\OperatorTok{=} \DecValTok{100}
-
-\CommentTok{\# Generate user{-}specific and item{-}specific rewards}
-\NormalTok{theta }\OperatorTok{=}\NormalTok{ torch.randn(num\_users, device}\OperatorTok{=}\NormalTok{device, requires\_grad}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
-\NormalTok{z }\OperatorTok{=}\NormalTok{ torch.randn(num\_items, device}\OperatorTok{=}\NormalTok{device, requires\_grad}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
-
-\CommentTok{\# Generate observed choices using logistic function}
-\NormalTok{probs }\OperatorTok{=}\NormalTok{ torch.sigmoid(theta[:, }\VariableTok{None}\NormalTok{] }\OperatorTok{{-}}\NormalTok{ z[}\VariableTok{None}\NormalTok{, :])}
-\NormalTok{data }\OperatorTok{=}\NormalTok{ Bernoulli(probs}\OperatorTok{=}\NormalTok{probs).sample()}
-
-\CommentTok{\# Mask out a fraction of the response matrix}
-\NormalTok{mask }\OperatorTok{=}\NormalTok{ torch.rand\_like(data) }\OperatorTok{\textgreater{}} \FloatTok{0.2}  \CommentTok{\# 80\% observed, 20\% missing}
-\NormalTok{data\_masked }\OperatorTok{=}\NormalTok{ data.clone()}
-\NormalTok{data\_masked[}\OperatorTok{\textasciitilde{}}\NormalTok{mask] }\OperatorTok{=} \BuiltInTok{float}\NormalTok{(}\StringTok{\textquotesingle{}nan\textquotesingle{}}\NormalTok{)}
-
-\CommentTok{\# Initialize parameters for EM algorithm}
-\NormalTok{theta\_est }\OperatorTok{=}\NormalTok{ torch.randn(num\_users, device}\OperatorTok{=}\NormalTok{device, requires\_grad}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
-\NormalTok{z\_est }\OperatorTok{=}\NormalTok{ torch.randn(num\_items, device}\OperatorTok{=}\NormalTok{device, requires\_grad}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
-
-\CommentTok{\# Optimizer}
-\NormalTok{optimizer }\OperatorTok{=}\NormalTok{ optim.LBFGS([theta\_est, z\_est], lr}\OperatorTok{=}\FloatTok{0.1}\NormalTok{, max\_iter}\OperatorTok{=}\DecValTok{20}\NormalTok{, history\_size}\OperatorTok{=}\DecValTok{10}\NormalTok{, line\_search\_fn}\OperatorTok{=}\StringTok{"strong\_wolfe"}\NormalTok{)}
-
-\KeywordTok{def}\NormalTok{ closure():}
-\NormalTok{    optimizer.zero\_grad()}
-\NormalTok{    probs\_est }\OperatorTok{=}\NormalTok{ torch.sigmoid(theta\_est[:, }\VariableTok{None}\NormalTok{] }\OperatorTok{{-}}\NormalTok{ z\_est[}\VariableTok{None}\NormalTok{, :])}
-\NormalTok{    loss }\OperatorTok{=} \OperatorTok{{-}}\NormalTok{(Bernoulli(probs}\OperatorTok{=}\NormalTok{probs\_est).log\_prob(data) }\OperatorTok{*}\NormalTok{ mask).mean()}
-\NormalTok{    loss.backward()}
-    \ControlFlowTok{return}\NormalTok{ loss}
-
-\CommentTok{\# EM Algorithm}
-\NormalTok{pbar }\OperatorTok{=}\NormalTok{ tqdm(}\BuiltInTok{range}\NormalTok{(}\DecValTok{100}\NormalTok{))}
-\ControlFlowTok{for}\NormalTok{ iteration }\KeywordTok{in}\NormalTok{ pbar:}
-    \ControlFlowTok{if}\NormalTok{ iteration }\OperatorTok{\textgreater{}} \DecValTok{0}\NormalTok{:}
-\NormalTok{        previous\_theta }\OperatorTok{=}\NormalTok{ theta\_est.clone()}
-\NormalTok{        previous\_z }\OperatorTok{=}\NormalTok{ z\_est.clone()}
-\NormalTok{        previous\_loss }\OperatorTok{=}\NormalTok{ loss.clone()}
-    
-\NormalTok{    loss }\OperatorTok{=}\NormalTok{ optimizer.step(closure)}
-    
-    \ControlFlowTok{if}\NormalTok{ iteration }\OperatorTok{\textgreater{}} \DecValTok{0}\NormalTok{:}
-\NormalTok{        d\_loss }\OperatorTok{=}\NormalTok{ (previous\_loss }\OperatorTok{{-}}\NormalTok{ loss).item()}
-\NormalTok{        d\_theta }\OperatorTok{=}\NormalTok{ torch.norm(previous\_theta }\OperatorTok{{-}}\NormalTok{ theta\_est, p}\OperatorTok{=}\DecValTok{2}\NormalTok{).item()}
-\NormalTok{        d\_z }\OperatorTok{=}\NormalTok{ torch.norm(previous\_z }\OperatorTok{{-}}\NormalTok{ z\_est, p}\OperatorTok{=}\DecValTok{2}\NormalTok{).item()}
-\NormalTok{        grad\_norm }\OperatorTok{=}\NormalTok{ torch.norm(optimizer.param\_groups[}\DecValTok{0}\NormalTok{][}\StringTok{"params"}\NormalTok{][}\DecValTok{0}\NormalTok{].grad, p}\OperatorTok{=}\DecValTok{2}\NormalTok{).item()}
-\NormalTok{        grad\_norm }\OperatorTok{+=}\NormalTok{ torch.norm(optimizer.param\_groups[}\DecValTok{0}\NormalTok{][}\StringTok{"params"}\NormalTok{][}\DecValTok{1}\NormalTok{].grad, p}\OperatorTok{=}\DecValTok{2}\NormalTok{).item()}
-\NormalTok{        pbar.set\_postfix(\{}\StringTok{"grad\_norm"}\NormalTok{: grad\_norm, }\StringTok{"d\_theta"}\NormalTok{: d\_theta, }\StringTok{"d\_z"}\NormalTok{: d\_z, }\StringTok{"d\_loss"}\NormalTok{: d\_loss\})}
-        \ControlFlowTok{if}\NormalTok{ d\_loss }\OperatorTok{\textless{}} \FloatTok{1e{-}5} \KeywordTok{and}\NormalTok{ d\_theta }\OperatorTok{\textless{}} \FloatTok{1e{-}5} \KeywordTok{and}\NormalTok{ d\_z }\OperatorTok{\textless{}} \FloatTok{1e{-}5} \KeywordTok{and}\NormalTok{ grad\_norm }\OperatorTok{\textless{}} \FloatTok{1e{-}5}\NormalTok{:}
-            \ControlFlowTok{break}
-
-\CommentTok{\# Compute AUC ROC on observed and inferred data}
-\ImportTok{from}\NormalTok{ torchmetrics }\ImportTok{import}\NormalTok{ AUROC}
-\NormalTok{auroc }\OperatorTok{=}\NormalTok{ AUROC(task}\OperatorTok{=}\StringTok{"binary"}\NormalTok{)}
-\NormalTok{probs\_final }\OperatorTok{=}\NormalTok{ torch.sigmoid(theta\_est[:, }\VariableTok{None}\NormalTok{] }\OperatorTok{{-}}\NormalTok{ z\_est[}\VariableTok{None}\NormalTok{, :])}
-\NormalTok{train\_probs }\OperatorTok{=}\NormalTok{ probs\_final[mask]}
-\NormalTok{test\_probs }\OperatorTok{=}\NormalTok{ probs\_final[}\OperatorTok{\textasciitilde{}}\NormalTok{mask]}
-\NormalTok{train\_labels }\OperatorTok{=}\NormalTok{ data[mask]}
-\NormalTok{test\_labels }\OperatorTok{=}\NormalTok{ data[}\OperatorTok{\textasciitilde{}}\NormalTok{mask]}
-\NormalTok{auc\_train }\OperatorTok{=}\NormalTok{ auroc(train\_probs, train\_labels)}
-\NormalTok{auc\_test }\OperatorTok{=}\NormalTok{ auroc(test\_probs, test\_labels)}
-\BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f"train auc: }\SpecialCharTok{\{}\NormalTok{auc\_train}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{)}
-\BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f"test auc: }\SpecialCharTok{\{}\NormalTok{auc\_test}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-train auc: 0.8305394053459167
-test auc: 0.7656601071357727
-\end{verbatim}
-
-\subsection{Pairwise Model}\label{pairwise-model}
-
-In \emph{pairwise sampling}, participants compare two items to determine
-which is preferred. One of the major advantages of this method is the
-low cognitive demand for raters. Its disadvantage is the limited amount
-of information content elicited by a sample. Below is a survey based on
-pairwise sampling:
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{n\_pairs = 10000}
-\NormalTok{pair\_indices = np.random.randint(0, n\_items, size=(n\_pairs, 2))}
-\NormalTok{\# Exclude pairs where both indices are the same}
-\NormalTok{mask = pair\_indices[:, 0] != pair\_indices[:, 1]}
-\NormalTok{pair\_indices = pair\_indices[mask]}
-
-\NormalTok{scores = np.zeros(n\_items, dtype=int)}
-\NormalTok{wins = rewards[pair\_indices[:, 0]] \textgreater{} rewards[pair\_indices[:, 1]]}
-
-\NormalTok{\# For pairs where the first item wins:}
-\NormalTok{\#   {-} Increase score for the first item by 1}
-\NormalTok{\#   {-} Decrease score for the second item by 1}
-\NormalTok{np.add.at(scores, pair\_indices[wins, 0], 1)}
-\NormalTok{np.add.at(scores, pair\_indices[wins, 1], {-}1)}
-
-\NormalTok{\# For pairs where the second item wins or it\textquotesingle{}s a tie:}
-\NormalTok{\#   {-} Decrease score for the first item by 1}
-\NormalTok{\#   {-} Increase score for the second item by 1}
-\NormalTok{np.add.at(scores, pair\_indices[\textasciitilde{}wins, 0], {-}1)}
-\NormalTok{np.add.at(scores, pair\_indices[\textasciitilde{}wins, 1], 1)}
-
-\NormalTok{\# Determine preferred and non{-}preferred items based on scores}
-\NormalTok{preferred = scores \textgreater{} 0}
-\NormalTok{non\_preferred = scores \textless{} 0}
-
-\NormalTok{draw\_surface()}
-\NormalTok{plt.scatter(items[preferred, 0], items[preferred, 1], c=\textquotesingle{}blue\textquotesingle{}, label=\textquotesingle{}Preferred\textquotesingle{}, alpha=0.5)}
-\NormalTok{plt.scatter(items[non\_preferred, 0], items[non\_preferred, 1], c=\textquotesingle{}purple\textquotesingle{}, label=\textquotesingle{}Non{-}preferred\textquotesingle{}, alpha=0.5)}
-\NormalTok{plt.legend()}
-\NormalTok{plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{n\_pairs }\OperatorTok{=} \DecValTok{10000}
-\NormalTok{pair\_indices }\OperatorTok{=}\NormalTok{ np.random.randint(}\DecValTok{0}\NormalTok{, n\_items, size}\OperatorTok{=}\NormalTok{(n\_pairs, }\DecValTok{2}\NormalTok{))}
-\CommentTok{\# Exclude pairs where both indices are the same}
-\NormalTok{mask }\OperatorTok{=}\NormalTok{ pair\_indices[:, }\DecValTok{0}\NormalTok{] }\OperatorTok{!=}\NormalTok{ pair\_indices[:, }\DecValTok{1}\NormalTok{]}
-\NormalTok{pair\_indices }\OperatorTok{=}\NormalTok{ pair\_indices[mask]}
-
-\NormalTok{scores }\OperatorTok{=}\NormalTok{ np.zeros(n\_items, dtype}\OperatorTok{=}\BuiltInTok{int}\NormalTok{)}
-\NormalTok{wins }\OperatorTok{=}\NormalTok{ rewards[pair\_indices[:, }\DecValTok{0}\NormalTok{]] }\OperatorTok{\textgreater{}}\NormalTok{ rewards[pair\_indices[:, }\DecValTok{1}\NormalTok{]]}
-
-\CommentTok{\# For pairs where the first item wins:}
-\CommentTok{\#   {-} Increase score for the first item by 1}
-\CommentTok{\#   {-} Decrease score for the second item by 1}
-\NormalTok{np.add.at(scores, pair\_indices[wins, }\DecValTok{0}\NormalTok{], }\DecValTok{1}\NormalTok{)}
-\NormalTok{np.add.at(scores, pair\_indices[wins, }\DecValTok{1}\NormalTok{], }\OperatorTok{{-}}\DecValTok{1}\NormalTok{)}
-
-\CommentTok{\# For pairs where the second item wins or it\textquotesingle{}s a tie:}
-\CommentTok{\#   {-} Decrease score for the first item by 1}
-\CommentTok{\#   {-} Increase score for the second item by 1}
-\NormalTok{np.add.at(scores, pair\_indices[}\OperatorTok{\textasciitilde{}}\NormalTok{wins, }\DecValTok{0}\NormalTok{], }\OperatorTok{{-}}\DecValTok{1}\NormalTok{)}
-\NormalTok{np.add.at(scores, pair\_indices[}\OperatorTok{\textasciitilde{}}\NormalTok{wins, }\DecValTok{1}\NormalTok{], }\DecValTok{1}\NormalTok{)}
-
-\CommentTok{\# Determine preferred and non{-}preferred items based on scores}
-\NormalTok{preferred }\OperatorTok{=}\NormalTok{ scores }\OperatorTok{\textgreater{}} \DecValTok{0}
-\NormalTok{non\_preferred }\OperatorTok{=}\NormalTok{ scores }\OperatorTok{\textless{}} \DecValTok{0}
-
-\NormalTok{draw\_surface()}
-\NormalTok{plt.scatter(items[preferred, }\DecValTok{0}\NormalTok{], items[preferred, }\DecValTok{1}\NormalTok{], c}\OperatorTok{=}\StringTok{\textquotesingle{}blue\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Preferred\textquotesingle{}}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.5}\NormalTok{)}
-\NormalTok{plt.scatter(items[non\_preferred, }\DecValTok{0}\NormalTok{], items[non\_preferred, }\DecValTok{1}\NormalTok{], c}\OperatorTok{=}\StringTok{\textquotesingle{}purple\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Non{-}preferred\textquotesingle{}}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.5}\NormalTok{)}
-\NormalTok{plt.legend()}
-\NormalTok{plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\pandocbounded{\includegraphics[keepaspectratio]{src/chap2_files/figure-pdf/cell-7-output-1.pdf}}
-
-The Bradley-Terry model compares the reward of choice over all others
-(\citeproc{ref-bradley-terry-model}{Bradley and Terry 1952}) in the set
-of \(J\) choices \(i \in \{1, 2, \dots, J\}\). Each choice can also have
-its unique random noise variable representing the unobserved factor.
-However, we can also choose to have all choices' unobserved factors
-follow the same distribution (e.g., independent and identically
-distributed, IID). The noise is represented as an extreme value
-distribution, although we can choose alternatives such as a multivariate
-Gaussian distribution: \(\epsilon \sim \mathcal{N}(0, \Sigma)\). If
-\(\Sigma\) is not a diagonal matrix, we effectively model correlations
-in the noise across choices, enabling us to avoid the IID assumption. In
-the case of the extreme value distribution, we model the probability of
-a user preferring choice \(i\), which we denote as
-\(P_i = Z^{-1}\exp(u_{i}^*)\) where
-\(Z = \sum_{j = 1}^{J} \exp(u_{j}^*)\).
-
-We can model an open-ended ranking of the available items with the
-Plackett-Luce model, in which we jointly model the full sequence of
-choice ordering (\citeproc{ref-plackett_luce}{Plackett 1975}). The
-general form models the joint distribution as the product of conditional
-probabilities, where each is conditioned on the preceding ranking terms.
-Given an ordering of \(J\) choices \(\{y_1, \dots, y_J\}\), we factorize
-the joint probability into conditionals. Each conditional follows the
-Bradley-Terry model: \[
-p(y_1, \dots, y_J) = p(y_1) p(y_2 | y_1) \cdots p(y_J | y_{1:{J - 1}}) = \prod_{k = 1}^J \frac{\exp(u_{y_k}^*)}{\sum_{l = k}^{J} \exp(u_{y_l}^*)}
-\] where \(u_{y_k}^*\) denotes the utility of the item ranked in position \(k\).
-
-Pairwise sampling has proven useful in aligning large language models
-(LLMs) with human preferences. LLMs, such as GPT-4, Llama 3.2, and BERT,
-are large pre-trained neural networks that serve
-as the basis for various downstream tasks. They are pre-trained on a
-massive corpus of text data, learning to understand language and
-context. They are capable of multiple language-related tasks such as
-text classification, language generation, and question answering. An LLM
-should be aligned to respond correctly based on human preferences. A
-promising approach is to train LLMs using reinforcement learning (RL)
-with the reward model (RM) learned from human preference data, providing
-a mechanism to score the quality of the generated text. This approach,
-known as RL from human feedback (RLHF), leverages human feedback to
-guide model training, allowing LLMs to better align with human
-expectations while continuously improving performance.
-
-We discuss the reward model used in the Llama2 model. The Llama2 RM
-(\citeproc{ref-2307.09288}{Touvron et al. 2023}) is initialized from the
-pretrained Llama2 LLM. In the LLM, the last layer is a mapping
-\(L: \mathbb{R}^D \rightarrow \mathbb{R}^V\), where \(D\) is the
-embedding dimension from the transformer decoder stack and \(V\) is the
-vocabulary size. To get the RM, we replace that last layer with a
-randomly initialized scalar head that maps
-\(L: \mathbb{R}^D \rightarrow \mathbb{R}^1\). It's important to
-initialize the RM from the LLM it's meant to evaluate. The RM will have
-the same ``knowledge'' as the LLM. This is particularly useful for
-evaluation objectives such as ``Does the LLM know when it doesn't
-know?''. However, in cases where the RM is simply evaluating helpfulness
-or factuality, it may be helpful to have the RM know more. In addition,
-the RM is on distribution for the LLM - it is initialized in a way where
-it semantically understands the LLM's outputs. An RM is trained with
-paired preferences (prompt history, accepted response, rejected
-response). Prompt history is a multiturn history of user prompts and
-model generations; the accepted response is the preferred final model
-generation by an annotator, and the rejected response is the unpreferred
-response. The RM is trained with maximum likelihood under the
-Bradley-Terry model with an optional margin term \(m(r)\):
-
-\[p(y_c \succ y_r | x) = \sigma(r_\theta(x,y_c) - r_\theta(x,y_r) - m(r))\]
-
-The margin term increases the distance in scores specifically for
-preference pairs annotators rate as easier to separate. Margins were
-designed primarily based on the observation that the sigmoid function,
-which is used to normalize the raw reward model score, flattens out
-beyond the range of \([-4, 4]\). Thus, the maximum possible margin is
-eight. A small regularization term is often added to center the score
-distribution on 0. We consider two variants of the preference
-rating-based margin. In the small-margin variant, the margins are 1 for
-``Significantly Better'', 2/3 for ``Better'', 1/3 for ``Slightly
-Better'', and 0 for ``Negligibly Better or Unsure''. In the
-large-margin variant, the margins are 3 for ``Significantly Better'',
-2 for ``Better'', 1 for ``Slightly Better'', and 0 for ``Negligibly
-Better or Unsure''.
-
-\subsection{List-wise Model}\label{list-wise-model}
-
-\emph{Multiple-choice sampling} involves participants selecting one item
-from a set of alternatives. Multiple-choice sampling is simple for
-participants to understand and reflects realistic decision-making
-scenarios where individuals choose one item from many. It is beneficial
-in complex choice scenarios, such as modes of transportation, where
-choices are not independent (\citeproc{ref-bolt2009}{Bolt and Wollack
-2009}). Multiple-choice sampling often relies on simplistic assumptions
-such as the independence of irrelevant alternatives (IIA), which may not
-always hold true. This method may also fail to capture the variation in
-preferences among different individuals, as it typically records only
-the most preferred choice without accounting for the relative importance
-of other items. In \emph{rank-order sampling}, participants rank items
-from most to least preferred. Used in voting, market research, and
-psychology, it provides rich preference data but is more complex and
-cognitively demanding than pairwise comparisons, especially for large
-item sets. Participants may also rank inconsistently
-(\citeproc{ref-ragain2019}{Ragain and Ugander 2019}). In
-\emph{best-worst scaling} (BWS), participants are presented with items and
-asked to identify the most and least preferred items. The primary
-objective of BWS is to discern the relative importance or preference of
-items, making it widely applicable in various fields such as market
-research, health economics, and social sciences
-(\citeproc{ref-campbell2015}{Campbell and Erdem 2015}). BWS provides
-rich data on the relative importance of items, helps clarify
-preferences, reduces biases found in traditional rating scales, and
-results in rewards that are easy to interpret. However, BWS also has
-limitations, including potential scale interpretation differences among
-participants and design challenges to avoid biases, such as the order
-effect or the context in which items are presented.
-
-\section{The Utility Function Class}\label{function-class}
-
-\subsection{Parametric and Nonparametric Function
-Class}\label{parametric-and-nonparametric-function-class}
-
-The reward of the item can take parametric form, such as
-\(z_j = f_{\theta}(x_j)\). It can also take the nonparametric form,
-which is commonly used in the ideal point model, where the reward of an
-item \(j\) is calculated by the distance from the item to the human in
-some embedding space (\citeproc{ref-huber1976ideal}{Huber 1976}). Given
-vector representation \(e_i\) of choice \(i\) and a vector \(v_n\)
-representing an individual \(n\), we can use a distance function \(K\)
-to model a stochastic reward function with the unobserved factors
-following a specified distribution:
-\(u_{n, i} = K(e_i, v_n) + \epsilon_{n, i}\). The intuition is that
-vectors exist in a shared \(n\)-dimensional space, and as such, we can
-use geometry to match choices whose representations are closest to that
-of a given individual (\citeproc{ref-ideal_point}{Jamieson and Nowak
-2011}; \citeproc{ref-tatli2022distancepreferences}{Tatli, Nowak, and
-Vinayak 2022}) when equipped with a distance metric. Certain distance
-metrics, such as Euclidean distance or inner product, can easily be
-biased by the scale of vectors. A distance measure such as cosine
-similarity, which compensates for scale by normalizing the inner product
-of two vectors by the product of their magnitudes, can mitigate this
-bias yet may discard valuable information encoded by the length of the
-vectors. Beyond the distance metric alone, this model places a strong
-inductive bias that the individual and choice representations share a
-common embedding space. In some contexts, this can be a robust bias to
-add to the model (\citeproc{ref-idealpoints}{Greiner 2005}), but it is a
-key factor one must consider before employing such a model, and it is a
-key design choice for modeling.
-
-\subsection{Unimodal and Multimodal Function
-Class}\label{unimodal-and-multimodal-function-class}
-
-So far, we have considered learning from data from one person with a
-particular set of preferences or a group with similar preferences, but
-this is not always the case. Consider a scenario where a user turns left
-at an intersection (\citeproc{ref-myers2021learning}{Myers et al.
-2021}). What would they do if they saw a car speeding down the road
-approaching them? Following a timid driving pattern, some vehicles would
-stop to let the other car go, preventing a collision. Other vehicles
-would be more aggressive and try to make the turn before colliding with
-the oncoming vehicle. Given the data of one of these driving patterns,
-the model can make an appropriate decision. However, what if the model
-was given data from both aggressive and timid drivers and does not know
-which data corresponds to which type of driver? A naive preference
-learning approach would result in a model trying to find a policy close
-enough to both driving patterns. The group label is often unobserved
-because it is expensive to obtain or a data point cannot be cleanly
-separated into any group (e.g., a more timid driver can be aggressive
-when they are in a hurry).
-
-Myers et al. (\citeproc{ref-myers2022learning}{2022}) formulate this
-problem as learning a mixture of \(M\) linear reward functions on the
-embedding space, where \(M\) is given. The reward that expert \(m\)
-assigns to item \(j\) is \(f_m(e_j) = w^\top_m e_j,\) where \(w_m\)
-is a vector of parameters corresponding to the \(m\)-th expert's
-preferences. An unknown distribution over the reward parameters exists,
-and we can represent this distribution with convex mixing coefficients
-\(\alpha = [\alpha_1, ..., \alpha_M]\). Consider a robot that performs
-the following trajectories and asks a user to rank all the trajectories.
-The robot will be given back a set of trajectory rankings from M humans,
-and the objective is to learn the underlying reward function. Given the
-ranking \((j_1 \succ ... \succ j_K | m)\) of expert \(m\) and defining
-\(\theta = \{w_{1:M}, \alpha_{1:M}\}\), the probability of the ranking,
-marginalized over the unknown expert, is
-
-\[p(j_1 \succ ... \succ j_K | \theta) = \sum_{m = 1}^M \alpha_m \prod_{k = 1}^K  p_{mk},\]
-where \(p_{mk}\) is the Plackett-Luce probability that expert \(m\)
-chooses item \(j_k\) among the items not yet ranked.
-
-Then the posterior over the parameters is
-\(p(\theta | Q_{1:T}, x_{1:T}) \propto p(\theta) \prod_t p(x_t | Q_{\leq t}, \theta) = p(\theta) \prod_t p(x_t | \theta, Q_t)\).
-The proportionality follows from Bayes' rule and the assumption that
-the queries at timestamp \(t\) are conditionally independent of the
-parameters given history. This assumption is reasonable because the
-previous queries \& rankings ideally give all the information to inform
-the choice of the next set. The final equality comes from the
-assumption that the ranked queries are conditionally independent given
-the parameters. The prior distribution is dependent on the use case. For
-example, in the user studies conducted by the authors to verify this
-method, they use a standard Gaussian for the reward weights and the
-mixing coefficients to be uniform on a \(M - 1\) simplex to ensure that
-they add up to 1. Then, we can use maximum likelihood estimation to
-compute the parameters with the simplified posterior.
-
-Another example setting with multimodal preferences is negotiation
-(\citeproc{ref-kwon2021targeted}{Kwon et al. 2021}). Suppose there are
-some shared items and two people with different utilities and desires
-for items, where each person only knows their own utility. In the specific
-case of \textbf{?@fig-negotiation}, Bob is the proposing agent and Alice
-is a controlled agent who has many different ways of responding to Bob's
-proposals. Different methods can be used to design Alice as an AI agent.
-The first idea is reinforcement learning, where multiple rounds of
-negotiations are done, the model simulates game theory and sees how Bob
-reacts. Authors of this setting (\citeproc{ref-kwon2021targeted}{Kwon et
-al. 2021}) show that over time the model learns to ask for the same
-thing over and over again, as Alice is not trained to be human-like or
-negotiable, and just tries to maximize Alice's utility. The second
-approach is supervised learning, where the model can be trained on some
-dataset, learning the history of negotiations. This results in Alice
-being very agreeable, which demonstrates two polar results of the two
-approaches, and it would be ideal to find a middle ground and combine
-both of them. The authors proposed the targeted acquisition approach,
-which is based on active learning ideas. The model asks diverse
-questions in different cases and stages of negotiations, as humans do,
-determining which questions are more valuable to ask throughout
-learning. This approach produced fairer and more optimal results
-than supervised or reinforcement learning
-(\citeproc{ref-kwon2021targeted}{Kwon et al. 2021}).
-
-\subsection{Single Objective and Multi-Objective
-Utility}\label{single-objective-and-multi-objective-utility}
-
-The industry has centered around optimizing for two primary reward
-signals: helpfulness and harmlessness (safety). There are also other
-axes, such as factuality, reasoning, tool use, code, and
-multilingualism, but these are out of scope for us. The Llama2 paper
-collected preference data from humans for each quality, with separate
-guidelines. This presents a challenge for co-optimizing the final LLM
-towards both goals. Two main approaches can be taken for RLHF in this
-context: (1) train a unified reward model that integrates both datasets, or
-(2) train two separate reward models, one for each quality, and optimize the
-LLM toward both. Option 1 is difficult because of the tension between
-helpfulness and harmlessness. They trade off against each other,
-confusing an RM trained on both. The chosen solution was option 2, where
-two RMs are used to train the LLM piecewise. The helpfulness RM is used
-as the primary optimization term, while the harmlessness RM acts as a
-penalty term, driving the behavior of the LLM away from unsafe territory
-only when the LLM veers beyond a certain threshold. This is formalized
-as follows, where \(R_s\), \(R_h\), and \(R_c\) are the safety,
-helpfulness, and combined reward, respectively. \(g\) and \(p\) are the
-model generation and the user prompt:
-
-\[
-\begin{aligned}
-    R_c(g \mid p) =
-    \begin{cases}
-        R_s(g \mid p) & \text{if } \text{is\_safety}(p) \text{ or } R_s(g \mid p) < 0.15 \\
-        R_h(g \mid p) & \text{otherwise}
-    \end{cases}
-\end{aligned}
-\]
-
-\subsection{Pretraining}\label{pretraining}
-
-RL often stumbles when it comes to devising reward functions aligning
-with human intentions. Preference-based RL aims to solve this by
-learning from human feedback, but this often demands a \emph{highly
-impractical number of queries} or leads to oversimplified reward
-functions that don't hold up in real-world tasks. As discussed in the
-previous section, one may address the impractical query requirement by
-applying meta-learning so that the RL agent can adapt to new tasks with
-fewer human queries. Hejna III and Sadigh
-(\citeproc{ref-hejna2023few}{2023}) propose pre-training models on previous tasks with the
-meta-learning method MAML (\citeproc{ref-finn2017model}{Finn, Abbeel,
-and Levine 2017}), and then the meta-trained model can adapt to new
-tasks with fewer queries. We consider settings where a state is denoted
-as \(s\in S\), and action is denoted as \(a\in A\), for state space
-\(S\) and action space \(A\). The reward function
-\(r: S\times A \to \mathbb{R}\) is unknown and needs to be learned from
-eliciting human preferences. There are multiple tasks, each with its own
-reward function and transition probabilities. The reward model is
-parameterized by \(\psi\). We denote \(\hat{r}_\psi(s, a)\) to be a
-learned estimate of an unknown ground-truth reward function \(r(s, a)\),
-parameterized by \(\psi\). Accordingly, a reward model determines an RL
-policy \(\phi\) by maximizing the accumulated rewards. Preferences
-are learned from pairwise comparisons. For each pre-training task, there
-is a dataset \(D\) consisting of binary preferences between pairs of
-trajectories. The Bradley-Terry model is used to predict the preferred trajectory.
-
-To efficiently approximate the reward function \(r_\text{new}\) for a
-new task with minimal queries, Hejna III and Sadigh
-(\citeproc{ref-hejna2023few}{2023}) utilize a pre-trained reward
-function \(\hat{r}_\psi\) that can be quickly fine-tuned using just a
-few preference comparisons by leveraging the common structure across
-tasks learned from pre-training on data from prior tasks. Although any
-meta-learning method is compatible, Hejna III and Sadigh
-(\citeproc{ref-hejna2023few}{2023}) opt for Model-Agnostic Meta-Learning
-(MAML) due to its simplicity. After this meta-learning pre-training,
-the meta-learned reward model can then be used for few-shot
-preference-based RL during an online adaptation phase. Given a
-pre-trained reward model \(\hat{r}_\psi\), active few-shot adaptation
-iterates between finding informative pairs of trajectories to query the
-human about and updating the reward model and the corresponding policy with
-the new data. Informative pairs are selected using the disagreement of an ensemble of
-reward functions over the preference predictors. Specifically,
-comparisons that maximize \(\mathbb{V}(p(e_j \succ e_{j'}))\) are
-selected each time feedback is collected.
-
-The experiment tests the proposed method on the Meta-World benchmark
-(\citeproc{ref-yu2020meta}{Yu et al. 2020}). Three baselines compared
-with the proposed method are (1) Soft Actor-Critic (SAC) trained from
-ground truth rewards, representing a performance upper bound, (2) PEBBLE
-(\citeproc{ref-lee2021pebble}{Lee, Smith, and Abbeel 2021}), which does
-not use information from prior tasks, and (3) Init, which initializes
-the reward model with the pretrained weights from meta learning but
-instead of adapting the reward model to the new task, it performs
-standard updates as in PEBBLE. The results show that the proposed method
-outperforms all of the baseline methods. There are still some drawbacks.
-For example, many of the queries the model picks to elicit human
-preference are almost identical. Moreover, despite the improved query
-complexity, an impractical number of queries still need to be made. In
-addition, it is mentioned in the paper that the proposed method may be
-even worse than training from scratch if the new task is too
-out-of-distribution. Designing a method that automatically balances
-between using the prior information or training from scratch is an
-important future direction.
-
-Zhou et al. (\citeproc{ref-zhou2019watch}{2019}) study a related
-problem by asking the question, ``How can we efficiently learn both from
-expert demonstrations and from trials where we only get binary feedback
-from a human?'' This paper seeks to learn new tasks with the following
-general problem setting: We only get one expert demonstration of the
-target task; after seeing the expert demonstration, robots try to solve
-the task 1 or more times; then the user (or some pre-defined reward
-function) annotates each trial as a success/failure; the agent learns
-from both the demos and the annotated trials to perform well on the
-target task. A task \(i\) is described by the tuple
-\(\{S, A, r_i, P_i\}\). \(S\) and \(A\) represent all possible states
-and actions, respectively. \(r_i\) is the reward function
-\(r_i : S \times A \to \mathbb{R}\), and \(P_i\) is the transition
-dynamics function. \(S\) and \(A\) are shared across tasks. Learning
-occurs in 3 phases. During the Watch phase, we give the agent \(K=1\)
-demonstrations of the target tasks and all demonstrations are
-successful. In the Try phase, we use the agent learned during the Watch
-phase to attempt the task for \(L\) trials. After the agent completes
-the trials, humans (or pre-programmed reward functions) provide one
-binary reward for each trial, indicating whether the trial was
-successful. The expected output of this phase is \(L\) trajectories and
-corresponding feedback. After completing the trials, the agent must
-learn from both the original expert demonstrations and the trials to
-solve the target task.
-
-First, we are given a dataset of expert demonstrations containing
-multiple demos for each task and the dataset contains hundreds of tasks.
-Importantly, no online interaction is needed for training, and this
-method trains only with supervised learning. This section describes how
-this paper trains an agent from the given expert demonstrations, and how
-to incorporate the trials and human feedback into the loop. What we want
-to obtain out of the Watch phase is a policy conditioned on a set of
-expert demonstrations via meta-imitation learning. Given the
-demonstrations \(\{d_{i,k}\}\) for task \(i\), we sample another
-different demonstration coming from the same task \(d_i^{\text{test}}\),
-where \(d_i^{\text{test}}\) is an example of optimal behavior given the
-demonstrations. The policy is obtained by imitating actions taken on
-\(d_i^{\text{test}}\) via maximum likelihood:
-
-\[\mathcal{L}^\text{watch}(\theta, \mathcal{D}_i^*) = \mathbb{E}_{\{d_{i,k}\} \sim \mathcal{D}_i^*} \mathbb{E}_{d_i^{\text{test}} \sim \mathcal{D}_i^* \setminus \{d_{i,k}\}} \mathbb{E}_{(s_t, a_t) \sim d_i^{\text{test}}} \big[- \log \pi_\theta^{\text{watch}} (a_t | s_t, \{d_{i,k}\}) \big]\]
-
-This corresponds to imitation learning by minimizing the negative
-log-likelihood of the test trajectory actions, conditioning the policy
-on the entire demo set. However, how is the conditioning on the demo set
-achieved? In addition to using features obtained from the images of the
-current state, the architecture uses features from frames sampled (in
-order) from the demonstration episodes, which are concatenated together.
-In the Try phase, when the agent is given a set of demonstrations
-\(\{d_{i,k}\}\), we deploy the policy
-\(\pi_\theta^{\text{watch}}(a | s, \{d_{i,k}\})\) to collect \(L\)
-trials. There is no training involved in the Try phase; we simply
-condition the policy on the given demonstrations. During the Watch
-phase, the objective was to train a policy conditioned on demonstrations
-\(\pi_\theta^{\text{watch}}(a | s, \{d_{i,k}\})\). The authors of Watch,
-Try, Learn use a similar strategy as the Watch phase for the Learn
-phase. We now want to train a policy that is conditioned on the
-demonstrations, as well as the trials and binary feedback. We want to
-learn
-\(\pi_\phi^{\text{learn}}(a | s, \{d_{i,k}\}, \{\mathbf{\tau}_{i, l}\})\).
-To train the policy, we again use meta-imitation learning, where we
-additionally sample yet another trajectory from the same task.
-Concretely, we train policy parameters \(\phi\) to minimize the
-following loss:
-\[\mathcal{L}^{\text{learn}}(\phi, \mathcal{D}_i, \mathcal{D}_i^*) = \mathbb{E}_{(\{d_{i,k}\}, \{\mathbf{\tau}_{i,l}\}) \sim \mathcal{D}_i} \mathbb{E}_{d_i^{\text{test}} \sim \mathcal{D}_i^* \setminus \{d_{i,k}\}} \mathbb{E}_{(s_t, a_t) \sim d_i^{\text{test}}} \big[- \log \pi_\phi^{\text{learn}} (a_t | s_t, \{d_{i,k}\}, \{\mathbf{\tau}_{i,l}\}) \big]\]
-
-Three baselines are considered: (1) behavior cloning is simple imitation
-learning based on maximum log-likelihood training using data from all
-tasks, (2) meta-imitation learning corresponds to simply running the
-policy from the Watch step without using any trial data. We only
-condition on the set of expert demonstrations, but no online trials, and
-(3) behavior cloning + SAC pretrains a policy with behavior cloning on
-all data, and follows that with RL fine-tuning for the specific target
-task, using the maximum-entropy algorithm SAC
-(\citeproc{ref-haarnoja2018soft}{Haarnoja et al. 2018}). The proposed
-approach significantly outperforms baselines on every task family: it is
-far superior to behavior cloning and it significantly surpasses
-Meta-Imitation Learning on 3 out of 4 task families.
-
-\subsection{Other Considerations}\label{others-consideration}
-
-One key challenge is managing the bias and variance trade-off. Bias
-refers to assumptions made during model design and training that can
-skew predictions. For example, in Ideal Point Models, we make the
-assumption that the representations we use for individuals and choices
-are aligned in the embedding space and that this representation is
-sufficient to capture human preferences using distance metrics. However,
-there are myriad cases in which this may break down, for example, if the
-two sets of vectors follow different distributions, each with their own
-unique biases. If the representations do not come from the same domain,
-one may have little visibility into how a distance metric computes the
-final reward value for a choice for a given individual. Some ways to
-mitigate bias in human preference models include increasing the number
-of parameters in a model (allowing for better learning of patterns in
-the data) or removing inductive biases based on our assumptions of the
-underlying data. On the other hand, variance refers to the model's
-sensitivity to small changes in the input, which leads to significant
-changes in the output. This phenomenon is often termed `overfitting' or
-`overparameterization.' This behavior can occur in models that have many
-parameters and learn correlations in the data that do not contribute to
-learning human preferences but are artifacts of noise in the dataset
-that one should ultimately ignore. One can address variance in models by
-reducing the number of parameters or incorporating biases in the model
-based on factors we can assume about the data.
-
-Another important consideration unique to human preference models is
-that we wish to model individual preferences, and we may choose to do so
-at arbitrary granularity. For example, we can fit models to a specific
-individual or even multiple models for an individual, each for different
-purposes or contexts. On the other end of the spectrum, we may create a
-model to capture human preferences across large populations or the
-world. Individual models may prove to be more powerful, as they do not
-need to generalize across multiple individuals and can dedicate all of
-their parameters to learning the preferences of a single user. In the
-context of human behavior, this can be a significant advantage as any
-two individuals can be arbitrarily different or even opposite in their
-preferences. On the other hand, models that fit only one person can
-tremendously overfit the training distribution and capture noise in the
-data, which is not truly representative of human preferences. On the
-other end of the spectrum, models fit to the entire world may be inadequate to
-model human preferences for arbitrary individuals, especially those
-whose data it has not been fit to. As such, models may underfit the
-given training distribution. These models aim to generalize to many
-people but may fail to capture the nuances of individual preferences,
-especially for those whose data is not represented in the training set.
-As a result, they may not perform well for arbitrary individuals within
-the target population. Choosing the appropriate scope for a model is
-crucial. It must balance the trade-off between overfitting to noise in
-highly granular models and underfitting in broader models that may not
-capture individual nuances.
-
-When training or using a reward model, LLM Distribution Shift is an
-important factor to consider. With each finetune of the LLM, the RM
-should be updated through a collection of fresh human preferences using
-generations from the new LLM. This ensures that the RM stays aligned
-with the current distribution of the LLM and avoids drifting
-off-distribution. In addition, RM and LLM are coupled: An RM is
-generally optimized to distinguish human preferences more efficiently
-within the specific distribution of the LLM to be optimized. However,
-this specialization poses a challenge: such an RM will underperform when
-dealing with generations not aligned with this specific LLM
-distribution, such as generations from a completely different LLM. Last
-but not least, training RMs can be unstable and prone to overfitting,
-especially with multiple training epochs. It's generally advisable to
-limit the number of epochs during RM training to avoid this issue.
-
-\section{Exercises}\label{exercises}
-
-\subsection*{Question 1: Choice Modeling (15
-points)}\label{question-1-choice-modeling-15-points}
-\addcontentsline{toc}{subsection}{Question 1: Choice Modeling (15
-points)}
-
-We discussed discrete choice modeling in the context of reward being a
-linear function. Suppose we are deciding between \(N\) choices and that
-the reward for each choice is given by
-\(U_i=\beta_i\mathbf{x}+\epsilon_i\) for \(i=1, 2, \cdots, N\). We view
-\(\mathbf{x}\) as the data point that is being conditioned on for
-deciding which choice to select, and \(\beta_i\) as the weights driving
-the linear reward model. The noise \(\epsilon_i\) is i.i.d. sampled from
-a type of extreme value distribution called the \emph{Gumbel}
-distribution. The standard Gumbel distribution is given by the density
-function \(f(x)=e^{-(x+e^{-x})}\) and cumulative distribution function
-\(F(x)=e^{-e^{-x}}.\) Fix \(i\). Our objective is to calculate
-\(p(U_i\,\, \text{has max reward})\).
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\item
-  \textbf{(Written, 2 points)}. Set \(U_i=t\) and compute \(p(U_j<t)\)
-  for \(j\neq i\) in terms of \(F\). Use this probability to derive an
-  integral for \(p(U_i\,\,  \text{has max reward})\) over \(t\) in terms
-  of \(f\) and \(F\). Example of solution environment.
-\item
-  \textbf{(Written, 4 points)}. Compute the integral derived in part (a)
-  with the appropriate \(u\)-substitution. You should arrive at
-  multi-class logistic regression!
-\end{enumerate}
-
-Next, you will implement logistic regression to predict preferred
-completions. We will use the preference dataset from
-\href{https://huggingface.co/datasets/allenai/reward-bench}{RewardBench}.
-Notice the provided \texttt{data/chosen\_embeddings.pt} and
-\texttt{data/rejected\_embeddings.pt} files. These files were
-constructed by feeding the prompt alongside the chosen/rejected
-responses through Llama3-8B-Instruct and selecting the last token's
-final hidden embedding. Let \(e_1\) and \(e_2\) be two hidden embeddings
-with \(e_1\succ e_2\). We assume reward is a linear function of
-embedding \(u_j=w^\top e_j\) and use the Bradley-Terry model to predict
-the preferred item. We can view maximum likelihood across the preference
-dataset with this model as logistic regression on \(e_1-e_2\) and all
-labels being \(1\). Here, we are given a dataset \(X\) with \(N\) rows
-of datapoints and \(D\) features per datapoint. The weights of the model
-are parametrized by \(w\), a \(D\)-dimensional column vector. Given
-binary labels \(y\) of shape \(N\) by \(1\), the negative log likelihood
-function and the corresponding gradient are
-
-\[p(y, X| w)=-\frac{1}{N}(y^\top \log(\sigma(X w)) + (1-y)^\top \log(1-\sigma(X w))), \quad \nabla_w p(y, X | w)=\frac{1}{N}X^\top(\sigma(X w)-y),\]
-
-where \(\sigma\) is the sigmoid function and is applied element-wise
-along with \(\log\). As usual, we use maximum likelihood to learn the
-parameter.
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\tightlist
-\item
-  \textbf{(Coding, 5 points)}. Implement the functions \texttt{train}
-  and the \texttt{predict\_probs} in \texttt{LogisticRegression} class.
-  The starter code is provided below.
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{from sklearn.model\_selection import train\_test\_split}
-\NormalTok{import torch}
-
-\NormalTok{class LogisticRegression:}
-\NormalTok{    def \_\_init\_\_(self):}
-\NormalTok{        self.weights = None  \# Initialized during training}
-
-\NormalTok{    def train(self, X, y, learning\_rate, num\_iterations):}
-\NormalTok{        """}
-\NormalTok{        Train the logistic regression model using full batch gradient{-}based optimization.}
-
-\NormalTok{        Parameters:}
-\NormalTok{        {-} X (torch.Tensor): Training data of shape (n\_samples, n\_features).}
-\NormalTok{        {-} y (torch.Tensor): Target labels of shape (n\_samples,).}
-\NormalTok{        """}
-\NormalTok{        n\_samples, n\_features = X.shape}
-
-\NormalTok{        \# Initialize weights without the bias term}
-\NormalTok{        self.weights = torch.zeros(n\_features)}
-
-\NormalTok{        for i in range(num\_iterations):}
-\NormalTok{            \# YOUR CODE HERE (\textasciitilde{}4{-}5 lines)}
-\NormalTok{                pass}
-\NormalTok{            \# END OF YOUR CODE}
-
-\NormalTok{    def predict\_probs(self, X):}
-\NormalTok{        """}
-\NormalTok{        Predict probabilities for samples in X.}
-
-\NormalTok{        Parameters:}
-\NormalTok{        {-} X (torch.Tensor): Input data of shape (n\_samples, n\_features).}
-
-\NormalTok{        Returns:}
-\NormalTok{        {-} y\_probs (torch.Tensor): Predicted probabilities.}
-\NormalTok{        """}
-\NormalTok{        y\_probs = None}
-
-\NormalTok{        \# YOUR CODE HERE (\textasciitilde{}2{-}3 lines)}
-\NormalTok{        pass}
-\NormalTok{        \# END OF YOUR CODE}
-
-\NormalTok{        return y\_probs}
-
-
-\NormalTok{if \_\_name\_\_ == "\_\_main\_\_":}
-\NormalTok{    \# Load in Llama3 embeddings of prompt + completions on RewardBench}
-\NormalTok{    chosen\_embeddings = torch.load(\textquotesingle{}data/chosen\_embeddings.pt\textquotesingle{})}
-\NormalTok{    rejected\_embeddings = torch.load(\textquotesingle{}data/rejected\_embeddings.pt\textquotesingle{})}
-
-\NormalTok{    \# Subtract the embeddings according to the Bradley{-}Terry reward model setup presented in the problem }
-\NormalTok{    X = (chosen\_embeddings {-} rejected\_embeddings).to(torch.float)}
-\NormalTok{    y = torch.ones(X.shape[0])}
-\NormalTok{    X\_train, X\_val, y\_train, y\_val = train\_test\_split(X, y, test\_size=0.2, random\_state=42)  }
-
-\NormalTok{    \# Tune the learning\_rate and num\_iterations}
-\NormalTok{    learning\_rate = None}
-\NormalTok{    num\_iterations = None}
-\NormalTok{    model = LogisticRegression()}
-\NormalTok{    model.train(X\_train, y\_train, learning\_rate=learning\_rate, num\_iterations=num\_iterations)}
-\NormalTok{    print(f"Expected Train Accuracy: \{model.predict\_probs(X\_train).mean()\}")}
-\NormalTok{    print(f"Expected Validation Accuracy: \{model.predict\_probs(X\_val).mean()\}") \# Should reach at least 90\%}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\setcounter{enumi}{2}
-\tightlist
-\item
-  \textbf{(Written, 4 points)}. Open the notebook
-  \texttt{rewardbench\_preferences.ipynb} and run all the cells. Make
-  sure to tune the \texttt{learning\_rate} and \texttt{num\_iterations}.
-  Report your final expected accuracy on the training and validation
-  sets. How close are the two expected accuracies? You should be able to
-  achieve \(\approx 90\%\) expected accuracy on validation. You may add
-  loss reporting to the \texttt{train} function to verify your model is
-  improving over time.
-\end{enumerate}
-
-\subsection*{Question 2: Revealed and Stated Preferences (20
-points)}\label{question-2-revealed-and-stated-preferences-20-points}
-\addcontentsline{toc}{subsection}{Question 2: Revealed and Stated
-Preferences (20 points)}
-
-Alice and Bob are running for president. For \(R\) voters, we can access
-their revealed candidate preferences through some means (e.g., social
-media, blogs, event history). Assume there is an unknown probability
-\(z\) of voting for Alice among the population. The aim of this question
-is to estimate \(z\) through \emph{maximum likelihood estimation} by
-also incorporating stated preferences. In this scenario, we collect
-stated preferences through surveys. When surveyed, voters state a
-preference for Alice with probability \(\frac{z+1}{2}\ge z\), i.e., more
-often than they actually vote for her, for reasons of ``political correctness.''
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\item
-  \textbf{(Written, 5 points)}. Suppose there are \(R_A\) revealed
-  preferences for Alice, \(R_B\) revealed preferences for Bob, \(S_A\)
-  stated preferences for Alice, and \(S_B\) stated preferences for Bob.
-  Note \(R=R_A+R_B\). Compute the log-likelihood of observing such
-  preferences in terms of \(z, R_A, R_B, S_A, S_B\).
-\item
-  \textbf{(Coding, 1 point)}. Implement the short function
-  \texttt{stated\_prob} in the file \texttt{voting/simulation.py}.
-\item
-  \textbf{(Coding, 5 points)}. Implement the class
-  \texttt{VotingSimulation}.
-\item
-  \textbf{(Coding, 7 points)}. Implement your derived expression from
-  part (a) in the \texttt{log\_likelihoods} function.
-\item
-  \textbf{(Written, 2 points)}. Finally, implement the
-  \texttt{average\_mae\_mle} method that will allow us to visualize the
-  mean absolute error (MAE) of our maximum likelihood estimate
-  \(\hat{z}\) (i.e., \(|\hat{z}-z|\)) as the number of voters surveyed
-  increases. Open \texttt{voting/visualize\_sim.ipynb} and run the cells
-  to get a plot of MAE vs.~voters surveyed averaged across \(100\)
-  simulations. Attach the plot to this question and briefly explain what
-  you notice.
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{import torch}
-\NormalTok{import random}
-\NormalTok{import matplotlib.pyplot as plt}
-\NormalTok{from tqdm import tqdm}
-\NormalTok{random.seed(42)}
-\NormalTok{torch.manual\_seed(42)}
-
-\NormalTok{def stated\_prob(z\_values):}
-\NormalTok{    """}
-\NormalTok{    Computes the probability of stated preferences based on z values.}
-    
-\NormalTok{    Args:}
-\NormalTok{        z\_values (torch.Tensor): The z value(s), where z represents the true probability of voting for Alice.}
-
-\NormalTok{    Returns:}
-\NormalTok{        torch.Tensor: Probability for stated preferences, derived from z values.}
-\NormalTok{    """}
-\NormalTok{    \# YOUR CODE HERE (\textasciitilde{}1 line)}
-\NormalTok{    \# END OF YOUR CODE}
-
-\NormalTok{class VotingSimulation:}
-\NormalTok{    """}
-\NormalTok{    A class to simulate the voting process where revealed and stated preferences are generated.}
-    
-\NormalTok{    Attributes:}
-\NormalTok{        R (int): Number of revealed preferences.}
-\NormalTok{        z (float): The true probability of voting for Alice.}
-\NormalTok{        revealed\_preferences (torch.Tensor): Simulated revealed preferences of R voters using Bernoulli distribution.}
-\NormalTok{                                             Takes on 1 for Alice, and 0 for Bob.}
-\NormalTok{        stated\_preferences (torch.Tensor): Simulated stated preferences, initialized as an empty tensor.}
-\NormalTok{                                           Takes on 1 for Alice, and 0 for Bob.}
-\NormalTok{    """}
-\NormalTok{    def \_\_init\_\_(self, R, z):}
-\NormalTok{        self.R = R}
-\NormalTok{        self.z = z}
-\NormalTok{        self.revealed\_preferences = None \# YOUR CODE HERE (\textasciitilde{}1 line)}
-\NormalTok{        self.stated\_preferences = torch.tensor([])}
-
-\NormalTok{    def add\_survey(self):}
-\NormalTok{        """}
-\NormalTok{        Simulates an additional stated preference based on stated\_prob and adds it to the list.}
-\NormalTok{        This updates the self.stated\_preferences tensor by concatenating on a new simulated survey result.}
-\NormalTok{        """}
-\NormalTok{        \# YOUR CODE HERE (\textasciitilde{}3 lines)}
-\NormalTok{        \# END OF YOUR CODE}
-
-\NormalTok{def log\_likelihoods(revealed\_preferences, stated\_preferences, z\_values):}
-\NormalTok{    """}
-\NormalTok{    Computes the log likelihoods across both revealed and stated preferences.}
-\NormalTok{    Use your answer in part (a) to help.}
-    
-\NormalTok{    Args:}
-\NormalTok{        revealed\_preferences (torch.Tensor): Tensor containing revealed preferences (0 or 1).}
-\NormalTok{        stated\_preferences (torch.Tensor): Tensor containing stated preferences (0 or 1).}
-\NormalTok{        z\_values (torch.Tensor): Tensor of underlying z values to calculate likelihood for.}
-
-\NormalTok{    Returns:}
-\NormalTok{        torch.Tensor: Log likelihood for each z value.}
-\NormalTok{    """}
-\NormalTok{    \# YOUR CODE HERE (\textasciitilde{}10{-}16 lines)}
-\NormalTok{    pass}
-\NormalTok{    \# END OF YOUR CODE }
-
-\NormalTok{def average\_mae\_mle(R, z, survey\_count, num\_sims, z\_sweep):}
-\NormalTok{    """}
-\NormalTok{    Runs multiple simulations to compute the average mean absolute error (MAE) of Maximum Likelihood Estimation (MLE) }
-\NormalTok{    for z after increasing number of surveys.}
-    
-\NormalTok{    Args:}
-\NormalTok{        R (int): Number of revealed preferences.}
-\NormalTok{        z (float): The true probability of voting for Alice.}
-\NormalTok{        survey\_count (int): Number of additional surveys to perform.}
-\NormalTok{        num\_sims (int): Number of simulation runs to average over.}
-\NormalTok{        z\_sweep (torch.Tensor): Range of z values to consider for maximum likelihood estimation.}
-
-\NormalTok{    Returns:}
-\NormalTok{        torch.Tensor: Tensor of mean absolute errors averaged over simulations.}
-\NormalTok{                      Should have shape (survey\_count, )}
-\NormalTok{    """}
-\NormalTok{    all\_errors = []}
-\NormalTok{    for \_ in tqdm(range(num\_sims)):}
-\NormalTok{        errors = []}
-\NormalTok{        vote\_simulator = VotingSimulation(R=R, z=z)}
-
-\NormalTok{        for \_ in range(survey\_count):}
-\NormalTok{            revealed\_preferences = vote\_simulator.revealed\_preferences}
-\NormalTok{            stated\_preferences = vote\_simulator.stated\_preferences}
-
-\NormalTok{            \# YOUR CODE HERE (\textasciitilde{}6{-}8 lines)}
-\NormalTok{            pass \# Compute log\_likelihoods across z\_sweep. Argmax to find MLE for z. }
-\NormalTok{                 \# Append the absolute error to errors and add a survey to the simulator.}
-\NormalTok{            \# END OF YOUR CODE}
-
-\NormalTok{        errors\_tensor = torch.stack(errors) }
-\NormalTok{        all\_errors.append(errors\_tensor)}
-
-\NormalTok{    \# Calculate the average error across simulations }
-\NormalTok{    mean\_errors = torch.stack(all\_errors).mean(dim=0)}
-\NormalTok{    return mean\_errors}
-
-\NormalTok{if \_\_name\_\_ == "\_\_main\_\_":}
-\NormalTok{    \# DO NOT CHANGE!}
-\NormalTok{    max\_surveys = 2000}
-\NormalTok{    z = 0.5}
-\NormalTok{    R = 10}
-\NormalTok{    num\_sims = 100}
-\NormalTok{    z\_sweep = torch.linspace(0.01, 0.99, 981)}
-
-\NormalTok{    \# Compute and plot the errors. Attach this plot to part (e).}
-\NormalTok{    mean\_errors = average\_mae\_mle(R, z, max\_surveys, num\_sims, z\_sweep)}
-\NormalTok{    plt.plot(mean\_errors)}
-
-\NormalTok{    plt.xlabel(\textquotesingle{}Surveys Conducted\textquotesingle{})}
-\NormalTok{    plt.ylabel(\textquotesingle{}Average Error\textquotesingle{})}
-\NormalTok{    plt.title(f\textquotesingle{}MLE MAE Error (z=\{z\}, \{num\_sims\} simulations)\textquotesingle{})}
-\NormalTok{    plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\subsection*{Question 3: Probabilistic Multi-modal Preferences (25
-points)}\label{question-3-probabilistic-multi-modal-preferences-25-points}
-\addcontentsline{toc}{subsection}{Question 3: Probabilistic Multi-modal
-Preferences (25 points)}
-
-Suppose you are part of the ML team on the movie streaming site
-CardinalStreams. After taking CS329H, you collect a movie preferences
-dataset with \(30000\) examples of the form
-\((m_1, m_2, \text{user id})\) where \(m_1\) and \(m_2\) are movies with
-\(m_1\succ m_2\). The preferences come from \(600\) distinct users with
-\(50\) examples per user. Each movie has a \(10\)-dimensional feature
-vector \(m\), and each user has a \(10\)-dimensional weight vector
-\(u\). Given movie features \(m_1, m_2\) and user weights \(u\), the
-user's preference between the movies is given by a Bradley-Terry reward
-model:
-\[P(m_1\succ m_2)=\frac{e^{u\cdot m_1}}{e^{u\cdot m_1} + e^{u\cdot m_2}}=\frac{1}{1+e^{u\cdot (m_2-m_1)}}=\sigma(u\cdot (m_1-m_2)).\]
-
-You realize that trying to estimate the weights for each user with only
-\(50\) examples will not work due to the lack of data. Instead, you
-choose to drop the user IDs column and shuffle the dataset in order to
-take a \emph{multi-modal preferences} approach. For simplicity, you
-assume a model where a proportion \(p\) of the users have weights
-\(w_1\) and the other \(1-p\) have weights \(w_2\). In this setting,
-each user belongs to one of two groups: users with weights \(w_1\) are
-part of Group 1, and users with weights \(w_2\) are part of Group 2.
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\item
-  \textbf{(Written, 3 points)}. For a datapoint \((m_1, m_2)\) with
-  label \(m_1\succ m_2\), compute the data likelihood
-  \(P(m_1\succ m_2 | p, w_1, w_2)\) assuming \(p, w_1, w_2\) are given.
-\item
-  \textbf{(Written, 3 points)}. As a follow up, use the likelihood to
-  simplify the posterior distribution of \(p, w_1, w_2\) after updating
-  on \((m_1, m_2)\) leaving terms for the priors unchanged.
-\item
-  \textbf{(Written, 4 points)}. Assume priors \(p\sim B(1, 1)\),
-  \(w_1\sim\mathcal{N}(0, \mathbf{I})\), and
-  \(w_2\sim\mathcal{N}(0, \mathbf{I})\) where \(B\) represents the Beta
-  distribution and \(\mathcal{N}\) represents the normal distribution.
-  You will notice that the posterior from part (b) has no simple
-  closed-form. As a result, we must resort to \emph{Markov Chain Monte
-  Carlo (MCMC)} approaches to sample from the posterior. These
-  approaches allow sampling from highly complex distributions by
-  constructing a Markov chain \(\{x_t\}_{t=1}^\infty\) so that
-  \(\lim_{t\to\infty}x_t\) act as desired samples from the target
-  distribution. You can think of a Markov chain as a sequence with the
-  special property that \(x_{t+1}\) only depends on \(x_t\) for all
-  \(t\ge 1\).
-\end{enumerate}
-
-The most basic version of MCMC is known as Metropolis-Hastings. Assume
-\(\pi\) is the target distribution we wish to sample from where
-\(\pi(z)\) represents the probability density at point \(z\).
-Metropolis-Hastings constructs the approximating Markov chain \(x_t\) as
-follows: a proposal \(P\) for \(x_{t+1}\) is made via sampling from a
-chosen distribution \(Q(\,\cdot\,| x_t)\) (e.g., adding Gaussian noise).
-The acceptance probability of the proposal is given by
-
-\[A= \min \left( 1, \frac{\pi(P)Q(x_t | P)}{\pi(x_t)Q(P | x_t)} \right). \text{ That is } x_{t+1}=\begin{cases} P & \text{with probability } A, \\ x_t & \text{with probability } 1 - A. \end{cases}\]
-To extract our samples from \(\pi\), we run the Markov chain for \(N\)
-timesteps and disregard the first \(T<N\) timesteps in what is called
-the \emph{burn-in or mixing time} (i.e., our final samples are
-\(x_{T+1}, x_{T+2},\cdots, x_{N}\)). The mixing time is needed to ensure
-that the Markov chain elements are representative of the distribution
-\(\pi\) -- initial elements of the chain will not be a good
-approximation of \(\pi\) and depend more on the choice of initialization
-\(x_1\). To build some intuition, suppose we have a biased coin that
-turns heads with probability \(p_{\text{H}}\). We observe \(12\)
-coin flips to have \(9\) heads (H) and \(3\) tails (T). If our prior for
-\(p_{\text{H}}\) was \(B(1, 1)\), then our posterior will be
-\(B(1+9, 1+3)=B(10, 4)\). The Bayesian update is given by
-
-\[p(p_{\text{H}}|9\text{H}, 3\text{T}) = \frac{p(9\text{H}, 3\text{T} | p_{\text{H}})B(1, 1)(p_{\text{H}})}{\int_0^1 p(9\text{H}, 3\text{T} | p_{\text{H}})B(1, 1)(p_{\text{H}}) dp_{\text{H}}} =\frac{p(9\text{H}, 3\text{T} | p_{\text{H}})}{\int_0^1 p(9\text{H}, 3\text{T} | p_{\text{H}})  dp_{\text{H}}}.\]
-
-\textbf{Find the acceptance probability} \(A\) in the setting of the
-biased coin assuming the proposal distribution
-\(Q(\cdot|x_t)=x_t+N(0,\sigma)\) for given \(\sigma\). Notice that this
-choice of \(Q\) is symmetric, i.e., \(Q(x_t|P)=Q(P|x_t)\). In addition,
-you will realize that it is unnecessary to compute the normalizing constant
-of the Bayesian update (i.e., the integral in the denominator) which is
-why MCMC is commonly used to sample from posteriors!
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\setcounter{enumi}{3}
-\tightlist
-\item
-  \textbf{(Written + Coding, 6 points)}. Implement Metropolis-Hastings
-  to sample from the posterior distribution of the biased coin in
-  \texttt{multimodal\_preferences/biased\_coin.py}. Attach a histogram
-  of your MCMC samples overlaid on top of the true posterior
-  \(B(10, 4)\) by running \texttt{python\ biased\_coin.py}.
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{import numpy as np}
-\NormalTok{import matplotlib.pyplot as plt}
-\NormalTok{from scipy.stats import beta}
-
-\NormalTok{def likelihood(p: float) {-}\textgreater{} float:}
-\NormalTok{    """}
-\NormalTok{    Computes the likelihood of 9 heads and 3 tails assuming p\_heads is p.}
-
-\NormalTok{    Args:}
-\NormalTok{    p (float): A value between 0 and 1 representing the probability of heads.}
-
-\NormalTok{    Returns:}
-\NormalTok{    float: The likelihood value at p\_heads=p. Return 0 if p is outside the range [0, 1].}
-\NormalTok{    """}
-\NormalTok{    \# YOUR CODE HERE (\textasciitilde{}1{-}3 lines)}
-\NormalTok{    pass}
-\NormalTok{    \# END OF YOUR CODE}
-
-
-\NormalTok{def propose(x\_current: float, sigma: float) {-}\textgreater{} float:}
-\NormalTok{    """}
-\NormalTok{    Proposes a new sample from the proposal distribution Q.}
-\NormalTok{    Here, Q is a normal distribution centered at x\_current with standard deviation sigma.}
-
-\NormalTok{    Args:}
-\NormalTok{    x\_current (float): The current value in the Markov chain.}
-\NormalTok{    sigma (float): Standard deviation of the normal proposal distribution.}
-
-\NormalTok{    Returns:}
-\NormalTok{    float: The proposed new sample.}
-\NormalTok{    """}
-\NormalTok{    \# YOUR CODE HERE (\textasciitilde{}1{-}3 lines)}
-\NormalTok{    pass}
-\NormalTok{    \# END OF YOUR CODE}
-
-
-\NormalTok{def acceptance\_probability(x\_current: float, x\_proposed: float) {-}\textgreater{} float:}
-\NormalTok{    """}
-\NormalTok{    Computes the acceptance probability A for the proposed sample.}
-\NormalTok{    Since the proposal distribution is symmetric, Q cancels out.}
-
-\NormalTok{    Args:}
-\NormalTok{    x\_current (float): The current value in the Markov chain.}
-\NormalTok{    x\_proposed (float): The proposed new value.}
-
-\NormalTok{    Returns:}
-\NormalTok{    float: The acceptance probability}
-\NormalTok{    """}
-\NormalTok{    \# YOUR CODE HERE (\textasciitilde{}4{-}6 lines)}
-\NormalTok{    pass}
-\NormalTok{    \# END OF YOUR CODE}
-
-
-\NormalTok{def metropolis\_hastings(N: int, T: int, x\_init: float, sigma: float) {-}\textgreater{} np.ndarray:}
-\NormalTok{    """}
-\NormalTok{    Runs the Metropolis{-}Hastings algorithm to sample from a posterior distribution.}
-
-\NormalTok{    Args:}
-\NormalTok{    N (int): Total number of iterations.}
-\NormalTok{    T (int): Burn{-}in period (number of initial samples to discard).}
-\NormalTok{    x\_init (float): Initial value of the chain.}
-\NormalTok{    sigma (float): Standard deviation of the proposal distribution.}
-
-\NormalTok{    Returns:}
-\NormalTok{    list: Samples collected after the burn{-}in period.}
-\NormalTok{    """}
-\NormalTok{    samples = []}
-\NormalTok{    x\_current = x\_init}
-
-\NormalTok{    for t in range(N):}
-\NormalTok{        \# YOUR CODE HERE (\textasciitilde{}7{-}10 lines)}
-\NormalTok{        \# Use the propose and acceptance\_probability functions to get x\_\{t+1\} and store it in samples after the burn{-}in period T}
-\NormalTok{        pass}
-\NormalTok{        \# END OF YOUR CODE}
-
-\NormalTok{    return samples}
-
-
-\NormalTok{def plot\_results(samples: np.ndarray) {-}\textgreater{} None:}
-\NormalTok{    """}
-\NormalTok{    Plots the histogram of MCMC samples along with the true Beta(10, 4) PDF.}
-
-\NormalTok{    Args:}
-\NormalTok{    samples (np.ndarray): Array of samples collected from the Metropolis{-}Hastings algorithm.}
-
-\NormalTok{    Returns:}
-\NormalTok{    None}
-\NormalTok{    """}
-\NormalTok{    \# Histogram of the samples from the Metropolis{-}Hastings algorithm}
-\NormalTok{    plt.hist(samples, bins=50, density=True, alpha=0.5, label="MCMC Samples")}
-
-\NormalTok{    \# True Beta(10, 4) distribution for comparison}
-\NormalTok{    p = np.linspace(0, 1, 1000)}
-\NormalTok{    beta\_pdf = beta.pdf(p, 10, 4)}
-\NormalTok{    plt.plot(p, beta\_pdf, "r{-}", label="Beta(10, 4) PDF")}
-
-\NormalTok{    plt.xlabel("p\_heads")}
-\NormalTok{    plt.ylabel("Density")}
-\NormalTok{    plt.title("Metropolis{-}Hastings Sampling of Biased Coin Posterior")}
-\NormalTok{    plt.legend()}
-\NormalTok{    plt.show()}
-
-
-\NormalTok{if \_\_name\_\_ == "\_\_main\_\_":}
-\NormalTok{    \# MCMC Parameters (DO NOT CHANGE!)}
-\NormalTok{    N = 50000  \# Total number of iterations}
-\NormalTok{    T = 10000  \# Burn{-}in period to discard}
-\NormalTok{    x\_init = 0.5  \# Initial guess for p\_heads}
-\NormalTok{    sigma = 0.1  \# Standard deviation of the proposal distribution}
-
-\NormalTok{    \# Run Metropolis{-}Hastings and plot the results}
-\NormalTok{    samples = metropolis\_hastings(N, T, x\_init, sigma)}
-\NormalTok{    plot\_results(samples)}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\setcounter{enumi}{4}
-\tightlist
-\item
-  \textbf{(Coding, 9 points)}. Implement Metropolis-Hastings in the
-  movie setting
-  inside~\texttt{multimodal\_preferences/movie\_metropolis.py}. The
-  movie dataset we use for grading will not be provided. However,
-  randomly constructed datasets can be used to test your implementation
-  by running \texttt{python\ movie\_metropolis.py}. You should be able
-  to achieve a \(90\%\) success rate with most
-  \texttt{fraction\_accepted} values above \(0.1\). Success is measured
-  by thresholded closeness of predicted parameters to true parameters.
-  You may notice occasional failures that occur due to lack of
-  convergence which we will account for in grading.
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{import torch}
-\NormalTok{import torch.distributions as dist}
-\NormalTok{import math}
-\NormalTok{from tqdm import tqdm}
-\NormalTok{from typing import Tuple}
-
-\NormalTok{def make\_data(}
-\NormalTok{    true\_p: torch.Tensor, true\_weights\_1: torch.Tensor, true\_weights\_2: torch.Tensor, num\_movies: int, feature\_dim: int}
-\NormalTok{) {-}\textgreater{} Tuple[torch.Tensor, torch.Tensor]:}
-\NormalTok{    """}
-\NormalTok{    Generates a synthetic movie dataset according to the CardinalStreams model.}
-
-\NormalTok{    Args:}
-\NormalTok{        true\_p (torch.Tensor): Probability of coming from Group 1.}
-\NormalTok{        true\_weights\_1 (torch.Tensor): Weights for Group 1.}
-\NormalTok{        true\_weights\_2 (torch.Tensor): Weights for Group 2.}
-
-\NormalTok{    Returns:}
-\NormalTok{        Tuple[torch.Tensor, torch.Tensor]: A tuple containing the dataset and labels.}
-\NormalTok{    """}
-\NormalTok{    \# Create movie features}
-\NormalTok{    first\_movie\_features = torch.randn((num\_movies, feature\_dim))}
-\NormalTok{    second\_movie\_features = torch.randn((num\_movies, feature\_dim))}
-
-\NormalTok{    \# Only care about difference of features for Bradley{-}Terry}
-\NormalTok{    dataset = first\_movie\_features {-} second\_movie\_features}
-
-\NormalTok{    \# Get probabilities that first movie is preferred assuming Group 1 or Group 2}
-\NormalTok{    weight\_1\_probs = torch.sigmoid(dataset @ true\_weights\_1)}
-\NormalTok{    weight\_2\_probs = torch.sigmoid(dataset @ true\_weights\_2)}
-
-\NormalTok{    \# Probability that first movie is preferred overall can be viewed as sum of conditioning on Group 1 and Group 2}
-\NormalTok{    first\_movie\_preferred\_probs = (}
-\NormalTok{        true\_p * weight\_1\_probs + (1 {-} true\_p) * weight\_2\_probs}
-\NormalTok{    )}
-\NormalTok{    labels = dist.Bernoulli(first\_movie\_preferred\_probs).sample()}
-\NormalTok{    return dataset, labels}
-
-
-\NormalTok{def compute\_likelihoods(}
-\NormalTok{    dataset: torch.Tensor,}
-\NormalTok{    labels: torch.Tensor,}
-\NormalTok{    p: torch.Tensor,}
-\NormalTok{    w\_1: torch.Tensor,}
-\NormalTok{    w\_2: torch.Tensor,}
-\NormalTok{) {-}\textgreater{} torch.Tensor:}
-\NormalTok{    """}
-\NormalTok{    Computes the likelihood of each datapoint. Use your calculation from part (a) to help.}
-
-\NormalTok{    Args:}
-\NormalTok{        dataset (torch.Tensor): The dataset of differences between movie features.}
-\NormalTok{        labels (torch.Tensor): The labels where 1 indicates the first movie is preferred, and 0 indicates preference of the second movie.}
-\NormalTok{        p (torch.Tensor): The probability of coming from Group 1.}
-\NormalTok{        w\_1 (torch.Tensor): Weights for Group 1.}
-\NormalTok{        w\_2 (torch.Tensor): Weights for Group 2.}
-
-\NormalTok{    Returns:}
-\NormalTok{        torch.Tensor: The likelihoods for each datapoint. Should have shape (dataset.shape[0], )}
-\NormalTok{    """}
-\NormalTok{    \# YOUR CODE HERE (\textasciitilde{}6{-}8 lines)}
-\NormalTok{    pass}
-\NormalTok{    \# END OF YOUR CODE}
-
-\NormalTok{def compute\_prior\_density(}
-\NormalTok{    p: torch.Tensor, w\_1: torch.Tensor, w\_2: torch.Tensor}
-\NormalTok{) {-}\textgreater{} torch.Tensor:}
-\NormalTok{    """}
-\NormalTok{    Computes the prior density of the parameters.}
-
-\NormalTok{    Args:}
-\NormalTok{        p (torch.Tensor): The probability of coming from Group 1.}
-\NormalTok{        w\_1 (torch.Tensor): Weights for Group 1.}
-\NormalTok{        w\_2 (torch.Tensor): Weights for Group 2.}
-
-\NormalTok{    Returns:}
-\NormalTok{        torch.Tensor: The prior densities of p, w\_1, and w\_2.}
-\NormalTok{    """}
-\NormalTok{    \# Adjusts p to stay in the range [0.3, 0.7] to prevent multiple equilibria issues at p=0 and p=1}
-\NormalTok{    p\_prob = torch.tensor([2.5]) if 0.3 \textless{}= p \textless{}= 0.7 else torch.tensor([0.0])}
-
-\NormalTok{    def normal\_pdf(x: torch.Tensor) {-}\textgreater{} torch.Tensor:}
-\NormalTok{        """Computes the PDF of the standard normal distribution at x."""}
-\NormalTok{        return (1.0 / torch.sqrt(torch.tensor(2 * math.pi))) * torch.exp({-}0.5 * x**2)}
-
-\NormalTok{    weights\_1\_prob = normal\_pdf(w\_1)}
-\NormalTok{    weights\_2\_prob = normal\_pdf(w\_2)}
-
-\NormalTok{    \# Concatenate the densities}
-\NormalTok{    concatenated\_prob = torch.cat([p\_prob, weights\_1\_prob, weights\_2\_prob])}
-\NormalTok{    return concatenated\_prob}
-
-
-\NormalTok{def metropolis\_hastings(}
-\NormalTok{    dataset: torch.Tensor,}
-\NormalTok{    labels: torch.Tensor,}
-\NormalTok{    sigma: float = 0.01,}
-\NormalTok{    num\_iters: int = 30000,}
-\NormalTok{    burn\_in: int = 20000,}
-\NormalTok{) {-}\textgreater{} Tuple[torch.Tensor, torch.Tensor, torch.Tensor, float]:}
-\NormalTok{    """}
-\NormalTok{    Performs the Metropolis{-}Hastings algorithm to sample from the posterior distribution.}
-\NormalTok{    DO NOT CHANGE THE DEFAULT VALUES!}
-
-\NormalTok{    Args:}
-\NormalTok{        dataset (torch.Tensor): The dataset of differences between movie features.}
-\NormalTok{        labels (torch.Tensor): The labels indicating which movie is preferred.}
-\NormalTok{        sigma (float, optional): Standard deviation for proposal distribution.}
-\NormalTok{            Defaults to 0.01.}
-\NormalTok{        num\_iters (int, optional): Total number of iterations. Defaults to 30000.}
-\NormalTok{        burn\_in (int, optional): Number of iterations to discard as burn{-}in.}
-\NormalTok{            Defaults to 20000.}
-
-\NormalTok{    Returns:}
-\NormalTok{        Tuple[torch.Tensor, torch.Tensor, torch.Tensor, float]: Samples of p,}
-\NormalTok{        w\_1, w\_2, and the fraction of accepted proposals.}
-\NormalTok{    """}
-\NormalTok{    feature\_dim = dataset.shape[1]}
-
-\NormalTok{    \# Initialize random starting parameters by sampling priors}
-\NormalTok{    curr\_p = 0.3 + 0.4 * torch.rand(1)}
-\NormalTok{    curr\_w\_1 = torch.randn(feature\_dim)}
-\NormalTok{    curr\_w\_2 = torch.randn(feature\_dim)}
-
-\NormalTok{    \# Keep track of samples and total number of accepted proposals}
-\NormalTok{    p\_samples = []}
-\NormalTok{    w\_1\_samples = []}
-\NormalTok{    w\_2\_samples = []}
-\NormalTok{    accept\_count = 0 }
-
-\NormalTok{    for T in tqdm(range(num\_iters)):}
-\NormalTok{        \# YOUR CODE HERE (\textasciitilde{}3 lines)}
-\NormalTok{        pass \# Sample proposals for p, w\_1, w\_2}
-\NormalTok{        \# END OF YOUR CODE}
-
-\NormalTok{        \# YOUR CODE HERE (\textasciitilde{}4{-}6 lines)}
-\NormalTok{        pass \# Compute likelihoods and prior densities on both the proposed and current samples}
-\NormalTok{        \# END OF YOUR CODE}
-
-\NormalTok{        \# YOUR CODE HERE (\textasciitilde{}2{-}4 lines)}
-\NormalTok{        pass \# Obtain the ratios of the likelihoods and prior densities between the proposed and current samples }
-\NormalTok{        \# END OF YOUR CODE }
-
-\NormalTok{        \# YOUR CODE HERE (\textasciitilde{}1{-}2 lines)}
-\NormalTok{        pass \# Multiply all ratios (both likelihoods and prior densities) and use this to calculate the acceptance probability of the proposal}
-\NormalTok{        \# END OF YOUR CODE}
-
-\NormalTok{        \# YOUR CODE HERE (\textasciitilde{}4{-}6 lines)}
-\NormalTok{        pass \# Sample randomness to determine whether the proposal should be accepted to update curr\_p, curr\_w\_1, curr\_w\_2, and accept\_count}
-\NormalTok{        \# END OF YOUR CODE }
-
-\NormalTok{        \# YOUR CODE HERE (\textasciitilde{}4{-}6 lines)}
-\NormalTok{        pass \# Update p\_samples, w\_1\_samples, w\_2\_samples if iteration T is past the burn{-}in period}
-\NormalTok{        \# END OF YOUR CODE }
-
-\NormalTok{    fraction\_accepted = accept\_count / num\_iters}
-\NormalTok{    print(f"Fraction of accepted proposals: \{fraction\_accepted\}")}
-\NormalTok{    return (}
-\NormalTok{        torch.stack(p\_samples),}
-\NormalTok{        torch.stack(w\_1\_samples),}
-\NormalTok{        torch.stack(w\_2\_samples),}
-\NormalTok{        fraction\_accepted,}
-\NormalTok{    )}
-
-
-\NormalTok{def evaluate\_metropolis(num\_sims: int, num\_movies: int, feature\_dim: int) {-}\textgreater{} None:}
-\NormalTok{    """}
-\NormalTok{    Runs the Metropolis{-}Hastings algorithm num\_sims times and compares estimated parameters}
-\NormalTok{    with true parameters to obtain success rate. You should attain a success rate of around 90\%. }
-
-\NormalTok{    Note that there are two successful equilibria to converge to. They are true\_weights\_1 and true\_weights\_2 with probabilities}
-\NormalTok{    p and 1{-}p in addition to true\_weights\_2 and true\_weights\_1 with probabilities 1{-}p and p. This is why even though it may appear your}
-\NormalTok{    predicted parameters don\textquotesingle{}t match the true parameters, they are in fact equivalent. }
-
-\NormalTok{    Args:}
-\NormalTok{        num\_sims (int): Number of simulations to run.}
-
-\NormalTok{    Returns:}
-\NormalTok{        None}
-\NormalTok{    """}
-    
-\NormalTok{    success\_count = 0}
-\NormalTok{    for \_ in range(num\_sims):}
-\NormalTok{        \# Sample random ground truth parameters}
-\NormalTok{        true\_p = 0.3 + 0.4 * torch.rand(1)}
-\NormalTok{        true\_weights\_1 = torch.randn(feature\_dim)}
-\NormalTok{        true\_weights\_2 = torch.randn(feature\_dim)}
-
-\NormalTok{        print("\textbackslash{}n{-}{-}{-}{-} MCMC Simulation {-}{-}{-}{-}")}
-\NormalTok{        print("True parameters:", true\_p, true\_weights\_1, true\_weights\_2)}
-
-\NormalTok{        dataset, labels = make\_data(true\_p, true\_weights\_1, true\_weights\_2, num\_movies, feature\_dim)}
-\NormalTok{        p\_samples, w\_1\_samples, w\_2\_samples, \_ = metropolis\_hastings(dataset, labels)}
-
-\NormalTok{        p\_pred = p\_samples.mean(dim=0)}
-\NormalTok{        w\_1\_pred = w\_1\_samples.mean(dim=0)}
-\NormalTok{        w\_2\_pred = w\_2\_samples.mean(dim=0)}
-
-\NormalTok{        print("Predicted parameters:", p\_pred, w\_1\_pred, w\_2\_pred)}
-
-\NormalTok{        \# Do casework on two equilibria cases to check for success}
-\NormalTok{        p\_diff\_case\_1 = torch.abs(p\_pred {-} true\_p)}
-\NormalTok{        p\_diff\_case\_2 = torch.abs(p\_pred {-} (1 {-} true\_p))}
-
-\NormalTok{        w\_1\_diff\_case\_1 = torch.max(torch.abs(w\_1\_pred {-} true\_weights\_1))}
-\NormalTok{        w\_1\_diff\_case\_2 = torch.max(torch.abs(w\_1\_pred {-} true\_weights\_2))}
-
-\NormalTok{        w\_2\_diff\_case\_1 = torch.max(torch.abs(w\_2\_pred {-} true\_weights\_2))}
-\NormalTok{        w\_2\_diff\_case\_2 = torch.max(torch.abs(w\_2\_pred {-} true\_weights\_1))}
-
-\NormalTok{        pass\_case\_1 = (}
-\NormalTok{            p\_diff\_case\_1 \textless{} 0.1 and w\_1\_diff\_case\_1 \textless{} 0.5 and w\_2\_diff\_case\_1 \textless{} 0.5}
-\NormalTok{        )}
-\NormalTok{        pass\_case\_2 = (}
-\NormalTok{            p\_diff\_case\_2 \textless{} 0.1 and w\_1\_diff\_case\_2 \textless{} 0.5 and w\_2\_diff\_case\_2 \textless{} 0.5}
-\NormalTok{        )}
-\NormalTok{        passes = pass\_case\_1 or pass\_case\_2}
-
-\NormalTok{        print(f\textquotesingle{}Result: \{"Success" if passes else "FAILED"\}\textquotesingle{})}
-\NormalTok{        if passes:}
-\NormalTok{            success\_count += 1}
-\NormalTok{    print(f\textquotesingle{}Success rate: \{success\_count / num\_sims\}\textquotesingle{})}
-
-
-\NormalTok{if \_\_name\_\_ == "\_\_main\_\_":}
-\NormalTok{    evaluate\_metropolis(num\_sims=10, num\_movies=30000, feature\_dim=10)}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\section*{References}\label{bibliography-1}
-\addcontentsline{toc}{section}{References}
-
-\markright{References}
-
-\phantomsection\label{refs-1}
-\begin{CSLReferences}{1}{0}
-\bibitem[\citeproctext]{ref-book_estimation_bock}
-Bock, Hans Georg, Thomas Carraro, Willi Jäger, Stefan Körkel, Rolf
-Rannacher, and Johannes P. Schlöder. 2015. \emph{Model Based Parameter
-Estimation: Theory and Applications}. Springer.
-\url{https://api.semanticscholar.org/CorpusID:60333071}.
-
-\bibitem[\citeproctext]{ref-bolt2009}
-Bolt, Daniel M., and James A. Wollack. 2009. {``Application of a
-Multidimensional Nested Logit Model to Multiple-Choice Test Items.''}
-\emph{Journal of Educational Measurement} 46 (3): 181--98.
-\url{https://doi.org/10.1111/j.1745-3984.2009.00081.x}.
-
-\bibitem[\citeproctext]{ref-bradley-terry-model}
-Bradley, Ralph Allan, and Milton E. Terry. 1952. {``Rank Analysis of
-Incomplete Block Designs: I. The Method of Paired Comparisons.''}
-\emph{Biometrika} 39 (3/4): 324--45.
-\url{http://www.jstor.org/stable/2334029}.
-
-\bibitem[\citeproctext]{ref-campbell2015}
-Campbell, Danny, and Seda Erdem. 2015. {``Position Bias in Best-Worst
-Scaling Surveys: A Case Study on Trust in Institutions.''}
-\emph{American Journal of Agricultural Economics} 97 (2): 526--45.
-\url{https://doi.org/10.1093/ajae/aau112}.
-
-\bibitem[\citeproctext]{ref-book_estimation_casella}
-Casella, George, and Roger L. Berger. 1990. \emph{Statistical
-Inference}. Springer.
-\url{https://api.semanticscholar.org/CorpusID:125727004}.
-
-\bibitem[\citeproctext]{ref-finn2017model}
-Finn, Chelsea, Pieter Abbeel, and Sergey Levine. 2017. {``Model-Agnostic
-Meta-Learning for Fast Adaptation of Deep Networks.''} In
-\emph{International Conference on Machine Learning}, 1126--35. PMLR.
-
-\bibitem[\citeproctext]{ref-idealpoints}
-Greiner, James. 2005. {``Ideal Points.''} Harvard IQSS Blog.
-\url{https://blogs.iq.harvard.edu/ideal_points_1}.
-
-\bibitem[\citeproctext]{ref-haarnoja2018soft}
-Haarnoja, Tuomas, Aurick Zhou, Pieter Abbeel, and Sergey Levine. 2018.
-{``Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement
-Learning with a Stochastic Actor.''} In \emph{International Conference
-on Machine Learning}, 1861--70. PMLR.
-
-\bibitem[\citeproctext]{ref-harpe2015}
-Harpe, Spencer E. 2015. {``How to Analyze Likert and Other Rating Scale
-Data.''} \emph{Currents in Pharmacy Teaching and Learning} 7 (5):
-836--50. \url{http://dx.doi.org/10.1016/j.cptl.2015.08.001}.
-
-\bibitem[\citeproctext]{ref-hejna2023few}
-Hejna III, Donald Joseph, and Dorsa Sadigh. 2023. {``Few-Shot Preference
-Learning for Human-in-the-Loop RL.''} In \emph{Conference on Robot
-Learning}, 2014--25. PMLR.
-
-\bibitem[\citeproctext]{ref-huber1976ideal}
-Huber, Joel. 1976. {``Ideal Point Models of Preference.''} In
-\emph{Advances in Consumer Research}, 03:138--42. Association for
-Consumer Research.
-
-\bibitem[\citeproctext]{ref-ideal_point}
-Jamieson, Kevin G, and Robert Nowak. 2011. {``Active Ranking Using
-Pairwise Comparisons.''} In \emph{Advances in Neural Information
-Processing Systems}, edited by J. Shawe-Taylor, R. Zemel, P. Bartlett,
-F. Pereira, and K. Q. Weinberger. Vol. 24. Curran Associates, Inc.
-\url{https://proceedings.neurips.cc/paper_files/paper/2011/file/6c14da109e294d1e8155be8aa4b1ce8e-Paper.pdf}.
-
-\bibitem[\citeproctext]{ref-kwon2021targeted}
-Kwon, Minae, Siddharth Karamcheti, Mariano-Florentino Cuellar, and Dorsa
-Sadigh. 2021. {``Targeted Data Acquisition for Evolving Negotiation
-Agents.''} \url{https://arxiv.org/abs/2106.07728}.
-
-\bibitem[\citeproctext]{ref-lee2021pebble}
-Lee, Kimin, Laura Smith, and Pieter Abbeel. 2021. {``PEBBLE:
-Feedback-Efficient Interactive Reinforcement Learning via Relabeling
-Experience and Unsupervised Pre-Training.''} \emph{arXiv Preprint
-arXiv:2106.05091}.
-
-\bibitem[\citeproctext]{ref-Luce1977}
-Luce, R. Duncan. 1977. {``The Choice Axiom After Twenty Years.''}
-\emph{Journal of Mathematical Psychology} 15 (3): 215--33.
-\url{https://doi.org/10.1016/0022-2496(77)90032-3}.
-
-\bibitem[\citeproctext]{ref-mcfadden_conditional_1974}
-McFadden, Daniel. 1974. {``Conditional Logit Analysis of Qualitative
-Choice Behavior.''} In \emph{Frontiers in Econometrics}, edited by Paul
-Zarembka, 105--42. New York: Academic Press.
-
-\bibitem[\citeproctext]{ref-miljkovic2005rational}
-Miljkovic, Dragan. 2005. {``Rational Choice and Irrational Individuals
-or Simply an Irrational Theory: A Critical Review of the Hypothesis of
-Perfect Rationality.''} \emph{The Journal of Socio-Economics} 34 (5):
-621--34. \url{https://doi.org/10.1016/j.socec.2003.12.031}.
-
-\bibitem[\citeproctext]{ref-myers2022learning}
-Myers, Vivek, Erdem Bıyık, Nima Anari, and Dorsa Sadigh. 2022.
-{``Learning Multimodal Rewards from Rankings.''} In \emph{Conference on
-Robot Learning}, 342--52. PMLR.
-
-\bibitem[\citeproctext]{ref-myers2021learning}
-Myers, Vivek, Erdem Bıyık, Nima Anari, and Dorsa Sadigh. 2021.
-{``Learning Multimodal Rewards from Rankings.''}
-\url{https://arxiv.org/abs/2109.12750}.
-
-\bibitem[\citeproctext]{ref-plackett_luce}
-Plackett, R. L. 1975. {``The Analysis of Permutations.''} \emph{Journal
-of the Royal Statistical Society. Series C (Applied Statistics)} 24 (2):
-193--202. \url{http://www.jstor.org/stable/2346567}.
-
-\bibitem[\citeproctext]{ref-ragain2019}
-Ragain, Stephen, and Johan Ugander. 2019. {``Choosing to Rank.''}
-\emph{arXiv Preprint arXiv:1809.05139}.
-\url{https://arxiv.org/abs/1809.05139}.
-
-\bibitem[\citeproctext]{ref-gradient_descent}
-Ruder, Sebastian. 2016. {``An Overview of Gradient Descent Optimization
-Algorithms.''} \emph{ArXiv} abs/1609.04747.
-\url{https://api.semanticscholar.org/CorpusID:17485266}.
-
-\bibitem[\citeproctext]{ref-simon1972theories}
-Simon, Herbert A. 1972. {``Theories of Bounded Rationality.''} In
-\emph{Decision and Organization}, edited by C. B. McGuire and Roy
-Radner, 161--76. North-Holland Publishing Company.
-
-\bibitem[\citeproctext]{ref-tatli2022distancepreferences}
-Tatli, Gokcan, Rob Nowak, and Ramya Korlakai Vinayak. 2022. {``Learning
-Preference Distributions from Distance Measurements.''} In \emph{2022
-58th Annual Allerton Conference on Communication, Control, and Computing
-(Allerton)}, 1--8.
-\url{https://doi.org/10.1109/Allerton49937.2022.9929404}.
-
-\bibitem[\citeproctext]{ref-2307.09288}
-Touvron, Hugo et al. 2023. {``Llama 2: Open Foundation and Fine-Tuned
-Chat Models.''} \url{https://arxiv.org/abs/2307.09288}.
-
-\bibitem[\citeproctext]{ref-yu2020meta}
-Yu, Tianhe, Deirdre Quillen, Zhanpeng He, Ryan Julian, Karol Hausman,
-Chelsea Finn, and Sergey Levine. 2020. {``Meta-World: A Benchmark and
-Evaluation for Multi-Task and Meta Reinforcement Learning.''} In
-\emph{Conference on Robot Learning}, 1094--1100. PMLR.
-
-\bibitem[\citeproctext]{ref-zhou2019watch}
-Zhou, Allan, Eric Jang, Daniel Kappler, Alex Herzog, Mohi Khansari, Paul
-Wohlhart, Yunfei Bai, Mrinal Kalakrishnan, Sergey Levine, and Chelsea
-Finn. 2019. {``Watch, Try, Learn: Meta-Learning from Demonstrations and
-Rewards.''} In \emph{International Conference on Learning
-Representations}.
-
-\end{CSLReferences}
-
-\bookmarksetup{startatroot}
-
-\chapter{Learning}\label{learning-1}
-
-Designing a good utility function (or reward function) by hand for a
-complex AI or robotics task is notoriously difficult and error-prone.
-Instead of manually specifying what is ``good'' behavior, we can learn a
-utility function from human preferences. In this chapter, we explore how
-an agent can infer a human's underlying utility function (their
-preferences or reward criteria) from various forms of feedback. We
-discuss both supervised learning and Bayesian approaches to utility
-learning, and examine techniques motivated by robotics---learning from
-demonstrations, physical corrections, trajectory evaluations, and
-pairwise comparisons. Throughout, we include mathematical formulations
-and code examples to illustrate the learning process.
-
-\section{The Supervised Learning
-Problem}\label{the-supervised-learning-problem}
-
-Supervised learning approaches treat human feedback as labeled data to
-directly fit a utility function. The core idea is to assume there exists
-a true utility function \(u^*(x)\) (over states, outcomes, or
-trajectories \(x\)) that explains a human's choices. We then choose a
-parameterized model \(u_\theta(x)\) and adjust \(\theta\) so that
-\(u_\theta\) agrees with the human-provided preferences.
-
-A common feedback format is pairwise comparisons: the human is shown two
-options (outcomes or trajectories) \(A\) and \(B\) and indicates which
-is preferred. We can model the probability that the human prefers \(A\)
-over \(B\) using a logistic or Bradley--Terry model:
-
-\[
-P(A \succ B \mid u_\theta) \;=\; \sigma\!\Big(u_\theta(A) - u_\theta(B)\Big)\,,
-\]
-
-where \(\sigma(z)=\frac{1}{1+e^{-z}}\) is the sigmoid function. This
-implies the human is more likely to prefer \(A\) if \(u_\theta(A)\) is
-much larger than \(u_\theta(B)\).
-
-At the heart of learning from human preferences lies a latent utility
-function --- a function that assigns numerical value to states,
-trajectories, or outcomes according to a human's (possibly unspoken)
-preferences. The goal of a learning algorithm is to infer this function
-from observed feedback, which may come in the form of demonstrations,
-ratings, rankings, or pairwise comparisons. But how exactly do we
-represent and update our belief about this hidden utility function?
-
-Two major paradigms in statistical learning provide different answers:
-point estimation and posterior estimation.
-
-In point estimation, we seek a single ``best guess'' for the utility
-function --- typically a function \(u_\theta(x)\) from a parameterized
-family (e.g.~linear models, neural nets), with parameters
-\(\theta \in \mathbb{R}^d\). Given data \(\mathcal{D}\) from human
-feedback (e.g.~preferences), we choose the parameter \(\hat{\theta}\)
-that best explains the observed behavior. Formally:
-
-\[
-\hat{\theta} = \arg\max_\theta \; p(\mathcal{D} \mid \theta)
-\]
-
-This is maximum likelihood estimation (MLE): we pick the parameters that
-make the observed data most probable under our model. Once
-\(\hat{\theta}\) is selected, we treat \(u_{\hat{\theta}}\) as the
-agent's utility function, and optimize or sample behavior accordingly.
-This approach is straightforward and computationally efficient. It is
-the foundation of most supervised learning methods (like logistic
-regression or deep learning), and it provides a natural interpretation:
-we're directly finding the utility function that agrees with the human
-feedback. However, it discards uncertainty: it assumes the data is
-sufficient to pin down a single utility function, which may not be true
-in practice.
-
-In contrast, posterior estimation takes a fully Bayesian view. Instead
-of committing to one estimate, we maintain a distribution over utility
-functions. That is, we place a prior \(p(\theta)\) over parameters (or
-over functions \(u\) more generally), and update this to a posterior
-after observing data \(\mathcal{D}\):
-
-\[
-p(\theta \mid \mathcal{D}) \;=\; \frac{p(\mathcal{D} \mid \theta)\, p(\theta)}{p(\mathcal{D})}
-\]
-
-This posterior expresses our uncertainty over which utility functions
-are compatible with the human feedback. From this distribution, we can
-make predictions (e.g., using the posterior mean utility), quantify
-confidence, or even actively select new queries to reduce uncertainty
-(active learning). For instance, if we model utilities with a Gaussian
-Process (GP), then the posterior over \(u(x)\) is also a GP after
-observing comparisons or evaluations. If we use a neural network for
-\(u_\theta(x)\), we can approximate the posterior with ensembles,
-variational inference, or MCMC. Posterior estimation is especially
-valuable when human feedback is sparse, noisy, or ambiguous --- as is
-often the case in real-world preference learning. It allows the agent to
-reason about what it doesn't know and to take cautious or exploratory
-actions accordingly.
-
-The next two sections instantiate these two perspectives. We first
-explore point estimation via supervised learning --- treating
-preference data as labeled examples and fitting a utility model. We
-then shift to posterior estimation with Bayesian methods like
-Gaussian processes and Bayesian neural networks, which model both our
-current estimate and the uncertainty around it.
-
-\section{Point Estimation via Maximum
-Likelihood}\label{point-estimation-via-maximum-likelihood}
-
-Given a dataset of comparisons \(D=\{(A_i, B_i, y_i)\}\) (with \(y_i=1\)
-if \(A_i\) was preferred and \(0\) if \(B_i\) was preferred), we can fit
-\(\theta\) by maximizing the likelihood of the human's choices.
-Equivalently, we minimize a binary cross-entropy loss:
-
-\[
-\mathcal{L}(\theta) = -\sum_{i} \Big[\,y_i \log \sigma(u_\theta(A_i)\!-\!u_\theta(B_i)) + (1-y_i)\log(1-\sigma(u_\theta(A_i)\!-\!u_\theta(B_i)))\Big]\,,
-\]
-
-often with a regularization term to prevent overfitting. This is a
-straightforward supervised learning problem --- essentially logistic
-regression --- on pairwise difference features.
-
-Example: Suppose a human's utility for an outcome can be described by a
-quadratic function (unknown to the learning algorithm). We collect some
-pairwise preferences and then train a utility model \(u_\theta(x)\) to
-predict those preferences. The code below simulates this scenario:
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-
-\CommentTok{\# True utility function (unknown to learner), e.g. u*(x) = {-}(x{-}5)\^{}2 + constant }
-\KeywordTok{def}\NormalTok{ true\_utility(x):}
-    \ControlFlowTok{return} \OperatorTok{{-}}\NormalTok{(x}\OperatorTok{{-}}\DecValTok{5}\NormalTok{)}\OperatorTok{**}\DecValTok{2}  \CommentTok{\# (peak at x=5)}
-
-\CommentTok{\# Generate synthetic pairwise preference data}
-\NormalTok{np.random.seed(}\DecValTok{42}\NormalTok{)}
-\NormalTok{n\_pairs }\OperatorTok{=} \DecValTok{20}
-\NormalTok{X1 }\OperatorTok{=}\NormalTok{ np.random.uniform(}\DecValTok{0}\NormalTok{, }\DecValTok{10}\NormalTok{, size}\OperatorTok{=}\NormalTok{n\_pairs)  }\CommentTok{\# 20 random x{-}values}
-\NormalTok{X2 }\OperatorTok{=}\NormalTok{ np.random.uniform(}\DecValTok{0}\NormalTok{, }\DecValTok{10}\NormalTok{, size}\OperatorTok{=}\NormalTok{n\_pairs)  }\CommentTok{\# 20 more random x{-}values}
-\CommentTok{\# Determine preferences according to true utility}
-\NormalTok{prefs }\OperatorTok{=}\NormalTok{ (true\_utility(X1) }\OperatorTok{\textgreater{}}\NormalTok{ true\_utility(X2)).astype(}\BuiltInTok{int}\NormalTok{)  }\CommentTok{\# 1 if X1 preferred, else 0}
-
-\CommentTok{\# Parametric model for utility: u\_theta(x) = w0 + w1*x + w2*x\^{}2  (quadratic form)}
-\CommentTok{\# Initialize weights}
-\NormalTok{w }\OperatorTok{=}\NormalTok{ np.zeros(}\DecValTok{3}\NormalTok{)}
-\NormalTok{lr }\OperatorTok{=} \FloatTok{0.01}       \CommentTok{\# learning rate}
-\NormalTok{reg }\OperatorTok{=} \FloatTok{1e{-}3}      \CommentTok{\# L2 regularization strength}
-\ControlFlowTok{for}\NormalTok{ epoch }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\DecValTok{1000}\NormalTok{):}
-    \CommentTok{\# Compute predictions via logistic model}
-\NormalTok{    util\_diff }\OperatorTok{=}\NormalTok{ (w[}\DecValTok{0}\NormalTok{] }\OperatorTok{+}\NormalTok{ w[}\DecValTok{1}\NormalTok{]}\OperatorTok{*}\NormalTok{X1 }\OperatorTok{+}\NormalTok{ w[}\DecValTok{2}\NormalTok{]}\OperatorTok{*}\NormalTok{X1}\OperatorTok{**}\DecValTok{2}\NormalTok{) }\OperatorTok{{-}}\NormalTok{ (w[}\DecValTok{0}\NormalTok{] }\OperatorTok{+}\NormalTok{ w[}\DecValTok{1}\NormalTok{]}\OperatorTok{*}\NormalTok{X2 }\OperatorTok{+}\NormalTok{ w[}\DecValTok{2}\NormalTok{]}\OperatorTok{*}\NormalTok{X2}\OperatorTok{**}\DecValTok{2}\NormalTok{)}
-\NormalTok{    pred }\OperatorTok{=} \DecValTok{1} \OperatorTok{/}\NormalTok{ (}\DecValTok{1} \OperatorTok{+}\NormalTok{ np.exp(}\OperatorTok{{-}}\NormalTok{util\_diff))      }\CommentTok{\# σ(w·(phi(X1){-}phi(X2)))}
-    \CommentTok{\# Gradient of cross{-}entropy loss}
-\NormalTok{    grad }\OperatorTok{=}\NormalTok{ np.array([}\FloatTok{0.0}\NormalTok{, }\FloatTok{0.0}\NormalTok{, }\FloatTok{0.0}\NormalTok{])}
-\NormalTok{    error }\OperatorTok{=}\NormalTok{ pred }\OperatorTok{{-}}\NormalTok{ prefs  }\CommentTok{\# (sigma {-} y)}
-    \CommentTok{\# Features for X1 and X2}
-\NormalTok{    phi1 }\OperatorTok{=}\NormalTok{ np.vstack([np.ones(n\_pairs), X1, X1}\OperatorTok{**}\DecValTok{2}\NormalTok{]).T}
-\NormalTok{    phi2 }\OperatorTok{=}\NormalTok{ np.vstack([np.ones(n\_pairs), X2, X2}\OperatorTok{**}\DecValTok{2}\NormalTok{]).T}
-\NormalTok{    phi\_diff }\OperatorTok{=}\NormalTok{ phi1 }\OperatorTok{{-}}\NormalTok{ phi2}
-    \CommentTok{\# Gradient: derivative of loss w.r.t. w = (sigma {-} y)*φ\_diff (averaged) + reg}
-\NormalTok{    grad }\OperatorTok{=}\NormalTok{ phi\_diff.T.dot(error) }\OperatorTok{/}\NormalTok{ n\_pairs }\OperatorTok{+}\NormalTok{ reg }\OperatorTok{*}\NormalTok{ w}
-    \CommentTok{\# Update weights}
-\NormalTok{    w }\OperatorTok{{-}=}\NormalTok{ lr }\OperatorTok{*}\NormalTok{ grad}
-
-\BuiltInTok{print}\NormalTok{(}\StringTok{"Learned weights:"}\NormalTok{, w)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-Learned weights: [ 0.          2.74417195 -0.22129969]
-\end{verbatim}
-
-After training, we can compare the learned utility function
-\(u_\theta(x)\) to the true utility \(u^*(x)\). Below we plot the two
-functions:
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-
-\CommentTok{\# Plot true vs learned utility curves}
-\NormalTok{xs }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{200}\NormalTok{)}
-\NormalTok{true\_vals }\OperatorTok{=}\NormalTok{ true\_utility(xs)}
-\NormalTok{learned\_vals }\OperatorTok{=}\NormalTok{ w[}\DecValTok{0}\NormalTok{] }\OperatorTok{+}\NormalTok{ w[}\DecValTok{1}\NormalTok{]}\OperatorTok{*}\NormalTok{xs }\OperatorTok{+}\NormalTok{ w[}\DecValTok{2}\NormalTok{]}\OperatorTok{*}\NormalTok{xs}\OperatorTok{**}\DecValTok{2}
-
-\NormalTok{plt.figure(figsize}\OperatorTok{=}\NormalTok{(}\DecValTok{6}\NormalTok{,}\DecValTok{4}\NormalTok{))}
-\NormalTok{plt.plot(xs, true\_vals, label}\OperatorTok{=}\StringTok{"True Utility"}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{3}\NormalTok{)}
-\NormalTok{plt.plot(xs, learned\_vals, label}\OperatorTok{=}\StringTok{"Learned Utility"}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{"{-}{-}"}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{3}\NormalTok{)}
-\NormalTok{plt.xlabel(}\StringTok{"State x"}\NormalTok{)}
-\NormalTok{plt.ylabel(}\StringTok{"Utility"}\NormalTok{)}
-\NormalTok{plt.title(}\StringTok{"True vs. Learned Utility Function"}\NormalTok{)}
-\NormalTok{plt.legend()}
-\NormalTok{plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\pandocbounded{\includegraphics[keepaspectratio]{src/chap3_files/figure-pdf/cell-3-output-1.pdf}}
-
-The learned curve closely matches the true utility up to an arbitrary
-scaling factor (utility is only defined up to a positive affine
-transformation when inferred from comparisons). The algorithm
-successfully recovered a
-utility function that orders states almost the same as the true utility
-\(u^*(x)\). In general, learning from comparisons can infer the
-\emph{relative} utility of options (which item is preferred), although
-the absolute scale of \(u_\theta\) is unidentifiable without further
-assumptions. Supervised learning on preferences has been widely used for
-ranking problems and preference-based reward learning.
-
-In standard preference learning, we often learn a utility function and
-then use it to define a policy. However, in some settings---especially
-those involving large models like language models---it is more effective
-to directly learn a policy that aligns with human preferences, bypassing
-the intermediate reward model. One such method is Direct Preference
-Optimization (DPO), which offers a simple, stable way to align a policy
-to preference data through supervised learning.
-
-To understand DPO, consider the following setting:
-
-\begin{itemize}
-\tightlist
-\item
-  We are given a reference policy \(\pi_{\text{ref}}\), such as a
-  pre-trained language model.
-\item
-  We want to learn a new policy \(\pi_\theta\) that improves upon
-  \(\pi_{\text{ref}}\) by better reflecting human preferences.
-\item
-  Our data consists of pairwise comparisons: for each prompt \(x\), a
-  human expresses a preference between two outputs \(y_+ \succ y_-\),
-  where \(y_+\) is the preferred response.
-\end{itemize}
-
-Rather than learning an explicit reward function \(R(x, y)\) and using
-it to optimize the policy via reinforcement learning, DPO treats this as
-a classification problem: we want to encourage the policy to assign
-higher likelihood to the preferred response.
-
-To formalize this, we define a preference score: \[
-s_\theta(x, y_+, y_-) = \log \frac{\pi_\theta(y_+ \mid x)}{\pi_{\text{ref}}(y_+ \mid x)} - \log \frac{\pi_\theta(y_- \mid x)}{\pi_{\text{ref}}(y_- \mid x)}
-\] This is the difference, between the preferred and dispreferred
-outputs, of the log-likelihood ratio of the policy to the reference
-policy. We can then define the DPO loss as a logistic
-regression objective: \[
-\mathcal{L}_{\text{DPO}}(\theta) = -\log \sigma\left(s_\theta(x, y_+, y_-)\right)
-\] where \(\sigma(z) = \frac{1}{1 + e^{-z}}\) is the sigmoid function.
-
-This loss encourages \(\pi_\theta\) to assign greater probability mass
-to \(y_+\) than \(y_-\), pushing the policy toward outputs that align
-with human preferences. Because this is a differentiable, supervised
-loss, it can be optimized with standard gradient-based techniques,
-without needing to sample from the environment or estimate advantages,
-as in traditional RL.
-
-Although DPO does not explicitly define or optimize a reward function,
-we can interpret it as doing so implicitly. Suppose we define a reward
-function: \[
-R_\theta(y \mid x) = \log \pi_\theta(y \mid x) - \log \pi_{\text{ref}}(y \mid x)
-\] This reward encourages \(\pi_\theta\) to move away from
-\(\pi_{\text{ref}}\) in directions that increase the probability of
-preferred outputs. Under this formulation, the DPO objective can be
-interpreted as optimizing this reward difference directly from
-preferences.
-
-To understand why this implicit reward leads to a stable and
-interpretable policy, we can connect DPO to the principle of maximum
-entropy. This principle says that, among all distributions that satisfy
-certain constraints (e.g., achieving a particular expected reward), we
-should prefer the one with maximum entropy---that is, the most uncertain
-or uncommitted distribution consistent with our knowledge.
-
-Formally, consider the space \(\mathcal{P}\) of distributions over
-responses \(y\), and a reward function \(R(y)\). The maximum entropy
-distribution that satisfies a reward constraint is the solution to:
-
-\[
-p^*(y) = \arg\max_{p \in \mathcal{P}} H(p) \quad \text{subject to} \quad \mathbb{E}_p[R(y)] \geq \rho
-\]
-
-The solution to this constrained optimization problem is a Boltzmann
-distribution: \[
-p^*(y) \propto \exp\left(\frac{R(y)}{\tau}\right)
-\] for some temperature \(\tau > 0\), where \(\tau\) controls how
-deterministic the distribution is. As \(\tau \to 0\), the distribution
-concentrates on the highest-reward outputs; as \(\tau \to \infty\), it
-becomes uniform.
-
-Now suppose our reference policy \(\pi_{\text{ref}}(y \mid x)\) already
-represents a reasonable starting point. Then the optimal policy
-\(\pi_\theta\) can be viewed as a reward-weighted version of this
-reference policy:
-
-\[
-\pi_\theta(y \mid x) \propto \pi_{\text{ref}}(y \mid x) \cdot \exp\left(R_\theta(y \mid x)\right)
-\]
-
-This form is the maximizer of expected reward under a KL-divergence
-penalty toward \(\pi_{\text{ref}}\), so \(\pi_\theta\) remains close to
-the reference while still assigning more mass to high-reward (preferred)
-outputs. Importantly, this form arises naturally from maximum entropy
-inference when the reference distribution is used as a baseline.
-
-DPO thus combines reward maximization with entropy regularization,
-encouraging the learned policy to prefer outcomes favored by human
-feedback while preserving diversity and stability. It sidesteps the
-challenges of explicitly learning a reward model or tuning complex RL
-pipelines, offering a direct and scalable method for preference-based
-alignment.
-
-In practice, DPO has been shown to achieve similar or better alignment
-performance compared to reinforcement learning from human feedback
-(RLHF) while being more stable and easier to implement. It avoids the
-need to sample from the model during training or tune delicate
-hyperparameters of RL. Conceptually, DPO demonstrates that if we
-structure our utility model cleverly (here, as the log-ratio of policy
-and reference), we can extract an optimal policy in closed-form and
-learn utilities via supervised learning.
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-\ImportTok{import}\NormalTok{ matplotlib.animation }\ImportTok{as}\NormalTok{ animation}
-\ImportTok{from}\NormalTok{ scipy.special }\ImportTok{import}\NormalTok{ logsumexp}
-
-\CommentTok{\# {-}{-}{-} Setup: 1D input x, discrete actions y {-}{-}{-}}
-\NormalTok{np.random.seed(}\DecValTok{0}\NormalTok{)}
-\NormalTok{x }\OperatorTok{=} \FloatTok{5.0}  \CommentTok{\# fixed input}
-\NormalTok{Y }\OperatorTok{=}\NormalTok{ np.linspace(}\OperatorTok{{-}}\DecValTok{4}\NormalTok{, }\DecValTok{4}\NormalTok{, }\DecValTok{100}\NormalTok{)  }\CommentTok{\# discrete action space}
-\NormalTok{n\_actions }\OperatorTok{=} \BuiltInTok{len}\NormalTok{(Y)}
-
-\CommentTok{\# {-}{-}{-} True reward function (unknown to learner) {-}{-}{-}}
-\KeywordTok{def}\NormalTok{ true\_reward(x, y):}
-    \ControlFlowTok{return} \OperatorTok{{-}}\NormalTok{((y }\OperatorTok{{-}}\NormalTok{ np.sin(x))}\OperatorTok{**}\DecValTok{2}\NormalTok{)  }\CommentTok{\# reward peak near y = sin(x)}
-
-\NormalTok{R\_true }\OperatorTok{=}\NormalTok{ true\_reward(x, Y)}
-
-\CommentTok{\# {-}{-}{-} Reference policy: fixed Gaussian{-}like distribution {-}{-}{-}}
-\KeywordTok{def}\NormalTok{ ref\_policy(y):}
-\NormalTok{    logits }\OperatorTok{=} \OperatorTok{{-}}\FloatTok{0.5} \OperatorTok{*}\NormalTok{ (y }\OperatorTok{/} \FloatTok{2.0}\NormalTok{)}\OperatorTok{**}\DecValTok{2}  \CommentTok{\# log probs of N(0, 2\^{}2)}
-    \ControlFlowTok{return}\NormalTok{ np.exp(logits }\OperatorTok{{-}}\NormalTok{ logsumexp(logits))}
-
-\NormalTok{pi\_ref }\OperatorTok{=}\NormalTok{ ref\_policy(Y)}
-
-\CommentTok{\# {-}{-}{-} Preference data from reward samples {-}{-}{-}}
-\KeywordTok{def}\NormalTok{ sample\_preference(x, Y, R\_fn, temperature}\OperatorTok{=}\FloatTok{1.0}\NormalTok{):}
-\NormalTok{    logits }\OperatorTok{=}\NormalTok{ R\_fn(x, Y) }\OperatorTok{/}\NormalTok{ temperature}
-\NormalTok{    probs }\OperatorTok{=}\NormalTok{ np.exp(logits }\OperatorTok{{-}}\NormalTok{ logsumexp(logits))}
-\NormalTok{    sampled }\OperatorTok{=}\NormalTok{ np.random.choice(}\BuiltInTok{len}\NormalTok{(Y), size}\OperatorTok{=}\DecValTok{2}\NormalTok{, replace}\OperatorTok{=}\VariableTok{False}\NormalTok{, p}\OperatorTok{=}\NormalTok{probs)}
-\NormalTok{    y\_plus, y\_minus }\OperatorTok{=}\NormalTok{ sampled }\ControlFlowTok{if}\NormalTok{ R\_fn(x, Y[sampled[}\DecValTok{0}\NormalTok{]]) }\OperatorTok{\textgreater{}}\NormalTok{ R\_fn(x, Y[sampled[}\DecValTok{1}\NormalTok{]]) }\ControlFlowTok{else}\NormalTok{ sampled[::}\OperatorTok{{-}}\DecValTok{1}\NormalTok{]}
-    \ControlFlowTok{return}\NormalTok{ y\_plus, y\_minus}
-
-\NormalTok{n\_pairs }\OperatorTok{=} \DecValTok{100}
-\NormalTok{pair\_indices }\OperatorTok{=}\NormalTok{ [sample\_preference(x, Y, true\_reward) }\ControlFlowTok{for}\NormalTok{ \_ }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(n\_pairs)]}
-
-\CommentTok{\# {-}{-}{-} DPO loss and gradient {-}{-}{-}}
-\KeywordTok{def}\NormalTok{ dpo\_loss\_and\_grad(theta, y\_pos\_idx, y\_neg\_idx, pi\_ref):}
-\NormalTok{    logits }\OperatorTok{=}\NormalTok{ theta }\OperatorTok{+}\NormalTok{ np.log(pi\_ref }\OperatorTok{+} \FloatTok{1e{-}8}\NormalTok{)}
-\NormalTok{    logp\_pos }\OperatorTok{=}\NormalTok{ logits[y\_pos\_idx] }\OperatorTok{{-}}\NormalTok{ logsumexp(logits)}
-\NormalTok{    logp\_neg }\OperatorTok{=}\NormalTok{ logits[y\_neg\_idx] }\OperatorTok{{-}}\NormalTok{ logsumexp(logits)}
-\NormalTok{    s }\OperatorTok{=}\NormalTok{ logp\_pos }\OperatorTok{{-}}\NormalTok{ logp\_neg}
-\NormalTok{    sigma }\OperatorTok{=} \DecValTok{1} \OperatorTok{/}\NormalTok{ (}\DecValTok{1} \OperatorTok{+}\NormalTok{ np.exp(}\OperatorTok{{-}}\NormalTok{s))}
-\NormalTok{    loss }\OperatorTok{=} \OperatorTok{{-}}\NormalTok{np.log(sigma }\OperatorTok{+} \FloatTok{1e{-}8}\NormalTok{)}
-\NormalTok{    softmax }\OperatorTok{=}\NormalTok{ np.exp(logits }\OperatorTok{{-}}\NormalTok{ logsumexp(logits))}
-\NormalTok{    grad }\OperatorTok{=} \OperatorTok{{-}}\NormalTok{ (}\DecValTok{1} \OperatorTok{{-}}\NormalTok{ sigma) }\OperatorTok{*}\NormalTok{ (np.eye(n\_actions)[y\_pos\_idx] }\OperatorTok{{-}}\NormalTok{ np.eye(n\_actions)[y\_neg\_idx]) }\OperatorTok{+}\NormalTok{ sigma }\OperatorTok{*}\NormalTok{ softmax}
-    \ControlFlowTok{return}\NormalTok{ loss, grad}
-
-\CommentTok{\# {-}{-}{-} Training loop with history tracking {-}{-}{-}}
-\NormalTok{theta }\OperatorTok{=}\NormalTok{ np.zeros(n\_actions)}
-\NormalTok{lr }\OperatorTok{=} \FloatTok{0.05}
-\NormalTok{n\_steps }\OperatorTok{=} \DecValTok{100}
-\NormalTok{history }\OperatorTok{=}\NormalTok{ []}
-
-\ControlFlowTok{for}\NormalTok{ step }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(n\_steps):}
-\NormalTok{    total\_grad }\OperatorTok{=}\NormalTok{ np.zeros\_like(theta)}
-    \ControlFlowTok{for}\NormalTok{ y\_pos\_idx, y\_neg\_idx }\KeywordTok{in}\NormalTok{ pair\_indices:}
-\NormalTok{        \_, grad }\OperatorTok{=}\NormalTok{ dpo\_loss\_and\_grad(theta, y\_pos\_idx, y\_neg\_idx, pi\_ref)}
-\NormalTok{        total\_grad }\OperatorTok{+=}\NormalTok{ grad}
-\NormalTok{    theta }\OperatorTok{{-}=}\NormalTok{ lr }\OperatorTok{*}\NormalTok{ total\_grad }\OperatorTok{/}\NormalTok{ n\_pairs}
-\NormalTok{    logits\_snapshot }\OperatorTok{=}\NormalTok{ theta }\OperatorTok{+}\NormalTok{ np.log(pi\_ref }\OperatorTok{+} \FloatTok{1e{-}8}\NormalTok{)}
-\NormalTok{    pi\_snapshot }\OperatorTok{=}\NormalTok{ np.exp(logits\_snapshot }\OperatorTok{{-}}\NormalTok{ logsumexp(logits\_snapshot))}
-\NormalTok{    history.append(pi\_snapshot)}
-
-\CommentTok{\# {-}{-}{-} Animation setup {-}{-}{-}}
-\NormalTok{fig, ax }\OperatorTok{=}\NormalTok{ plt.subplots(figsize}\OperatorTok{=}\NormalTok{(}\DecValTok{7}\NormalTok{, }\DecValTok{4}\NormalTok{))}
-\NormalTok{line\_true, }\OperatorTok{=}\NormalTok{ ax.plot(Y, R\_true, }\StringTok{\textquotesingle{}k{-}{-}\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}True Reward\textquotesingle{}}\NormalTok{)}
-\NormalTok{line\_ref, }\OperatorTok{=}\NormalTok{ ax.plot(Y, pi\_ref, }\StringTok{\textquotesingle{}g{-}\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Reference Policy\textquotesingle{}}\NormalTok{)}
-\NormalTok{line\_learned, }\OperatorTok{=}\NormalTok{ ax.plot([], [], }\StringTok{\textquotesingle{}b{-}\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Learned Policy\textquotesingle{}}\NormalTok{)}
-
-\CommentTok{\# Add preference pair indicators}
-\NormalTok{pref\_lines }\OperatorTok{=}\NormalTok{ [ax.axvline(Y[idx], color}\OperatorTok{=}\StringTok{\textquotesingle{}blue\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}:\textquotesingle{}}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.3}\NormalTok{) }\ControlFlowTok{for}\NormalTok{ idx, \_ }\KeywordTok{in}\NormalTok{ pair\_indices]}
-\NormalTok{pref\_lines }\OperatorTok{+=}\NormalTok{ [ax.axvline(Y[idx], color}\OperatorTok{=}\StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}:\textquotesingle{}}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.3}\NormalTok{) }\ControlFlowTok{for}\NormalTok{ \_, idx }\KeywordTok{in}\NormalTok{ pair\_indices]}
-
-\NormalTok{ax.set\_ylim(}\OperatorTok{{-}}\FloatTok{0.025}\NormalTok{, }\FloatTok{0.025}\NormalTok{)}
-\NormalTok{ax.set\_title(}\StringTok{"DPO Policy Evolution"}\NormalTok{)}
-\NormalTok{ax.set\_ylabel(}\StringTok{"Probability"}\NormalTok{)}
-\NormalTok{ax.set\_xlabel(}\StringTok{"y"}\NormalTok{)}
-\NormalTok{ax.legend()}
-
-\KeywordTok{def}\NormalTok{ update(frame):}
-\NormalTok{    pi\_snapshot }\OperatorTok{=}\NormalTok{ history[frame]}
-\NormalTok{    line\_learned.set\_data(Y, pi\_snapshot)}
-\NormalTok{    ax.set\_title(}\SpecialStringTok{f"DPO Policy Evolution (Step }\SpecialCharTok{\{}\NormalTok{frame }\OperatorTok{+} \DecValTok{1}\SpecialCharTok{\}}\SpecialStringTok{)"}\NormalTok{)}
-    \ControlFlowTok{return}\NormalTok{ [line\_learned]}
-
-\ImportTok{from}\NormalTok{ IPython.display }\ImportTok{import}\NormalTok{ HTML}
-\ImportTok{from}\NormalTok{ matplotlib }\ImportTok{import}\NormalTok{ rc}
-\NormalTok{rc(}\StringTok{\textquotesingle{}animation\textquotesingle{}}\NormalTok{, html}\OperatorTok{=}\StringTok{\textquotesingle{}jshtml\textquotesingle{}}\NormalTok{)}
-
-\NormalTok{ani }\OperatorTok{=}\NormalTok{ animation.FuncAnimation(fig, update, frames}\OperatorTok{=}\NormalTok{n\_steps, interval}\OperatorTok{=}\DecValTok{100}\NormalTok{, blit}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
-\NormalTok{HTML(ani.to\_jshtml())}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-<IPython.core.display.HTML object>
-\end{verbatim}
-
-\pandocbounded{\includegraphics[keepaspectratio]{src/chap3_files/figure-pdf/cell-4-output-2.pdf}}
-
-The pairwise logistic approach can be extended to other feedback types.
-If humans provide numeric \emph{ratings} or \emph{scores} for options,
-one can treat utility learning as a regression problem: fit
-\(u_\theta(x)\) to predict those scores (perhaps with a suitable bounded
-output or ordinal regression if scores are ordinal). If humans rank
-multiple options at once, pairwise rankers like \emph{RankNet} or
-\emph{RankSVM} can be trained on the implied pairs, or a listwise
-ranking loss can score the whole ordering. All these methods define a
-loss that penalizes disagreements between the predicted utility order
-and the human-provided preferences, then optimize \(\theta\) to minimize it.
-
-Supervised learning of utility is powerful due to its simplicity, but it
-typically provides point estimates of \(u_\theta\). Next, we consider
-Bayesian approaches that maintain uncertainty over the utility function.
-
-\section{Posterior Estimation}\label{posterior-estimation}
-
-When feedback data is sparse, as is common in preference learning, it
-can be advantageous to model uncertainty over the utility function.
-Bayesian approaches place a prior on the utility function and update a
-posterior as human feedback is observed. This yields not only a
-best-guess utility function but also a measure of confidence or
-uncertainty, which is valuable for active learning (deciding which
-queries to ask next) and for safety (knowing when the learned reward
-might be wrong).
-
-A popular Bayesian approach assumes that the human's utility function
-can be modeled as a Gaussian Process (GP)---a distribution over
-functions. A GP prior is defined by a mean function (often taken to be 0
-for convenience) and a kernel (covariance function) \(k(x,x')\) which
-encodes assumptions about the smoothness or structure of the utility
-function. For example, one might assume \(u(x)\) is a smooth function of
-state, and choose a radial basis function (RBF) kernel
-\(k(x,x') = \sigma_f^2 \exp(-\|x-x'\|^2/(2\ell^2))\) with some
-length-scale \(\ell\).
-
-After observing some preference data, Bayes' rule gives a posterior over
-the function \(u(x)\). In the case of pairwise comparisons, the
-likelihood of a comparison \((A \succ B)\) given an underlying utility
-function \(u\) can be modeled via the same logistic function:
-\(P(A \succ B \mid u) = \sigma(u(A)-u(B))\). Combining this likelihood
-with the GP prior is analytically intractable (due to the non-Gaussian
-logistic likelihood), but one can use approximation techniques (Laplace
-approximation or MCMC) to obtain a posterior GP. The result is a
-\emph{Gaussian process preference model} that can predict the utility of
-any new option with an uncertainty interval.
-
-If we have direct evaluations of utility (e.g., the human provides a
-numeric reward for some states), the GP inference is simpler---it
-reduces to standard GP regression. However, in many real-world
-scenarios, humans are better at making relative judgments than assigning
-absolute utility values. This change in feedback type transforms the
-inference problem fundamentally. Instead of having a Gaussian likelihood
-(as in standard GP regression), we now have a non-Gaussian likelihood,
-typically modeled using a probit or logistic function. The observed data
-no longer provide direct samples of the latent utility function, but
-instead impose constraints on the \emph{relative} ordering of latent
-values.
-
-Due to this non-Gaussian likelihood, exact Bayesian inference is no
-longer tractable: the posterior over the latent utility function given
-the pairwise data does not have a closed-form expression. The GP prior
-is still Gaussian, but the posterior becomes non-Gaussian (skewed, though
-still unimodal for log-concave likelihoods such as probit or logistic).
-
-To address this, we must turn to approximate inference methods. One
-common and computationally efficient choice is the Laplace
-approximation, which approximates the true posterior with a Gaussian
-centered at the maximum a posteriori (MAP) estimate. This involves (1)
-finding the mode of the posterior (i.e., the most probable utility
-values given the data), (2) approximating the curvature of the
-log-posterior around this mode using the Hessian (second derivative), and
-(3) using this local curvature to construct a Gaussian approximation.
-
-While not exact, this method works well in practice, especially when the
-posterior is unimodal and the number of comparison pairs is moderate.
-Other alternatives such as variational inference or sampling-based
-methods (e.g., Hamiltonian Monte Carlo) can yield more accurate results
-but often require more complex implementation and computational
-resources.
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-\ImportTok{from}\NormalTok{ scipy.stats }\ImportTok{import}\NormalTok{ norm}
-\ImportTok{from}\NormalTok{ scipy.optimize }\ImportTok{import}\NormalTok{ minimize}
-
-\CommentTok{\# {-}{-}{-} True latent utility function {-}{-}{-}}
-\KeywordTok{def}\NormalTok{ true\_u(x):}
-    \ControlFlowTok{return}\NormalTok{ np.sin(x) }\OperatorTok{+} \FloatTok{0.1} \OperatorTok{*}\NormalTok{ x}
-
-\CommentTok{\# {-}{-}{-} RBF Kernel function {-}{-}{-}}
-\KeywordTok{def}\NormalTok{ rbf\_kernel(x1, x2, length\_scale}\OperatorTok{=}\FloatTok{0.8}\NormalTok{, sigma\_f}\OperatorTok{=}\FloatTok{1.0}\NormalTok{):}
-\NormalTok{    x1, x2 }\OperatorTok{=}\NormalTok{ np.atleast\_2d(x1).T, np.atleast\_2d(x2).T}
-\NormalTok{    sqdist }\OperatorTok{=}\NormalTok{ (x1 }\OperatorTok{{-}}\NormalTok{ x2.T) }\OperatorTok{**} \DecValTok{2}
-    \ControlFlowTok{return}\NormalTok{ sigma\_f}\OperatorTok{**}\DecValTok{2} \OperatorTok{*}\NormalTok{ np.exp(}\OperatorTok{{-}}\FloatTok{0.5} \OperatorTok{*}\NormalTok{ sqdist }\OperatorTok{/}\NormalTok{ length\_scale}\OperatorTok{**}\DecValTok{2}\NormalTok{)}
-
-\CommentTok{\# {-}{-}{-} Generate synthetic preference data {-}{-}{-}}
-\NormalTok{np.random.seed(}\DecValTok{42}\NormalTok{)}
-\NormalTok{num\_pairs }\OperatorTok{=} \DecValTok{10}
-\NormalTok{X\_candidates }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{100}\NormalTok{)}
-\NormalTok{true\_utilities }\OperatorTok{=}\NormalTok{ true\_u(X\_candidates)}
-
-\CommentTok{\# Sample preference pairs}
-\NormalTok{idx\_pairs }\OperatorTok{=}\NormalTok{ np.random.choice(}\BuiltInTok{len}\NormalTok{(X\_candidates), size}\OperatorTok{=}\NormalTok{(num\_pairs, }\DecValTok{2}\NormalTok{), replace}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
-\NormalTok{X\_pref\_pairs }\OperatorTok{=}\NormalTok{ []}
-\ControlFlowTok{for}\NormalTok{ i, j }\KeywordTok{in}\NormalTok{ idx\_pairs:}
-\NormalTok{    xi, xj }\OperatorTok{=}\NormalTok{ X\_candidates[i], X\_candidates[j]}
-    \ControlFlowTok{if}\NormalTok{ true\_utilities[i] }\OperatorTok{\textgreater{}}\NormalTok{ true\_utilities[j]:}
-\NormalTok{        X\_pref\_pairs.append((xi, xj))}
-    \ControlFlowTok{else}\NormalTok{:}
-\NormalTok{        X\_pref\_pairs.append((xj, xi))}
-\NormalTok{X\_pref\_pairs }\OperatorTok{=}\NormalTok{ np.array(X\_pref\_pairs)}
-
-\CommentTok{\# {-}{-}{-} Unique x values and indexing {-}{-}{-}}
-\NormalTok{X\_all }\OperatorTok{=}\NormalTok{ np.unique(X\_pref\_pairs.flatten())}
-\NormalTok{n }\OperatorTok{=} \BuiltInTok{len}\NormalTok{(X\_all)}
-\NormalTok{x\_to\_idx }\OperatorTok{=}\NormalTok{ \{x: i }\ControlFlowTok{for}\NormalTok{ i, x }\KeywordTok{in} \BuiltInTok{enumerate}\NormalTok{(X\_all)\}}
-
-\CommentTok{\# {-}{-}{-} GP prior kernel matrix {-}{-}{-}}
-\NormalTok{length\_scale }\OperatorTok{=} \FloatTok{0.8}
-\NormalTok{sigma\_f }\OperatorTok{=} \FloatTok{1.0}
-\NormalTok{sigma\_noise }\OperatorTok{=} \FloatTok{1e{-}6}
-\NormalTok{K }\OperatorTok{=}\NormalTok{ rbf\_kernel(X\_all, X\_all, length\_scale, sigma\_f) }\OperatorTok{+}\NormalTok{ sigma\_noise }\OperatorTok{*}\NormalTok{ np.eye(n)}
-
-\CommentTok{\# {-}{-}{-} Negative log{-}posterior function {-}{-}{-}}
-\KeywordTok{def}\NormalTok{ neg\_log\_posterior(f):}
-\NormalTok{    prior\_term }\OperatorTok{=} \FloatTok{0.5} \OperatorTok{*}\NormalTok{ f.T }\OperatorTok{@}\NormalTok{ np.linalg.solve(K, f)}
-\NormalTok{    lik\_term }\OperatorTok{=} \FloatTok{0.0}
-    \ControlFlowTok{for}\NormalTok{ xi, xj }\KeywordTok{in}\NormalTok{ X\_pref\_pairs:}
-\NormalTok{        fi, fj }\OperatorTok{=}\NormalTok{ f[x\_to\_idx[xi]], f[x\_to\_idx[xj]]}
-\NormalTok{        delta }\OperatorTok{=}\NormalTok{ (fi }\OperatorTok{{-}}\NormalTok{ fj) }\OperatorTok{/}\NormalTok{ np.sqrt(}\DecValTok{2}\NormalTok{)}
-\NormalTok{        lik\_term }\OperatorTok{{-}=}\NormalTok{ np.log(norm.cdf(delta) }\OperatorTok{+} \FloatTok{1e{-}6}\NormalTok{)}
-    \ControlFlowTok{return}\NormalTok{ prior\_term }\OperatorTok{+}\NormalTok{ lik\_term}
-
-\CommentTok{\# {-}{-}{-} MAP estimation of latent utilities {-}{-}{-}}
-\NormalTok{f\_init }\OperatorTok{=}\NormalTok{ np.zeros(n)}
-\NormalTok{res }\OperatorTok{=}\NormalTok{ minimize(neg\_log\_posterior, f\_init, method}\OperatorTok{=}\StringTok{"L{-}BFGS{-}B"}\NormalTok{)}
-\NormalTok{f\_map }\OperatorTok{=}\NormalTok{ res.x}
-
-\CommentTok{\# {-}{-}{-} Laplace approximation: compute W (Hessian of neg log likelihood) {-}{-}{-}}
-\NormalTok{W }\OperatorTok{=}\NormalTok{ np.zeros((n, n))}
-\ControlFlowTok{for}\NormalTok{ xi, xj }\KeywordTok{in}\NormalTok{ X\_pref\_pairs:}
-\NormalTok{    i, j }\OperatorTok{=}\NormalTok{ x\_to\_idx[xi], x\_to\_idx[xj]}
-\NormalTok{    fi, fj }\OperatorTok{=}\NormalTok{ f\_map[i], f\_map[j]}
-\NormalTok{    delta }\OperatorTok{=}\NormalTok{ (fi }\OperatorTok{{-}}\NormalTok{ fj) }\OperatorTok{/}\NormalTok{ np.sqrt(}\DecValTok{2}\NormalTok{)}
-\NormalTok{    phi }\OperatorTok{=}\NormalTok{ norm.pdf(delta)}
-\NormalTok{    Phi }\OperatorTok{=}\NormalTok{ norm.cdf(delta) }\OperatorTok{+} \FloatTok{1e{-}6}
-\NormalTok{    w }\OperatorTok{=}\NormalTok{ (phi }\OperatorTok{/}\NormalTok{ Phi)}\OperatorTok{**}\DecValTok{2} \OperatorTok{+}\NormalTok{ delta }\OperatorTok{*}\NormalTok{ phi }\OperatorTok{/}\NormalTok{ Phi}
-\NormalTok{    w }\OperatorTok{/=} \DecValTok{2}  \CommentTok{\# adjust for sqrt(2)}
-\NormalTok{    W[i, i] }\OperatorTok{+=}\NormalTok{ w}
-\NormalTok{    W[j, j] }\OperatorTok{+=}\NormalTok{ w}
-\NormalTok{    W[i, j] }\OperatorTok{{-}=}\NormalTok{ w}
-\NormalTok{    W[j, i] }\OperatorTok{{-}=}\NormalTok{ w}
-
-\CommentTok{\# {-}{-}{-} Posterior covariance approximation {-}{-}{-}}
-\NormalTok{L }\OperatorTok{=}\NormalTok{ np.linalg.cholesky(K)}
-\NormalTok{K\_inv }\OperatorTok{=}\NormalTok{ np.linalg.solve(L.T, np.linalg.solve(L, np.eye(n)))}
-\NormalTok{H }\OperatorTok{=}\NormalTok{ K\_inv }\OperatorTok{+}\NormalTok{ W}
-\NormalTok{H\_inv }\OperatorTok{=}\NormalTok{ np.linalg.inv(H)}
-
-\CommentTok{\# {-}{-}{-} Prediction at test points {-}{-}{-}}
-\NormalTok{X\_test }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{200}\NormalTok{)}
-\NormalTok{K\_s }\OperatorTok{=}\NormalTok{ rbf\_kernel(X\_all, X\_test, length\_scale, sigma\_f)}
-\NormalTok{K\_ss\_diag }\OperatorTok{=}\NormalTok{ np.diag(rbf\_kernel(X\_test, X\_test, length\_scale, sigma\_f))}
-
-\CommentTok{\# Posterior mean and variance}
-\NormalTok{posterior\_mean }\OperatorTok{=}\NormalTok{ K\_s.T }\OperatorTok{@}\NormalTok{ K\_inv }\OperatorTok{@}\NormalTok{ f\_map}
-\NormalTok{temp }\OperatorTok{=}\NormalTok{ np.linalg.solve(H, K\_s)}
-\NormalTok{posterior\_var }\OperatorTok{=}\NormalTok{ K\_ss\_diag }\OperatorTok{{-}}\NormalTok{ np.}\BuiltInTok{sum}\NormalTok{(K\_s }\OperatorTok{*}\NormalTok{ temp, axis}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
-\NormalTok{posterior\_std }\OperatorTok{=}\NormalTok{ np.sqrt(np.maximum(posterior\_var, }\DecValTok{0}\NormalTok{))}
-
-\CommentTok{\# {-}{-}{-} Visualization {-}{-}{-}}
-\NormalTok{plt.figure(figsize}\OperatorTok{=}\NormalTok{(}\DecValTok{8}\NormalTok{, }\DecValTok{4}\NormalTok{))}
-\NormalTok{plt.plot(X\_test, true\_u(X\_test), }\StringTok{\textquotesingle{}k{-}{-}\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{"True utility"}\NormalTok{)}
-\NormalTok{plt.plot(X\_test, posterior\_mean, }\StringTok{\textquotesingle{}b{-}\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{"Posterior mean"}\NormalTok{)}
-\NormalTok{plt.fill\_between(X\_test,}
-\NormalTok{                 posterior\_mean }\OperatorTok{{-}} \FloatTok{1.96} \OperatorTok{*}\NormalTok{ posterior\_std,}
-\NormalTok{                 posterior\_mean }\OperatorTok{+} \FloatTok{1.96} \OperatorTok{*}\NormalTok{ posterior\_std,}
-\NormalTok{                 color}\OperatorTok{=}\StringTok{\textquotesingle{}blue\textquotesingle{}}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.2}\NormalTok{, label}\OperatorTok{=}\StringTok{"95\% CI"}\NormalTok{)}
-\NormalTok{plt.scatter(X\_all, [true\_u(x) }\ControlFlowTok{for}\NormalTok{ x }\KeywordTok{in}\NormalTok{ X\_all], c}\OperatorTok{=}\StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{, marker}\OperatorTok{=}\StringTok{\textquotesingle{}x\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{"Observed x"}\NormalTok{)}
-\NormalTok{plt.title(}\StringTok{"GP Preference Learning (Laplace Approximation, 100 Pairs)"}\NormalTok{)}
-\NormalTok{plt.xlabel(}\StringTok{"x"}\NormalTok{)}
-\NormalTok{plt.ylabel(}\StringTok{"Utility"}\NormalTok{)}
-\NormalTok{plt.legend()}
-\NormalTok{plt.tight\_layout()}
-\NormalTok{plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\pandocbounded{\includegraphics[keepaspectratio]{src/chap3_files/figure-pdf/cell-5-output-1.pdf}}
-
-Gaussian Process posterior for a utility function (blue mean with 95\%
-confidence band) after observing 10 pairwise comparisons (red × marks
-the compared inputs). The true utility function (black dashed) is
-non-trivial. The GP captures the function's shape around observed regions
-and expresses high uncertainty in the unobserved middle region. In practice,
-this uncertainty could guide an algorithm to query more feedback in the
-region \(x\approx [4,7]\) to reduce ambiguity.
-
-Gaussian processes are a flexible way to learn utility functions. They
-naturally handle irregular data and provide principled uncertainty
-estimates. GP-based preference learning has been applied to tasks like
-interactive Bayesian optimization, where an algorithm seeks to find the
-maximum of \(u(x)\) by iteratively querying a human which of two options
-is better (\citeproc{ref-christiano2023deep}{Christiano et al. 2023}).
-
-Instead of GPs, one can use Bayesian neural networks or ensemble methods
-to model uncertainty in \(u_\theta(x)\). For instance, a neural network
-can be trained on preference data, and techniques like Monte Carlo
-dropout or deep ensembles can provide uncertainty estimates for its
-predictions. These approaches scale to high-dimensional inputs (where
-GPs may be less practical) while still capturing epistemic uncertainty
-about the utility.
-
-One principled way to capture uncertainty in Bayesian neural networks is
-via Markov Chain Monte Carlo (MCMC) methods, which seek to approximate
-the posterior distribution over model parameters given the data. In this
-setting, we place a prior over the neural network weights,
-\(p(\theta)\), and define a likelihood function based on observed
-preferences---typically using a probabilistic choice model such as the
-Bradley-Terry or probit model. Given a dataset
-\(\mathcal{D} = \{(x_i, x_j) : x_i \succ x_j\}\), the posterior is
-defined as
-
-\[
-p(\theta \mid \mathcal{D}) \propto p(\mathcal{D} \mid \theta) \cdot p(\theta),
-\]
-
-where \(p(\mathcal{D} \mid \theta)\) is the likelihood of observing the
-pairwise comparisons under the utility function \(u_\theta(x)\), and
-\(p(\theta)\) is the prior over the parameters.
-
-Unlike Gaussian processes, for which posterior inference is tractable in
-closed form under Gaussian likelihoods, inference in BNNs with
-non-Gaussian likelihoods is generally intractable. This is due to the
-non-conjugate nature of the neural network likelihood and the
-high-dimensional, nonlinear structure of the weight space. As a result,
-approximate inference methods are required.
-
-MCMC provides a general-purpose approach to approximate sampling from
-the posterior. The key idea is to construct a Markov chain whose
-stationary distribution is the target posterior. One of the most widely
-used algorithms is the Metropolis-Hastings (MH) algorithm. Given a
-current state \(\theta_t\), a new proposal \(\theta'\) is generated from
-a proposal distribution \(q(\theta' \mid \theta_t)\), and accepted with
-probability
-
-\[
-A = \min\left(1, \frac{p(\mathcal{D} \mid \theta') \, p(\theta') \, q(\theta_t \mid \theta')}{p(\mathcal{D} \mid \theta_t) \, p(\theta_t) \, q(\theta' \mid \theta_t)}\right).
-\]
-
-When the proposal distribution is symmetric, i.e.,
-\(q(\theta' \mid \theta_t) = q(\theta_t \mid \theta')\), the acceptance
-probability simplifies to a ratio of posterior densities. Over time, the
-chain yields samples
-\(\theta^{(1)}, \dots, \theta^{(T)} \sim p(\theta \mid \mathcal{D})\),
-which can be used to compute posterior predictive estimates for the
-utility function:
-
-\[
-\mathbb{E}[u(x)] \approx \frac{1}{T} \sum_{t=1}^T u_{\theta^{(t)}}(x),
-\]
-
-with corresponding uncertainty estimates captured via the variance of
-the predictions across samples.
-
-MCMC methods are particularly appealing for preference learning because
-they directly quantify epistemic uncertainty in the utility function,
-which is crucial for downstream tasks such as decision-making, active
-learning, and safe exploration. Furthermore, MCMC makes no restrictive
-assumptions on the form of the posterior and can be used with non-convex
-and multi-modal distributions that arise from complex neural network
-architectures.
-
-However, MCMC also faces significant computational challenges in
-practice. First, the convergence of the Markov chain can be slow,
-especially in high-dimensional parameter spaces. Second, naive
-random-walk proposals (as in the basic Metropolis-Hastings algorithm)
-may suffer from low acceptance rates and poor mixing. More advanced MCMC
-methods such as Hamiltonian Monte Carlo (HMC) and No-U-Turn Sampling
-(NUTS) can help address these issues by using gradient information to
-propose more efficient moves through the parameter space.
-
-Despite these limitations, MCMC remains a valuable tool for principled
-Bayesian inference in preference modeling, particularly in settings
-where uncertainty quantification is critical and computational cost is
-acceptable. In lower-dimensional settings or as a pedagogical tool, even
-simple MH-based approaches can offer intuitive and effective
-approximations to the posterior over preference functions.
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-\ImportTok{from}\NormalTok{ scipy.stats }\ImportTok{import}\NormalTok{ norm}
-\ImportTok{from}\NormalTok{ IPython.display }\ImportTok{import}\NormalTok{ HTML}
-\ImportTok{from}\NormalTok{ matplotlib }\ImportTok{import}\NormalTok{ rc}
-\ImportTok{import}\NormalTok{ matplotlib.animation }\ImportTok{as}\NormalTok{ animation}
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-\ImportTok{from}\NormalTok{ scipy.stats }\ImportTok{import}\NormalTok{ norm, gaussian\_kde}
-\ImportTok{from}\NormalTok{ sklearn.decomposition }\ImportTok{import}\NormalTok{ PCA}
-
-\CommentTok{\# {-}{-}{-} True latent utility function {-}{-}{-}}
-\KeywordTok{def}\NormalTok{ true\_u(x):}
-    \ControlFlowTok{return}\NormalTok{ np.sin(x) }\OperatorTok{+} \FloatTok{0.1} \OperatorTok{*}\NormalTok{ x}
-
-\CommentTok{\# {-}{-}{-} Generate synthetic preference data {-}{-}{-}}
-\NormalTok{np.random.seed(}\DecValTok{0}\NormalTok{)}
-\NormalTok{X }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{40}\NormalTok{)}
-\NormalTok{y\_true }\OperatorTok{=}\NormalTok{ true\_u(X)}
-
-\CommentTok{\# Create pairwise comparisons}
-\NormalTok{pairs }\OperatorTok{=}\NormalTok{ []}
-\ControlFlowTok{for}\NormalTok{ \_ }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\DecValTok{50}\NormalTok{):}
-\NormalTok{    i, j }\OperatorTok{=}\NormalTok{ np.random.choice(}\BuiltInTok{len}\NormalTok{(X), }\DecValTok{2}\NormalTok{, replace}\OperatorTok{=}\VariableTok{False}\NormalTok{)}
-    \ControlFlowTok{if}\NormalTok{ y\_true[i] }\OperatorTok{\textgreater{}}\NormalTok{ y\_true[j]:}
-\NormalTok{        pairs.append((X[i], X[j], }\DecValTok{1}\NormalTok{))  }\CommentTok{\# x\_i preferred over x\_j}
-    \ControlFlowTok{else}\NormalTok{:}
-\NormalTok{        pairs.append((X[j], X[i], }\DecValTok{1}\NormalTok{))}
-
-\CommentTok{\# {-}{-}{-} Define a deep neural network: 3 hidden layers {-}{-}{-}}
-\KeywordTok{def}\NormalTok{ init\_deep\_params(hidden\_dims}\OperatorTok{=}\NormalTok{[}\DecValTok{4}\NormalTok{, }\DecValTok{4}\NormalTok{, }\DecValTok{4}\NormalTok{]):}
-\NormalTok{    params }\OperatorTok{=}\NormalTok{ \{\}}
-\NormalTok{    layer\_dims }\OperatorTok{=}\NormalTok{ [}\DecValTok{1}\NormalTok{] }\OperatorTok{+}\NormalTok{ hidden\_dims }\OperatorTok{+}\NormalTok{ [}\DecValTok{1}\NormalTok{]}
-    \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\BuiltInTok{len}\NormalTok{(layer\_dims) }\OperatorTok{{-}} \DecValTok{1}\NormalTok{):}
-\NormalTok{        W\_key }\OperatorTok{=} \SpecialStringTok{f"W}\SpecialCharTok{\{}\NormalTok{i}\OperatorTok{+}\DecValTok{1}\SpecialCharTok{\}}\SpecialStringTok{"}
-\NormalTok{        b\_key }\OperatorTok{=} \SpecialStringTok{f"b}\SpecialCharTok{\{}\NormalTok{i}\OperatorTok{+}\DecValTok{1}\SpecialCharTok{\}}\SpecialStringTok{"}
-\NormalTok{        params[W\_key] }\OperatorTok{=}\NormalTok{ np.random.randn(layer\_dims[i}\OperatorTok{+}\DecValTok{1}\NormalTok{], layer\_dims[i]) }\OperatorTok{*} \FloatTok{0.1}
-\NormalTok{        params[b\_key] }\OperatorTok{=}\NormalTok{ np.zeros((layer\_dims[i}\OperatorTok{+}\DecValTok{1}\NormalTok{], }\DecValTok{1}\NormalTok{))}
-    \ControlFlowTok{return}\NormalTok{ params}
-
-\KeywordTok{def}\NormalTok{ deep\_forward(x, params):}
-\NormalTok{    x }\OperatorTok{=}\NormalTok{ x.reshape(}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{)}
-\NormalTok{    num\_layers }\OperatorTok{=} \BuiltInTok{len}\NormalTok{(params) }\OperatorTok{//} \DecValTok{2}
-\NormalTok{    h }\OperatorTok{=}\NormalTok{ x}
-    \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\DecValTok{1}\NormalTok{, num\_layers):}
-\NormalTok{        h }\OperatorTok{=}\NormalTok{ np.tanh(params[}\SpecialStringTok{f"W}\SpecialCharTok{\{}\NormalTok{i}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{] }\OperatorTok{@}\NormalTok{ h }\OperatorTok{+}\NormalTok{ params[}\SpecialStringTok{f"b}\SpecialCharTok{\{}\NormalTok{i}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{])}
-\NormalTok{    out }\OperatorTok{=}\NormalTok{ params[}\SpecialStringTok{f"W}\SpecialCharTok{\{}\NormalTok{num\_layers}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{] }\OperatorTok{@}\NormalTok{ h }\OperatorTok{+}\NormalTok{ params[}\SpecialStringTok{f"b}\SpecialCharTok{\{}\NormalTok{num\_layers}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{]}
-    \ControlFlowTok{return}\NormalTok{ out.squeeze()}
-
-\KeywordTok{def}\NormalTok{ deep\_utility(x, params):}
-    \ControlFlowTok{return}\NormalTok{ np.array([deep\_forward(np.array([xi]), params) }\ControlFlowTok{for}\NormalTok{ xi }\KeywordTok{in}\NormalTok{ x])}
-
-\CommentTok{\# {-}{-}{-} Log likelihood (Thurstone probit model) {-}{-}{-}}
-\KeywordTok{def}\NormalTok{ deep\_log\_likelihood(params, pairs):}
-\NormalTok{    ll }\OperatorTok{=} \FloatTok{0.0}
-    \ControlFlowTok{for}\NormalTok{ xi, xj, \_ }\KeywordTok{in}\NormalTok{ pairs:}
-\NormalTok{        ui }\OperatorTok{=}\NormalTok{ deep\_forward(np.array([xi]), params)}
-\NormalTok{        uj }\OperatorTok{=}\NormalTok{ deep\_forward(np.array([xj]), params)}
-\NormalTok{        ll }\OperatorTok{+=}\NormalTok{ np.log(norm.cdf((ui }\OperatorTok{{-}}\NormalTok{ uj) }\OperatorTok{/}\NormalTok{ np.sqrt(}\DecValTok{2}\NormalTok{)) }\OperatorTok{+} \FloatTok{1e{-}6}\NormalTok{)}
-    \ControlFlowTok{return}\NormalTok{ ll}
-
-\CommentTok{\# {-}{-}{-} Gaussian prior on weights {-}{-}{-}}
-\KeywordTok{def}\NormalTok{ deep\_log\_prior(params):}
-\NormalTok{    lp }\OperatorTok{=} \FloatTok{0.0}
-    \ControlFlowTok{for}\NormalTok{ v }\KeywordTok{in}\NormalTok{ params.values():}
-\NormalTok{        lp }\OperatorTok{{-}=} \FloatTok{0.5} \OperatorTok{*}\NormalTok{ np.}\BuiltInTok{sum}\NormalTok{(v}\OperatorTok{**}\DecValTok{2}\NormalTok{)}
-    \ControlFlowTok{return}\NormalTok{ lp}
-
-\CommentTok{\# {-}{-}{-} Proposal distribution {-}{-}{-}}
-\KeywordTok{def}\NormalTok{ deep\_propose(params, sigma}\OperatorTok{=}\FloatTok{0.1}\NormalTok{):}
-\NormalTok{    new\_params }\OperatorTok{=}\NormalTok{ \{\}}
-    \ControlFlowTok{for}\NormalTok{ k, v }\KeywordTok{in}\NormalTok{ params.items():}
-\NormalTok{        new\_params[k] }\OperatorTok{=}\NormalTok{ v }\OperatorTok{+}\NormalTok{ np.random.randn(}\OperatorTok{*}\NormalTok{v.shape) }\OperatorTok{*}\NormalTok{ sigma}
-    \ControlFlowTok{return}\NormalTok{ new\_params}
-
-\CommentTok{\# {-}{-}{-} Metropolis{-}Hastings sampling {-}{-}{-}}
-\KeywordTok{def}\NormalTok{ deep\_mh(init\_params, pairs, num\_iters}\OperatorTok{=}\DecValTok{2000}\NormalTok{, burn\_in}\OperatorTok{=}\DecValTok{500}\NormalTok{):}
-\NormalTok{    samples }\OperatorTok{=}\NormalTok{ []}
-\NormalTok{    current }\OperatorTok{=}\NormalTok{ init\_params}
-\NormalTok{    current\_lp }\OperatorTok{=}\NormalTok{ deep\_log\_likelihood(current, pairs) }\OperatorTok{+}\NormalTok{ deep\_log\_prior(current)}
-
-    \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(num\_iters):}
-\NormalTok{        proposal }\OperatorTok{=}\NormalTok{ deep\_propose(current)}
-\NormalTok{        proposal\_lp }\OperatorTok{=}\NormalTok{ deep\_log\_likelihood(proposal, pairs) }\OperatorTok{+}\NormalTok{ deep\_log\_prior(proposal)}
-\NormalTok{        accept\_prob }\OperatorTok{=}\NormalTok{ np.exp(proposal\_lp }\OperatorTok{{-}}\NormalTok{ current\_lp)}
-        \ControlFlowTok{if}\NormalTok{ np.random.rand() }\OperatorTok{\textless{}}\NormalTok{ accept\_prob:}
-\NormalTok{            current }\OperatorTok{=}\NormalTok{ proposal}
-\NormalTok{            current\_lp }\OperatorTok{=}\NormalTok{ proposal\_lp}
-        \ControlFlowTok{if}\NormalTok{ i }\OperatorTok{\textgreater{}=}\NormalTok{ burn\_in:}
-\NormalTok{            samples.append(current)}
-
-    \ControlFlowTok{return}\NormalTok{ samples}
-
-\CommentTok{\# {-}{-}{-} Run MCMC {-}{-}{-}}
-\NormalTok{deep\_samples }\OperatorTok{=}\NormalTok{ deep\_mh(init\_deep\_params(), pairs, num\_iters}\OperatorTok{=}\DecValTok{40000}\NormalTok{, burn\_in}\OperatorTok{=}\DecValTok{1000}\NormalTok{)}
-
-\CommentTok{\# {-}{-}{-} Posterior predictions {-}{-}{-}}
-\NormalTok{X\_test }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{200}\NormalTok{)}
-\NormalTok{deep\_preds }\OperatorTok{=}\NormalTok{ np.array([deep\_utility(X\_test, s) }\ControlFlowTok{for}\NormalTok{ s }\KeywordTok{in}\NormalTok{ deep\_samples])}
-\NormalTok{deep\_mean }\OperatorTok{=}\NormalTok{ deep\_preds.mean(axis}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
-\NormalTok{deep\_std }\OperatorTok{=}\NormalTok{ deep\_preds.std(axis}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
-
-\CommentTok{\# {-}{-}{-} Plot results {-}{-}{-}}
-\NormalTok{plt.figure(figsize}\OperatorTok{=}\NormalTok{(}\DecValTok{8}\NormalTok{, }\DecValTok{4}\NormalTok{))}
-\NormalTok{plt.plot(X\_test, true\_u(X\_test), }\StringTok{\textquotesingle{}k{-}{-}\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}True utility\textquotesingle{}}\NormalTok{)}
-\NormalTok{plt.plot(X\_test, deep\_mean, }\StringTok{\textquotesingle{}b{-}\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Posterior mean\textquotesingle{}}\NormalTok{)}
-\NormalTok{plt.fill\_between(X\_test, deep\_mean }\OperatorTok{{-}} \FloatTok{1.96} \OperatorTok{*}\NormalTok{ deep\_std, deep\_mean }\OperatorTok{+} \FloatTok{1.96} \OperatorTok{*}\NormalTok{ deep\_std,}
-\NormalTok{                 color}\OperatorTok{=}\StringTok{\textquotesingle{}blue\textquotesingle{}}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.2}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}95\% CI\textquotesingle{}}\NormalTok{)}
-\NormalTok{plt.title(}\StringTok{"3{-}layer Bayesian Neural Network via MCMC on Preference Data"}\NormalTok{)}
-\NormalTok{plt.xlabel(}\StringTok{"x"}\NormalTok{)}
-\NormalTok{plt.ylabel(}\StringTok{"Utility"}\NormalTok{)}
-\NormalTok{plt.legend()}
-\NormalTok{plt.tight\_layout()}
-\NormalTok{plt.show()}
-
-\CommentTok{\# Flatten and PCA}
-\KeywordTok{def}\NormalTok{ flatten\_params(params):}
-    \ControlFlowTok{return}\NormalTok{ np.concatenate([v.flatten() }\ControlFlowTok{for}\NormalTok{ v }\KeywordTok{in}\NormalTok{ params.values()])}
-
-\NormalTok{flat\_samples }\OperatorTok{=}\NormalTok{ np.array([flatten\_params(p) }\ControlFlowTok{for}\NormalTok{ p }\KeywordTok{in}\NormalTok{ deep\_samples])}
-\NormalTok{pca }\OperatorTok{=}\NormalTok{ PCA(n\_components}\OperatorTok{=}\DecValTok{2}\NormalTok{)}
-\NormalTok{proj\_samples }\OperatorTok{=}\NormalTok{ pca.fit\_transform(flat\_samples)}
-
-\CommentTok{\# Density heatmap from KDE}
-\NormalTok{kde }\OperatorTok{=}\NormalTok{ gaussian\_kde(proj\_samples.T)}
-\NormalTok{xmin, xmax }\OperatorTok{=}\NormalTok{ proj\_samples[:, }\DecValTok{0}\NormalTok{].}\BuiltInTok{min}\NormalTok{(), proj\_samples[:, }\DecValTok{0}\NormalTok{].}\BuiltInTok{max}\NormalTok{()}
-\NormalTok{ymin, ymax }\OperatorTok{=}\NormalTok{ proj\_samples[:, }\DecValTok{1}\NormalTok{].}\BuiltInTok{min}\NormalTok{(), proj\_samples[:, }\DecValTok{1}\NormalTok{].}\BuiltInTok{max}\NormalTok{()}
-\NormalTok{xx, yy }\OperatorTok{=}\NormalTok{ np.meshgrid(np.linspace(xmin, xmax, }\DecValTok{100}\NormalTok{), np.linspace(ymin, ymax, }\DecValTok{100}\NormalTok{))}
-\NormalTok{zz }\OperatorTok{=}\NormalTok{ kde(np.vstack([xx.ravel(), yy.ravel()])).reshape(xx.shape)}
-
-\CommentTok{\# Downsample to 80 frames}
-\NormalTok{n\_frames }\OperatorTok{=} \DecValTok{80}
-\NormalTok{idx }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\BuiltInTok{len}\NormalTok{(proj\_samples) }\OperatorTok{{-}} \DecValTok{1}\NormalTok{, n\_frames).astype(}\BuiltInTok{int}\NormalTok{)}
-\NormalTok{proj\_subset }\OperatorTok{=}\NormalTok{ proj\_samples[idx]}
-
-\CommentTok{\# Animation with heatmap}
-\NormalTok{fig, ax }\OperatorTok{=}\NormalTok{ plt.subplots(figsize}\OperatorTok{=}\NormalTok{(}\DecValTok{6}\NormalTok{, }\DecValTok{5}\NormalTok{))}
-\NormalTok{ax.contourf(xx, yy, zz, levels}\OperatorTok{=}\DecValTok{30}\NormalTok{, cmap}\OperatorTok{=}\StringTok{"Blues"}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.5}\NormalTok{)}
-
-\NormalTok{line, }\OperatorTok{=}\NormalTok{ ax.plot([], [], }\StringTok{\textquotesingle{}r{-}o\textquotesingle{}}\NormalTok{, markersize}\OperatorTok{=}\DecValTok{3}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.8}\NormalTok{, label}\OperatorTok{=}\StringTok{"Chain trajectory"}\NormalTok{)}
-\NormalTok{start\_point }\OperatorTok{=}\NormalTok{ ax.plot([], [], }\StringTok{\textquotesingle{}go\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Start\textquotesingle{}}\NormalTok{)[}\DecValTok{0}\NormalTok{]}
-\NormalTok{end\_point }\OperatorTok{=}\NormalTok{ ax.plot([], [], }\StringTok{\textquotesingle{}ko\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Current\textquotesingle{}}\NormalTok{)[}\DecValTok{0}\NormalTok{]}
-
-\NormalTok{ax.set\_xlim(xmin }\OperatorTok{{-}} \FloatTok{0.2}\NormalTok{, xmax }\OperatorTok{+} \FloatTok{0.2}\NormalTok{)}
-\NormalTok{ax.set\_ylim(ymin }\OperatorTok{{-}} \FloatTok{0.2}\NormalTok{, ymax }\OperatorTok{+} \FloatTok{0.2}\NormalTok{)}
-\NormalTok{ax.set\_title(}\StringTok{"Posterior exploration via Markov chain simulation"}\NormalTok{)}
-\NormalTok{ax.set\_xlabel(}\StringTok{"PCA1"}\NormalTok{)}
-\NormalTok{ax.set\_ylabel(}\StringTok{"PCA2"}\NormalTok{)}
-\NormalTok{ax.legend(loc}\OperatorTok{=}\StringTok{"upper left"}\NormalTok{)}
-
-\KeywordTok{def}\NormalTok{ update(frame):}
-\NormalTok{    line.set\_data(proj\_subset[:frame}\OperatorTok{+}\DecValTok{1}\NormalTok{, }\DecValTok{0}\NormalTok{], proj\_subset[:frame}\OperatorTok{+}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{])}
-\NormalTok{    start\_point.set\_data([proj\_subset[}\DecValTok{0}\NormalTok{, }\DecValTok{0}\NormalTok{]], [proj\_subset[}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{]])}
-\NormalTok{    end\_point.set\_data([proj\_subset[frame, }\DecValTok{0}\NormalTok{]], [proj\_subset[frame, }\DecValTok{1}\NormalTok{]])}
-    \ControlFlowTok{return}\NormalTok{ line, start\_point, end\_point}
-
-\NormalTok{ani }\OperatorTok{=}\NormalTok{ animation.FuncAnimation(fig, update, frames}\OperatorTok{=}\NormalTok{n\_frames, interval}\OperatorTok{=}\DecValTok{80}\NormalTok{, blit}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
-\NormalTok{HTML(ani.to\_jshtml())}
-\end{Highlighting}
-\end{Shaded}
-
-\pandocbounded{\includegraphics[keepaspectratio]{src/chap3_files/figure-pdf/cell-6-output-1.pdf}}
-
-\begin{verbatim}
-<IPython.core.display.HTML object>
-\end{verbatim}
-
-\pandocbounded{\includegraphics[keepaspectratio]{src/chap3_files/figure-pdf/cell-6-output-3.pdf}}
-
-Another Bayesian approach is Bayesian Inverse Reinforcement Learning
-(IRL), where a prior is placed on the parameters of a reward function
-and Bayes' rule is used to update this distribution given demonstrations
-or preferences
-(\citeproc{ref-iliad2019learning}{\textbf{iliad2019learning?}}). Early
-work like Ramachandran \& Amir (2007) treated IRL as Bayesian inference,
-using MCMC to sample likely reward functions consistent with
-demonstrations. Such methods yield a posterior over reward functions,
-reflecting ambiguity when multiple rewards explain the human's behavior.
-
-In summary, Bayesian utility learning methods acknowledge that with
-limited human feedback, many possible utility functions might be
-compatible with the data. They keep track of this ambiguity, which is
-crucial for making cautious decisions and for actively gathering more
-feedback.
-
-\section*{Case Study: Learning from Human Feedback in
-Robotics}\label{case-study-learning-from-human-feedback-in-robotics}
-\addcontentsline{toc}{section}{Case Study: Learning from Human Feedback
-in Robotics}
-
-\markright{Case Study: Learning from Human Feedback in Robotics}
-
-Thus far, we discussed preference learning in general terms. We now
-focus on robotics, where an agent must learn a \emph{reward/utility
-function} that captures the human's objectives for a \emph{sequential
-decision-making} task. Robotics brings additional challenges: the
-utility often depends on a trajectory of states and actions, and
-feedback can come in multiple forms. We outline several key forms of
-human feedback for robot learning and how to learn from them:
-
-\begin{itemize}
-\tightlist
-\item
-  Learning from demonstrations -- inferring utility from expert
-  demonstrations of the task.
-\item
-  Learning from physical corrections -- updating utility when a human
-  physically intervenes in the robot's behavior.
-\item
-  Learning from trajectory evaluations -- using human-provided scores or
-  critiques of full trajectories.
-\item
-  Learning from pairwise trajectory comparisons -- inferring reward from
-  which of two trajectories a human prefers.
-\end{itemize}
-
-These are not mutually exclusive; in practice, combinations can be very
-powerful
-(\citeproc{ref-iliad2019learning}{\textbf{iliad2019learning?}}). We
-describe each mode and how utility functions can be derived.
-
-\subsection*{Learning from Demonstrations (Inverse Reinforcement
-Learning)}\label{learning-from-demonstrations-inverse-reinforcement-learning}
-\addcontentsline{toc}{subsection}{Learning from Demonstrations (Inverse
-Reinforcement Learning)}
-
-In Learning from Demonstrations, also known as Inverse Reinforcement
-Learning, the human provides examples of desired behavior
-(e.g.~teleoperating a robot to show how to perform a task). The
-assumption is that the demonstrator is approximately optimizing some
-latent reward function \(R^*(s,a)\) (or utility for trajectories). IRL
-algorithms then search for a reward function \(R_\theta\) under which
-the given demonstrations \(\tau_{demo}\) have high expected return
-(\citeproc{ref-iliad2019learning}{\textbf{iliad2019learning?}}).
-
-One classic approach is \emph{Maximum Margin IRL}, which finds a reward
-function that makes the return of the demonstration trajectories higher
-than that of any other trajectories by a large margin. Another is
-\emph{Maximum Entropy IRL}, which models the demonstrator as noisily
-optimal (Boltzmann-rational)
-(\citeproc{ref-iliad2019learning}{\textbf{iliad2019learning?}}). In
-MaxEnt IRL, the probability of a trajectory \(\tau\) under reward
-parameters \(\theta\) is modeled as:
-
-\[
-P(\tau \mid \theta) = \frac{\exp\{R_\theta(\tau)\}}{\displaystyle \sum_{\tau'} \exp\{R_\theta(\tau')\}} \,,
-\]
-
-where \(R_\theta(\tau) = \sum_{t} r_\theta(s_t, a_t)\) is the cumulative
-reward of \(\tau\). The IRL algorithm then adjusts \(\theta\) to
-maximize the likelihood of the human demonstrations (while often using
-techniques to approximate the denominator, since summing over all
-trajectories is intractable). The end result is a reward function
-\(R_\theta(s,a)\) that rationalizes the demonstrations.
-
-\emph{Key challenge:} unless demonstrations are \emph{optimal} and cover
-the space well, IRL might recover an ambiguous or incorrect reward. In
-robotics, humans often have difficulty providing flawless demonstrations
-(due to hard-to-use interfaces or limited expertise)
-(\citeproc{ref-iliad2019learning}{\textbf{iliad2019learning?}}). For
-example, users teleoperating a robot arm might move jerkily or only
-accomplish part of the task
-(\citeproc{ref-iliad2019learning}{\textbf{iliad2019learning?}}). This
-makes sole reliance on demonstrations problematic. Nonetheless,
-demonstration data can provide a strong prior: it shows at least one way
-to succeed (or partial preferences for certain behaviors).
-
-\subsection*{Learning from Preferences and Rankings of
-Trajectories}\label{learning-from-preferences-and-rankings-of-trajectories}
-\addcontentsline{toc}{subsection}{Learning from Preferences and Rankings
-of Trajectories}
-
-When high-quality demonstrations are hard to obtain, preference queries
-on trajectories are a viable alternative
-(\citeproc{ref-iliad2019learning}{\textbf{iliad2019learning?}}). In
-preference-based learning for robotics, the robot (or algorithm)
-presents two (or more) trajectories of the task outcome, and the human
-chooses which one is better. Each such comparison provides a bit of
-information about the true underlying reward. By asking many queries,
-the algorithm can home in on the reward function that explains the
-human's choices
-(\citeproc{ref-iliad2019learning}{\textbf{iliad2019learning?}}).
-
-A concrete example is an agent learning to do a backflip in simulation
-(\citeproc{ref-christiano2023deep}{Christiano et al. 2023}). The agent
-initially performs random flails. The system then repeatedly shows the
-human \emph{two video clips} of the agent's behavior and asks which is
-closer to a proper backflip. From these comparisons, a reward model is
-learned that assigns higher value to behaviors more like backflips
-(\citeproc{ref-christiano2023deep}{Christiano et al. 2023}). The agent
-then uses reinforcement learning to optimize this learned reward,
-gradually performing better backflips. This process continues, with the
-human being asked comparisons on trajectories where the algorithm is
-most uncertain (to maximally inform the reward model)
-(\citeproc{ref-christiano2023deep}{Christiano et al. 2023}).
-
-(\href{https://openai.com/index/learning-from-human-preferences/}{Learning
-from human preferences \textbar{} OpenAI}) \emph{Framework for learning
-from human preferences in robotics: a reward predictor (utility
-function) is learned from human feedback on trajectory comparisons, and
-an RL algorithm uses this learned reward to improve the policy
-(\citeproc{ref-christiano2023deep}{Christiano et al. 2023}). The loop is
-iterative: as the policy improves, new queries focus on areas of
-uncertainty to refine the reward model.}
-
-Such preference-based reward learning has enabled complex skills without
-explicitly programmed rewards
-(\citeproc{ref-christiano2023deep}{Christiano et al. 2023}). Notably,
-Christiano \emph{et al.} (2017) showed that an agent can learn Atari
-game policies and robotic manipulations from a few hundred comparison
-queries, achieving goals that are hard to specify but easy to judge
-(\citeproc{ref-christiano2023deep}{Christiano et al. 2023}). Preferences
-are often easier for humans than demonstrations: choosing between
-options is simpler than generating one from scratch
-(\citeproc{ref-iliad2019learning}{\textbf{iliad2019learning?}}).
-However, preference learning can be slow if each query only yields one
-bit of information. Active learning and combining preferences with other
-feedback can greatly improve efficiency
-(\citeproc{ref-iliad2019learning}{\textbf{iliad2019learning?}}).
-
-\subsection*{Learning from Trajectory Evaluations (Critiques and
-Ratings)}\label{learning-from-trajectory-evaluations-critiques-and-ratings}
-\addcontentsline{toc}{subsection}{Learning from Trajectory Evaluations
-(Critiques and Ratings)}
-
-Sometimes humans provide feedback in the form of \emph{evaluative
-scores} or critiques on full trajectories (or partial trajectories). For
-example, after a robot finishes an attempt at a task, the human might
-give a reward signal (e.g.~+1/-1, or a rating 1--5 stars, or say ``too
-slow'' vs ``good job''). This is the premise of the TAMER framework
-(Training an Agent Manually via Evaluative Reinforcement) and related approaches,
-where a human's scalar reward signals are directly treated as the reward
-function for the agent in reinforcement learning.
-
-From a utility learning perspective, such feedback can be used to
-directly fit a utility model \(u_\theta\) that predicts the human's
-rating for a given trajectory. For instance, if a human provides a score
-\(H(\tau)\) for trajectory \(\tau\), one can treat it as a training
-target for \(u_\theta(\tau)\) (possibly under a regression loss).
-However, because humans are inconsistent and may not precisely quantify
-their preferences, it's often useful to model \(H(\tau)\) as a noisy
-realization of the underlying utility, rather than a perfect label. A
-Bayesian approach could treat \(H(\tau)\) as a noisy observation of
-\(u(\tau)\) and update a posterior for \(u\). Alternatively,
-classification approaches can be used (e.g.~sort trajectories into
-``liked'' vs ``disliked'' based on thresholded ratings).
-
-A challenge with trajectory-level feedback is \emph{credit assignment}:
-the human's single score must be attributed to the entire sequence of
-actions. Algorithms like COACH (COnvergent Actor-Critic by Humans)
-address this by allowing humans to give feedback at
-intermediate steps, thereby guiding the agent which specific part of the
-behavior was good or bad. In either case, learning from trajectory
-evaluations turns the human into a \emph{reward function provider}, and
-the learning algorithm's job is to infer the latent reward function that
-the human's evaluations are trying to convey.
-
-\subsection*{Learning from Physical
-Corrections}\label{learning-from-physical-corrections}
-\addcontentsline{toc}{subsection}{Learning from Physical Corrections}
-
-Robots that physically collaborate with humans can receive physical
-corrections: the human may push the robot or otherwise intervene to
-adjust its behavior. Such corrections provide insight into the human's
-desired utility. For example, if a household robot is carrying a fragile
-object too recklessly and the human physically slows it down or
-re-routes it, that indicates the human's reward favors safety over speed
-at that moment.
-
-Learning from physical corrections can be formalized in different ways.
-One approach is to treat a correction as a demonstration on a small
-segment: the human's intervention suggests a better action or trajectory
-than what the robot was doing. This can be converted into a comparison:
-``the trajectory after correction is preferred over the original
-trajectory'' for that time segment. The robot can then update its reward
-function \(\theta\) to satisfy
-\(R_\theta(\text{human-corrected behavior}) > R_\theta(\text{robot’s initial behavior})\).
-Repeated corrections yield a dataset of such pairwise preferences,
-focused on the states where the robot was wrong
-(\citeproc{ref-losey2021corrections}{\textbf{losey2021corrections?}}).
-
-Another approach is to infer the human's intent through the sequence of
-corrections. Research by Losey \emph{et al.} (2021) formalized learning
-from \emph{sequences} of physical corrections, noting that each
-correction is not independent: a series of pushes might only make sense
-in aggregate
-(\citeproc{ref-losey2021corrections}{\textbf{losey2021corrections?}}).
-By analyzing the cumulative effect of multiple interventions, the
-algorithm can deduce the underlying objective more accurately (e.g.~the
-human consistently steers the robot away from the table edges, implying
-a high negative reward for collisions). Their algorithm introduced an
-auxiliary reward term to capture the human's trade-off: they will
-correct the robot if the immediate mistake is worth fixing relative to
-long-term performance
-(\citeproc{ref-losey2021corrections}{\textbf{losey2021corrections?}}).
-The conclusion was that reasoning over the sequence of corrections
-improved learning of the human's objective
-(\citeproc{ref-losey2021corrections}{\textbf{losey2021corrections?}}).
-
-Physical corrections are intuitive for humans -- we often instinctively
-guide others or objects when they err. For the robot, interpreting this
-guidance requires converting it into constraints or examples for the
-utility function. It is a powerful signal because it is \emph{active}:
-the human is not just telling preferences but directly imparting the
-desired direction of change.
-
-\subsection*{Combining Multiple Feedback
-Types}\label{combining-multiple-feedback-types}
-\addcontentsline{toc}{subsection}{Combining Multiple Feedback Types}
-
-Each feedback modality has strengths and weaknesses. Demonstrations
-provide a lot of information but can be hard to perform; preferences are
-easy for humans but yield information slowly; corrections are very
-informative locally but require physical interaction; trajectory
-evaluations are straightforward but coarse. Combining these modes can
-lead to faster and more robust reward learning
-(\citeproc{ref-iliad2019learning}{\textbf{iliad2019learning?}}). For
-example, the DemPref algorithm
-(\citeproc{ref-iliad2019learning}{\textbf{iliad2019learning?}}) first
-uses demonstrations to get an initial rough reward model, then uses
-preference queries to refine it quickly
-(\citeproc{ref-iliad2019learning}{\textbf{iliad2019learning?}}). In user
-studies, such combined approaches learned better rewards with fewer
-queries than using either alone
-(\citeproc{ref-iliad2019learning}{\textbf{iliad2019learning?}}).
-
-In practical robot learning systems, one might start by asking for a
-demonstration. If the demo is suboptimal, the system can then ask
-preference questions on alternative attempts to clarify the true goal.
-During actual execution, if the human intervenes, the robot updates its
-reward function on the fly to avoid repeating the mistake. This
-\emph{interactive reward learning} loop continues until the robot's
-behavior aligns with human intent.
-
-\section{Summary}\label{summary}
-
-Learning utility functions from human preferences enables value
-alignment: aligning an AI system's objectives with what humans actually
-want, rather than what we \emph{think} we want in the abstract. We covered
-how supervised learning can extract utilities from comparisons or
-scores, and how Bayesian methods like Gaussian processes and Bayesian
-neural nets can capture uncertainty in our inferences. In robotics, we
-saw that feedback can come in many forms -- demonstrations, comparisons,
-corrections, evaluations -- each providing a unique window into the
-human's utility function. By intelligently combining these signals,
-robots can efficiently learn complex reward functions that would be
-extremely difficult to hand-code.
-
-Key takeaways and best practices include:
-
-\begin{itemize}
-\tightlist
-\item
-  \emph{Use the right feedback for the problem:} If optimal examples are
-  available, demonstrations jump-start learning. If not, pairwise
-  preferences or scalar critiques might be easier to obtain.
-\item
-  \emph{Model uncertainty:} Knowing what the system doesn't know (via a
-  Bayesian model) allows for smart query selection and avoids
-  overconfidently optimizing the wrong objective.
-\item
-  \emph{Iterate with the human:} Preference learning is fundamentally an
-  interactive process. An agent can query a human in ambiguous cases and
-  continuously refine the utility estimate
-  (\citeproc{ref-christiano2023deep}{Christiano et al. 2023}).
-\item
-  \emph{Validate the learned utility:} Once a reward is learned, testing
-  the robot's policy and having humans verify or correct it is crucial.
-  Even a few manual corrections can reveal if the learned reward misses
-  a key aspect, allowing further refinement.
-\item
-  \emph{Be aware of scaling and bias:} Human feedback can be noisy or
-  biased. Techniques like DPO suggest ways to simplify learning and
-  avoid instability, but one should monitor for issues like reward
-  hacking or unintended solutions, intervening with additional feedback
-  as needed.
-\end{itemize}
-
-Learning from human preferences is a rich area of ongoing research. It
-lies at the intersection of machine learning, human-computer
-interaction, and ethics. As AI systems become more advanced, the
-importance of teaching them \emph{our} utility functions (and not
-mis-specified proxies) grows. The methods discussed in this chapter are
-building blocks toward AI that truly understands and pursues what humans
-value, acquired through learning \emph{with} humans in the loop rather
-than in isolation. By mastering these techniques, we move closer to AI
-and robots that can be trusted to make decisions aligned with human
-preferences and well-being.
-
-\section*{References}\label{bibliography-2}
-\addcontentsline{toc}{section}{References}
-
-\markright{References}
-
-\phantomsection\label{refs-2}
-\begin{CSLReferences}{1}{0}
-\bibitem[\citeproctext]{ref-christiano2023deep}
-Christiano, Paul, Jan Leike, Tom B. Brown, Miljan Martic, Shane Legg,
-and Dario Amodei. 2023. {``Deep Reinforcement Learning from Human
-Preferences.''} \url{https://arxiv.org/abs/1706.03741}.
-
-\end{CSLReferences}
-
-\bookmarksetup{startatroot}
-
-\chapter{Elicitation}\label{elicitation}
-
-\section{The Active Learning Problem}\label{the-active-learning-problem}
-
-Acquiring labeled data is expensive. Active learning (AL) is a learning
-paradigm that aims to reduce the amount of labeled data required to
-train a model to achieve high accuracy. AL algorithms iteratively select
-an input datapoint for an oracle (e.g., a human annotator) to label such
-that when the label is observed, the model improves the most. The two
-primary setups in AL are pool-based and stream-based. In pool-based AL,
-the model selects samples from a large unlabeled pool of data. For
-example, a model for text classification selects the most uncertain
-texts from a large pool to ask a human annotator to label. In
-stream-based AL, the model receives samples sequentially (one sample at
-a time) and decides whether to label them. The data is gone if the
-decision maker decides not to label it. In AL, a model is trained on the
-current dataset, and a set of candidate points is evaluated for
-potential inclusion. AL selects one of these points to add to the
-dataset based on an ``acquisition function'' defined with respect to the
-current model to estimate the value of each candidate point for
-improving model performance. The dataset is updated with the newly
-queried point, and the cycle repeats until the budget is exhausted or a
-predefined reliability criterion is met.
-
-AL has successfully enhanced various real-world systems. For example, AL
-can improve the computer vision models used in autonomous vehicles
-(\citeproc{ref-AL_app_autonomous}{Jarl et al. 2021}). Probing a model to
-understand what type of data it would benefit from is more practical. In
-robotics, autonomous agents may query humans when unsure how to act when
-facing new situations (\citeproc{ref-AL_app_robotics}{Taylor, Berrueta,
-and Murphey 2021}). Here, collecting data often incurs significant
-financial and time costs because physical robot arms wear out over time.
-In meteorology, AL can help decide where to place additional sensors for
-weather predictions (\citeproc{ref-AL_app_sensors}{Singh, Nowak, and
-Ramanathan 2006}). Sensor placement involves deploying teams to remote
-locations and expensive construction for an extra data point. Choosing
-these locations and allocating resources wisely is of interest to
-governments and businesses. AL could also be employed to select data for
-fine-tuning large language models (LLMs) for specific downstream tasks
-(\citeproc{ref-AL_app_LLMs}{Margatina et al. 2023}). Here, it might be
-difficult to fully describe a targeted NLP task. Often, instead of
-defining a task via a dataset of examples, it may be easier for a human
-to interact with the LLM for a specific use case, identify gaps in the
-model, and address those using AL.
-
-Typically, in robotics, robots learn by observing human demonstrations.
-However, expert demonstrations are often limited, and training a
-supervised learning model would require vast amounts of demonstration
-data, which is difficult to obtain at scale. Demonstrations tend to be
-variable, reflecting the actions of individual humans, making the data
-collection process inconsistent. To address these limitations,
-alternative approaches have been proposed, such as using pairwise
-comparisons, where humans evaluate two action trajectories to determine
-the superior one, or employing physical corrections, in which reward
-functions are learned through human-robot interactions, with humans
-guiding the robot's actions during the task. AL algorithms can be
-employed in preference learning tasks, where the objective is to develop
-a model that aligns with human preferences while minimizing the need for
-extensive labeled data or reducing the high cost of annotations.
-
-Motivated by the pairwise preference setting, we consider a binary
-classification problem. The model is trained on a small labeled dataset
-\(\mathcal{D} = \{(x_i, y_i)\}_{i=1}^N\), where \(x_i\) represents the
-input data and \(y_i\) is the corresponding label. The model is
-uncertain about the class labels of some data points and can query an
-oracle to obtain the true labels of these data points. The goal is to
-minimize the number of queries to the oracle while maximizing the
-model's performance. Here, the value of a datapoint is in how much it
-helps identify the underlying model, and this notion of informativeness
-is often quantified with uncertainty. Two primary types of uncertainty are
-often considered: epistemic and aleatoric uncertainty. Epistemic
-uncertainty, or model uncertainty, arises from a lack of knowledge and
-can be reduced by acquiring more data. This type of uncertainty is
-especially significant when the model lacks confidence due to
-insufficient or incomplete information in its training set. On the other
-hand, aleatoric uncertainty, or data uncertainty, stems from the
-inherent randomness within the data itself. Unlike epistemic
-uncertainty, aleatoric uncertainty cannot be reduced, even with
-additional data, as it reflects noise or unpredictability in the real
-data-generating process. AL often focuses on selecting data that reduce
-the epistemic uncertainty.
-
-There are several methods for quantifying model uncertainty. Bayesian
-methods, such as Bayesian neural networks and Gaussian processes, offer
-a principled way of estimating uncertainty through the parameter posterior
-distribution, obtained by updating a prior distribution over model parameters.
-Exact posterior computation can become computationally prohibitive,
-especially for complex likelihood functions, and approximate Bayesian
-inference has been proposed to address this. For example, ensemble methods
-involve training multiple models and combining their predictions to
-provide an estimate of uncertainty. Ensemble methods are relatively easy
-to implement, but they are noisy and still somewhat expensive. Conformal
-prediction methods also provide a framework for estimating uncertainty
-by offering a measure of confidence in predictions based on the
-conformity of a given instance with the training data.
-
-\section{Estimating the Value of Additional Data with Acquisition
-Function}\label{estimating-the-value-of-additional-data-with-acquisition-function}
-
-Uncertainty quantification plays a vital role in acquisition functions,
-which are central to AL strategies. These functions determine which
-samples are most valuable to label by evaluating their utility based on
-the model's current uncertainty estimates. Common acquisition functions
-include uncertainty sampling (\citeproc{ref-AL_uncertainty}{Zhu et al.
-2010}), which selects samples the model is least confident about,
-query-by-committee (\citeproc{ref-AL_committee}{Beluch et al. 2018}),
-which utilizes a set of models to choose the most uncertain samples, and
-Bayesian AL by Disagreement (BALD) (\citeproc{ref-AL_BALD}{Houlsby et
-al. 2011}), which selects samples that maximize information gain by
-reducing model uncertainty. Through careful uncertainty quantification,
-acquisition functions guide the AL process, improving the model's
-efficiency in learning from limited data. Other acquisition functions
-that can be employed include:
-
-\begin{itemize}
-\item
-  Expected model change (\citeproc{ref-AL_expmodelchange}{Cai, Zhang,
-  and Zhou 2013}): This approach focuses on labeling points that would
-  have the most impact on changing the current model parameters.
-\item
-  Expected error reduction (\citeproc{ref-AL_experrorredn}{Mussmann et
-  al. 2022}): Points that would most effectively reduce the model's
-  generalization error are labeled using this strategy.
-\item
-  Variance reduction (\citeproc{ref-AL_variance}{Cohn, Ghahramani, and
-  Jordan 1996}): This approach labels points that would minimize output
-  variance, which is one component of error. By selecting points that
-  reduce variability in the model's predictions, it aims to improve
-  overall performance.
-\end{itemize}
-
-Uncertainty sampling (\citeproc{ref-AL_uncertainty}{Zhu et al. 2010})
-selects data points for which the model exhibits the greatest
-uncertainty, focusing labeling efforts on ambiguous samples where
-additional information is likely to yield the greatest benefit. Several
-acquisition strategies fall under uncertainty sampling, including
-entropy sampling, margin sampling, and least confidence sampling.
-Entropy sampling measures the value of additional data by the entropy of the
-predicted probability distribution:
-\(\alpha(x) = - \sum_{y} p(y|x) \log p(y|x)\). Margin sampling focuses
-on the difference between the two highest predicted probabilities for a
-sample: \(\alpha(x) = p(y_1|x) - p(y_2|x)\), where \(y_1\) and \(y_2\)
-are the two most likely classes (a smaller margin means more uncertainty).
-Least confidence sampling measures the value of additional data by how low
-the probability of the most likely class is: \(\alpha(x) = 1 - p(y_{\text{max}}|x)\),
-where \(y_{\text{max}}\) is the class with the highest probability. Consider a binary
-classification problem with three candidates \(x_1, x_2, x_3\). The code
-below demonstrates that these uncertainty sampling methods yield the same
-conclusion of selecting \(x_1\).
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{import numpy as np}
-
-\NormalTok{\# Predictive distributions}
-\NormalTok{probs = np.array([}
-\NormalTok{    [0.6, 0.4],  \# p(y1|x1), p(y2|x1)}
-\NormalTok{    [0.3, 0.7],  \# p(y1|x2), p(y2|x2)}
-\NormalTok{    [0.8, 0.2],  \# p(y1|x3), p(y2|x3)}
-\NormalTok{])}
-
-\NormalTok{\# Entropy Sampling}
-\NormalTok{entropy = {-}np.sum(probs * np.log(probs), axis=1)}
-\NormalTok{for i, e in enumerate(entropy, start=1):}
-\NormalTok{    print(f"Entropy(x\_\{i\}) = \{e:.2f\}")}
-\NormalTok{\# Find the index with the highest entropy}
-\NormalTok{selected\_index = np.argmax(entropy)}
-\NormalTok{print(f"\textbackslash{}nSelect x\_\{selected\_index + 1\} for labeling (highest entropy = \{entropy[selected\_index]:.2f\})")}
-
-\NormalTok{\# Sort each row in descending order to get the top two class probabilities}
-\NormalTok{sorted\_probs = np.sort(probs, axis=1)[:, ::{-}1]}
-\NormalTok{margin = sorted\_probs[:, 0] {-} sorted\_probs[:, 1]}
-\NormalTok{for i, m in enumerate(margin, start=1):}
-\NormalTok{    print(f"Margin(x\_\{i\}) = \{m:.1f\}")}
-
-\NormalTok{\# Select the index with the smallest margin (most uncertain)}
-\NormalTok{selected\_index = np.argmin(margin)}
-\NormalTok{print(f"\textbackslash{}nSelect x\_\{selected\_index + 1\} for labeling (smallest margin = \{margin[selected\_index]:.1f\})")}
-
-\NormalTok{\# Least confidence sampling}
-\NormalTok{\# Get the highest predicted probability for each sample}
-\NormalTok{max\_probs = np.max(probs, axis=1)}
-\NormalTok{least\_confidence = 1 {-} max\_probs}
-\NormalTok{for i, lc in enumerate(least\_confidence, start=1):}
-\NormalTok{    print(f"alpha(x\_\{i\}) = \{lc:.1f\}")}
-
-\NormalTok{\# Select the index with the highest least confidence score (i.e., most uncertain)}
-\NormalTok{selected\_index = np.argmax(least\_confidence)}
-\NormalTok{print(f"\textbackslash{}nSelect x\_\{selected\_index + 1\} for labeling (highest least confidence = \{least\_confidence[selected\_index]:.1f\})")}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-Query-by-Committee (\citeproc{ref-AL_committee}{Beluch et al. 2018})
-selects samples for labeling based on the level of disagreement among
-members of a committee. Several acquisition functions can be employed
-under this framework to quantify the disagreement. The vote entropy
-measures the uncertainty based on how often the committee members vote
-for each class. The acquisition function is defined as
-\(\alpha(x) = \mathbb{H}\left[V(y)/C\right]\), where \(V(y)\) is the
-number of votes for class \(y\) and \(C\) is the number of committee
-members. Consensus Entropy measures the entropy of the average
-probability distribution across committee members. It is given by
-\(\alpha(x) = \mathbb{H}[p_C(y|x)]\), where \(p_C(y|x)\) is the average
-probability distribution for sample \(x\) across all committee members.
-The KL divergence quantifies the disagreement by comparing the
-probability distribution of each committee member to the average
-distribution. The acquisition function is given by
-\(\alpha(x) = \frac{1}{C} \sum_{c=1}^{C} D_{KL}[p_c(y|x) || p_C(y|x)]\),
-where \(p_c(y|x)\) is the probability distribution of committee member
-\(c\) and \(p_C(y|x)\) is the average distribution across the committee.
-As an example, consider a binary classification problem with three
-candidates \(x_1\), \(x_2\), and \(x_3\) and three committee members.
-The numerical results below show that vote entropy and consensus entropy both select
-\(x_1\).
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{import numpy as np}
-
-\NormalTok{\# Predictive distributions from 3 committee members for 3 samples}
-\NormalTok{\# Shape: (num\_samples, num\_committee\_members, num\_classes)}
-\NormalTok{probs = np.array([}
-\NormalTok{    [[0.6, 0.4], [0.7, 0.3], [0.3, 0.7]],  \# x1}
-\NormalTok{    [[0.3, 0.7], [0.4, 0.6], [0.4, 0.6]],  \# x2}
-\NormalTok{    [[0.8, 0.2], [0.9, 0.1], [0.7, 0.3]],  \# x3}
-\NormalTok{])}
-
-\NormalTok{\# Vote entropy}
-\NormalTok{predicted\_labels = np.argmax(probs, axis=2)  \# Shape: (num\_samples, num\_committee\_members)}
-\NormalTok{num\_classes = probs.shape[2]}
-\NormalTok{vote\_counts = np.array([}
-\NormalTok{    [np.sum(predicted\_labels[i] == c) for c in range(num\_classes)]}
-\NormalTok{    for i in range(predicted\_labels.shape[0])}
-\NormalTok{])}
-\NormalTok{vote\_distributions = vote\_counts / vote\_counts.sum(axis=1, keepdims=True)}
-\NormalTok{vote\_entropy = {-}np.sum(vote\_distributions * np.log(vote\_distributions + 1e{-}12), axis=1)  \# add epsilon to avoid log(0)}
-\NormalTok{for i, ve in enumerate(vote\_entropy, start=1):}
-\NormalTok{    print(f"alpha(x\_\{i\}) = \{ve:.2f\}")}
-\NormalTok{selected\_index = np.argmax(vote\_entropy)}
-\NormalTok{print(f"\textbackslash{}nSelect x\_\{selected\_index + 1\} for labeling (highest vote entropy = \{vote\_entropy[selected\_index]:.2f\})")}
-
-\NormalTok{\# Consensus Entropy}
-\NormalTok{consensus\_probs = np.mean(probs, axis=1)  \# Shape: (num\_samples, num\_classes)}
-\NormalTok{consensus\_entropy = {-}np.sum(consensus\_probs * np.log(consensus\_probs + 1e{-}12), axis=1)  \# add epsilon to avoid log(0)}
-\NormalTok{for i, ce in enumerate(consensus\_entropy, start=1):}
-\NormalTok{    print(f"alpha(x\_\{i\}) = \{ce:.2f\}")}
-\NormalTok{selected\_index = np.argmax(consensus\_entropy)}
-\NormalTok{print(f"\textbackslash{}nSelect x\_\{selected\_index + 1\} for labeling (highest consensus entropy = \{consensus\_entropy[selected\_index]:.2f\})")}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-Bayesian AL by Disagreement (BALD) (\citeproc{ref-AL_BALD}{Houlsby et
-al. 2011}) selects the samples for which the model expects to gain the
-most Shannon information when corresponding labels are observed:
-
-\[
-\begin{aligned}
-&\mathbb{I}(\theta; y|x, \mathcal{D}) = \mathbb{H}[p(y|x, \mathcal{D})] - \mathbb{E}_{p(\theta | \mathcal{D})} [\mathbb{H}[p(y|x, \theta, \mathcal{D})]] \\
-&\mathbb{H}[p(y|x, \mathcal{D})] = \mathbb{H}\left[\int_{\theta} p(y|x, \theta, \mathcal{D}) p(\theta | \mathcal{D}) d\theta\right] \approx \mathbb{H}\left[\frac{1}{N}\sum_{i=1}^{N} p(y|x, \theta_i, \mathcal{D})\right] = \mathbb{H}\left[\overline{p}(y|x, \mathcal{D})\right] \\
-&\mathbb{E}_{p(\theta|\mathcal{D})} [\mathbb{H}[p(y|x, \theta, \mathcal{D})]] = \mathbb{E}_{p(\theta|\mathcal{D})} \left[ - \sum_{y} p(y|x, \theta, \mathcal{D}) \log p(y|x, \theta, \mathcal{D}) \right] \approx - \frac{1}{N} \sum_{i=1}^{N} \left( \sum_{y} p(y|x, \theta_i, \mathcal{D}) \log p(y|x, \theta_i, \mathcal{D}) \right)
-\end{aligned}
-\]
-
-When there is significant disagreement among models, the predictive
-entropy (the first term) will be large, while the expected entropy (the
-second term) will be smaller. This difference represents the degree to
-which the models disagree. BALD selects points where this disagreement
-is maximized. As an example, consider a binary classification problem
-with two classes, \(y_1\) and \(y_2\). We have two samples, \(x_1\) and
-\(x_2\). BALD selects \(x_1\) for labeling.
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\NormalTok{import numpy as np}
-
-\NormalTok{\# First and second time inferences for each sample}
-\NormalTok{\# Shape: (num\_samples, num\_draws, num\_classes)}
-\NormalTok{\# theta\_1 and theta\_2 samples}
-\NormalTok{probs = np.array([}
-\NormalTok{    [[0.6, 0.4], [0.8, 0.2]],  \# x1}
-\NormalTok{    [[0.4, 0.6], [0.5, 0.5]],  \# x2}
-\NormalTok{])}
-
-\NormalTok{\# Step 1: Compute the average predictive distribution (consensus probs)}
-\NormalTok{mean\_probs = np.mean(probs, axis=1)  \# shape: (num\_samples, num\_classes)}
-
-\NormalTok{\# Step 1 continued: Compute entropy of the consensus distribution}
-\NormalTok{consensus\_entropy = {-}np.sum(mean\_probs * np.log(mean\_probs + 1e{-}12), axis=1)}
-
-\NormalTok{\# Step 2: Compute entropy for each model draw}
-\NormalTok{individual\_entropies = {-}np.sum(probs * np.log(probs + 1e{-}12), axis=2)}
-
-\NormalTok{\# Step 2 continued: Average the entropies across model draws}
-\NormalTok{expected\_entropy = np.mean(individual\_entropies, axis=1)}
-
-\NormalTok{\# Step 3: Compute BALD = entropy of mean {-} mean of entropies}
-\NormalTok{bald\_scores = consensus\_entropy {-} expected\_entropy}
-
-\NormalTok{\# Print results}
-\NormalTok{for i, (h, eh, b) in enumerate(zip(consensus\_entropy, expected\_entropy, bald\_scores), start=1):}
-\NormalTok{    print(f"x\_\{i\}:")}
-\NormalTok{    print(f"  Predictive Entropy = \{h:.3f\}")}
-\NormalTok{    print(f"  Expected Entropy   = \{eh:.3f\}")}
-\NormalTok{    print(f"  BALD Score         = \{b:.3f\}")}
-
-\NormalTok{\# Select sample with highest BALD score}
-\NormalTok{selected\_index = np.argmax(bald\_scores)}
-\NormalTok{print(f"\textbackslash{}nSelect x\_\{selected\_index + 1\} for labeling (highest BALD = \{bald\_scores[selected\_index]:.3f\})")}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-AL by Variance Reduction (\citeproc{ref-AL_variance}{Cohn, Ghahramani,
-and Jordan 1996}) is an algorithm designed to select the next data point
-for labeling based on the anticipated reduction in the model's variance.
-The objective is to identify the point \(x \sim p(x)\) that, when
-labeled \(y_x\), will most effectively decrease the model's variance.
-The expected error at a given input \(x\) is
-\(\mathbb{E}_{\hat{y} \sim p(\hat{y} | \mathcal{D}; x), y \sim p(y|x)} (\hat{y} - y)^2\).
-\(\hat{y}\) represents the model's prediction, and \(y\) denotes the
-true label at \(x\). Using bias-variance decomposition
-(\citeproc{ref-bias_variance_orig_paper}{Geman, Bienenstock, and Doursat
-1992}), the expected error is decomposed as \[\begin{aligned}
-\mathbb{E} (\hat{y} - y)^2 = \mathbb{E}[(\hat{y} - \mathbb{E}[y|x]) + (\mathbb{E}[y|x] - y)]^2 = \mathbb{E} [(y - \mathbb{E}[y|x])^2] + 2\mathbb{E} [(\hat{y} - \mathbb{E}[y|x])(\mathbb{E}[y|x] - y)] + \mathbb{E}(\hat{y} - \mathbb{E}[y|x])^2
-\end{aligned}\] where the expectation is taken over
-\(\hat{y} \sim p(\hat{y} | \mathcal{D}; x), y \sim p(y|x)\). The first
-term represents the variance of the true label \(y\), the second term
-evaluates to zero since
-\(\mathbb{E}_{\hat{y}, y}[\mathbb{E}[y|x] - y] = 0\), and the third term
-accounts for the variance of the model's prediction \(\hat{y}\):
-\[\mathbb{E}(\hat{y} - \mathbb{E}[y|x])^2 = \mathbb{E}[(\hat{y} - \mathbb{E}[\hat{y}] + \mathbb{E}[\hat{y}] - \mathbb{E}[y|x])^2] = \mathbb{E}[(\hat{y} - \mathbb{E}[\hat{y}])^2] + (\mathbb{E}[\hat{y}] - \mathbb{E}[y|x])^2\]
-
-Hence,
-\[\mathbb{E} (\hat{y} - y)^2 = \mathbb{E}_{y} [(y - \mathbb{E}[y|x])^2] + (\mathbb{E}_{\hat{y}} [\hat{y} - \mathbb{E}[y|x]] )^2 + \mathbb{E}_{\hat{y}} [(\hat{y} - \mathbb{E}_{\hat{y}}[\hat{y}])^2]\]
-
-Here, the first term signifies the variance of the true label, which
-remains constant for a given \(x\). The second term captures how much
-the average model prediction deviates from the expected true label. The
-third term quantifies the model's uncertainty at \(x\). Cohn,
-Ghahramani, and Jordan (\citeproc{ref-AL_variance}{1996}) denote the
-uncertainty term as
-\(\sigma^2_{\hat{y}} (x | \mathcal{D}) = \mathbb{E}_{\hat{y}} [(\hat{y} - \mathbb{E}_{\hat{y}}[\hat{y}])^2]\).
-The acquisition function is
-\(\mathbb{E}_{p(x)} [\sigma^2_{\hat{y}} (x | \tilde{\mathcal{D}})]\).
-One could rely on an empirical measure, such as the loss on labeled test data, to
-gauge model improvement, which can help decide the termination of data
-acquisition. The size of the data set and its relationship to the loss
-are tied to the model complexity. To evaluate the performance of the variance
-reduction strategy, Cohn, Ghahramani, and Jordan
-(\citeproc{ref-AL_variance}{1996}) study the Arm2D problem. Arm2D is a
-kinematics problem where the learner has to predict the tip position of a
-robotic arm given a set of joint angles
-\(\mathbf{\theta_1}, \mathbf{\theta_2}\). In this analysis, the two
-models are the Gaussian mixture model and locally-weighted regression
-(LOESS). The results show that the variance of the learner decreases
-because the authors selected points to minimize expected variance.
-Additionally, we observe a related decrease in the mean square error
-(MSE) of both models as the dataset size increases. This is a notable
-outcome because the expected learner variance for these models can be
-computed accurately and efficiently relative to a new point. When
-integrated into the general AL loop, this significantly enhances model
-performance. In the case of the locally-weighted regression model
-(\textbf{?@fig-empirical:regress}), it is surprising that if points were
-chosen randomly, the MSE would be highly unstable, with sharp
-fluctuations. However, when AL by variance reduction is applied, using
-expected learner variance as a proxy, the MSE decreases almost smoothly,
-aside from some initial instabilities.
-
-\section{Active Preference Learning with Ideal Point
-Model}\label{active-preference-learning-with-ideal-point-model}
-
-For any \(n\) elements to be ranked, there are \(n!\) possible orderings
-that can result in the correct complete ranking. Given that a lower
-bound on sorting is \(n\log n\), obtaining a guaranteed true ranking over
-\(n\) items requires \(n\log n\) pairwise comparisons if those
-comparisons are chosen at random. This number can be quite high and
-costly in many applications, especially since most ranking information
-comes from humans. The more comparisons they have to make, the more
-money and time is spent. This process can also be inefficient, as some
-comparisons provide more value to the learning process than others,
-making some comparisons a waste. This inefficiency can be detrimental in
-fields like psychology and market research, where comparisons are
-heavily utilized, and a faster process could offer significant benefits.
-The reason the lower bound on the number of comparisons is \(n\log n\)
-is that it assumes no prior information about the underlying space and
-field, so comparisons are chosen at random. However, leveraging the
-structures within the comparison space can provide more information
-about which comparisons are most valuable. For example,
-(\citeproc{ref-geo_paper}{G. and Nowak 2011}) discusses how eye doctors
-have a wide range of options when assigning prescriptions for glasses,
-yet patients do not see them making many comparisons before deciding on
-the best option. This is because eye doctors incorporate domain
-knowledge into the process and only ask clients for comparisons when
-necessary. Applying similar knowledge in the ranking field leads to an
-AL approach that selects data based on the relevance of a comparison
-query toward finding the final \(\sigma(\Theta)\).
-
-G. and Nowak (\citeproc{ref-geo_paper}{2011}) explore AL within data
-that can be embedded in a \(d\)-dimensional embedding space, where
-comparisons between two different items divide the space into halves,
-with one object being superior in each half. By leveraging such
-geometry, the paper develops a geometric AL approach. Let \(\theta\) be
-the item representation in the embedding space. For each ranking
-\(\sigma\), there is a reference point \(r_{\sigma} \in \mathbb{R}^d\),
-such that if \(\theta_{i} \succ \theta_{j}\),
-\(||\theta_i - r_{\sigma}|| < ||\theta_j - r_{\sigma}||\). In other
-words, object \(i\) is closer to the reference point \(r_{\sigma}\) than
-object \(j\). \(\Sigma_{n,d}\) is the set of all possible rankings of
-the \(n\) items that satisfy the above embedding distances condition.
-Not all rankings will satisfy the embedding conditions, but multiple
-rankings might satisfy all those conditions. For every ranking
-\(\sigma\), there is \(M_n(\sigma)\), the number of pairwise comparisons
-needed to identify the ranking. When comparisons are done at random,
-\(\mathbb{E}[M_n(\sigma)] = n\log n\), and it can be reduced by
-incorporating geometry. \(q_{i,j}\) is the query of comparison between
-items \(i\) and \(j\).
-
-As an example, G. and Nowak (\citeproc{ref-geo_paper}{2011}) study a
-2D space with three items: \(\theta_1\), \(\theta_2\), and \(\theta_3\).
-There are pairwise queries \(q_{1,3}\), \(q_{2,3}\), and \(q_{1,2}\)
-between them, denoted by solid lines equidistant from the two items they
-compare. These lines split the \(R^2\) space into halves, with each half
-closer to one of the two items. The paper colors the side of the worse
-object for each query in dark grey and takes the intersection of these
-halves, resulting in the dark grey region in the image. This region
-indicates \(\Sigma_{n,2}\) since all points follow the embedding
-conditions. Specifically, for every point \(r\) in the dark grey area,
-\(||\theta_3 - r|| < ||\theta_2 - r|| < ||\theta_1 - r||\), meaning
-\(\theta_3 < \theta_2 < \theta_1\). Thus, every point \(r\) is one of
-the \(r_\sigma\) representing their respective rankings
-\(\sigma \in \Sigma_{n,2}\). In other words, the paper aims to have the
-reference points and dark grey region closest to the worst object and
-furthest from the best object.
-
-The authors also denote the label for each query \(q_{i,j}\), such as
-label \(y_{i,j} = 1\{q_{i,j}\}\) (for example,
-\(y_{1,2} = 0, y_{3,2} = 1\)). This allows for deciding how to label new
-queries represented by dashed and dotted lines, depending on which items
-each query compares. Focusing on the dotted line, called \(q_{i,4}\),
-where \(i \in \{1,2,3\}\), and considering potential locations of
-\(\theta_4\), the line must be equidistant from one of the three items
-in the picture and \(\theta_4\), meaning \(\theta_4\) can be placed in
-three different locations. If the query performed is \(q_{2,4}\), then
-\(\theta_4\) will be closer to the dark grey area than \(\theta_2\),
-thus \(y_{2,4} = 0\). However, if \(q_{1,4}\) or \(q_{3,4}\) are
-performed, \(\theta_4\) will be further from the dark grey area than
-\(\theta_1\) or \(\theta_3\), meaning \(y_{1,4} = y_{3,4} = 1\). In this
-case, the labels are contradictory and depend on which object they are
-compared with, making such a query \(q_{i,4}\) ambiguous.
-
-In contrast, the authors analyze the dashed line, called \(q_{i,5}\),
-where \(i \in \{1,2,3\}\), and consider potential locations of \(\theta_5\).
-Since the line must be equidistant from one of the three items in the
-picture and \(\theta_5\), it can be placed in three different locations.
-If one of the three potential queries is performed, \(\theta_5\) will be
-closer to the dark grey area than \(\theta_1\), \(\theta_2\), and
-\(\theta_3\), meaning \(y_{1,5} = y_{2,5} = y_{3,5} = 0\). In this case,
-all labels are the same regardless of which object is used, meaning such
-a query will not be contradictory, as all agree on the label. The goal
-is to perform as many ambiguous queries as possible and skip
-non-ambiguous queries to decrease the total \(M_n(\sigma)\).
-Intuitively, if there is contradictory information about a query, it
-needs to be performed so that a human can clarify its direction.
-Conversely, if all sources of information from the domain space agree on
-the query's label, that information can be used without asking a human,
-incorporating the knowledge of the embedding distances. Lastly, to
-consider the general case of the \(R^d\) space, rather than discussing
-halves of the image, it is essential to discuss half-spaces. Similarly,
-consider the half-space that assigns a label of \(1\) to the query and
-the half-space assigning a label of \(0\). If both half-spaces exist,
-they have conflicting information on the query, making the query
-ambiguous. However, if one of the half-spaces does not exist, it means
-the other is the full space, representing consistency in the label
-assignment and a non-ambiguous query.
-
-It is important to demonstrate that the number of comparisons decreases.
-Specifically, (\citeproc{ref-geo_paper}{G. and Nowak 2011}) shows that
-this algorithm has \(E[M_n(\sigma)] = O(d\log n)\), where \(d\) is the
-dimension of the space and \(d < n\), which improves on the
-\(O(n\log n)\) baseline. The proof can be studied in detail in the paper
-itself, but at a high level, it starts by reasoning about the
-probability of a query being ambiguous and a comparison being requested
-from a human, thus representing
-\(M_n = \sum_{k=1}^{n-1}\sum_{i=1}^{k} 1\{\text{Request } q_{i,k+1}\}\). For
-that, the authors define \(Q(i,j)\), which represents the number of
-different rankings that exist for \(i\) elements in \(j\)-dimensional
-space (e.g., \(Q(1,d) = 1, Q(n,0) = 1, Q(n,1) = n!\)). In that case,
-\(|\Sigma_{n,d}| = Q(n,d)\). Further, using recurrence relations for
-\(Q(i,j)\), the authors derive that
-\(|\Sigma_{n,d}| = Q(n,d) = O(n^{2d})\), which is omitted here.
-Analogously, the authors define \(P(i,j)\), which represents the number
-of rankings in \(\Sigma_{n,d}\) that will still be possible with the
-addition of a new element \(i+1\) to the ranking items. \(P(i,j)\)
-estimates how much of the dark grey area will still exist after making a
-query for \(i+1\). As indicated there, the dotted line ambiguous query
-did not change the dark grey area at all (\(P(n,d) = Q(n,d)\)), whereas
-the dashed non-ambiguous query would cut a piece from it
-(\(P(n,d) < Q(n,d)\)). Thus, \(\Pr(\text{Request } q_{i,k+1}) = P(k,d) / Q(k,d)\), so
-a higher value indicates more possible rankings and an ambiguous query
-that needs to be requested to obtain more useful information. With this
-in mind, the authors derive that \(E[M_n(\sigma)] = O(d\log n)\),
-showing that fewer queries are needed for effective ranking.
-
-The issue with this algorithm is that only one human provides the
-answers to the requested queries, which means it does not account for
-their biases. An alternative approach is a Robust Query Selection
-Algorithm (RQSA) (\citeproc{ref-geo_paper}{G. and Nowak 2011}), which
-uses majority voting for every query to indicate the ground truth of the
-query's label. However, the authors consider that a group of people can
-still give incorrect or divided responses. If the votes for each answer
-are almost equal in number, the authors push that query to the end of
-the algorithm to see if it can become a non-ambiguous query with more
-information learned. If it does not, an odd number of voters is used to
-determine the final ranking.
-
-\begin{longtable}[]{@{}llcc@{}}
-\caption{Statistics for the Robust Query Selection Algorithm (RQSA)
-(\citeproc{ref-geo_paper}{G. and Nowak 2011}) and the baseline of
-conducting all comparisons. \(y\) serves as a noisy ground truth,
-\(\tilde{y}\) is the result of all comparisons, and \(\hat{y}\) is the
-output of the RQSA.}\label{tbl-geo_acc}\tabularnewline
-\toprule\noalign{}
-Dimension & & 2 & 3 \\
-\midrule\noalign{}
-\endfirsthead
-\toprule\noalign{}
-Dimension & & 2 & 3 \\
-\midrule\noalign{}
-\endhead
-\bottomrule\noalign{}
-\endlastfoot
-\% of queries & mean & 14.5 & 18.5 \\
-& std & 5.3 & 6 \\
-Average error & \(d(\tilde{y}, y)\) & 0.23 & 0.21 \\
-& \(d(\hat{y}, y)\) & 0.31 & 0.29 \\
-\end{longtable}
-
-With regard to the accuracy and performance of the method, the authors
-did a ranking experiment on 100 different audio signals, results of
-which can be seen in Table~\ref{tbl-geo_acc}. The ground truth labels
-came from humans, indicated by \(y\) in the table. That resulted in the
-existence of noise and potential errors in the ground truth, which could
-influence the performance of both the baseline algorithm that does all
-comparisons (\(\tilde{y}\)) and the Robust Query Selection Algorithm
-(RQSA) (\(\hat{y}\)). As can be seen, in both 2- and 3-dimensional spaces
-RQSA performed worse by \(8\%\) compared to the baseline, which
-indicates that AL that uses the domain information can still be
-erroneous due to the inference of certain comparisons that sometimes may
-not be entirely correct. However, as can be seen in the upper part of
-Table~\ref{tbl-geo_acc}, significantly fewer queries were requested
-compared to the baseline, which means that the approach can have a
-significant benefit at the cost of a slight loss in accuracy.
-
-\subsubsection*{User Information as Domain Knowledge for Active
-Learning}\label{sec-geo_app}
-\addcontentsline{toc}{subsubsection}{User Information as Domain
-Knowledge for Active Learning}
-
-An alternative source of domain knowledge could be users themselves, who
-can indicate their uncertainty when it comes to comparing two items.
-Prior studies have shown (\citeproc{ref-unnoisy_humans}{Amershi et al.
-2014}) that when presented with only two options when selecting which
-object is better, but not being able to properly decide, users would get
-frustrated and tend to respond less reliably, creating noise and
-incorrect responses in the data. Through feedback and other studies
-(\citeproc{ref-noisy_humans}{Guillory and Bilmes 2011}) it was
-determined that presenting users with an option of indifference between
-the two items can remove those problems. Moreover, in connection to AL,
-the authors show that such an option helps to select more informative
-queries since it provides more domain knowledge that can be used,
-resulting in a decrease in the number of queries required. For this
-problem, the following terms are defined:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\item
-  \(c\) - a cost function that represents user preferences, and the
-  result the model has to determine at the end of training. The
-  preferred items will have lower costs, and less preferred ones will
-  have higher costs. The goal is to determine this function with the
-  fewest possible number of queries using AL.
-\item
-  \(H\) - a set of hypotheses over the possible cost functions, where
-  for each \(h \in H\) there is a cost function \(c_h\) associated with
-  it.
-\item
-  \(h^*\) - a true hypothesis that the model needs to determine, which
-  has cost \(c_{h^*}\) associated with it
-\item
-  \(t(x,y)\) - a test performed to compare items \(x\) and \(y\) (the
-  user is being asked to provide a response to which item is better).
-  Those tests result in changes and adjustments to \(H\) as more
-  information is learned.
-\item
-  \(o(x,y)\) - observation or result of \(t(x,y)\), where
-  \(o(x,y) \in \{x<y, x>y\}\)
-\item
-  \(S = \{(t_1, o_1), (t_2, o_2),...,(t_m, o_m)\}\) - a sequence of
-  \(m\) pairs of tests and observations
-\item
-  \(w(H|S)\) - probability mass of all hypotheses that are still
-  consistent with the observations (similar to the dark grey area and
-  \(Q(i,j)\)). This means that if \(h \in H\) is inconsistent with user
-  responses received, it is removed from \(H\).
-\end{enumerate}
-
-With the key terms defined, let's consider the noiseless base setting
-where users only have two options for response. Those components will
-also later be translated to the setting with the third option so the
-true cost function can be determined there. \(w(H|S)\) is the sum of the
-weights of all hypotheses that are still consistent with the evidence:
-\(w(H|S) = \sum_{h \in H} w(h | S)\). Each \(w(h|S)\) is a probability
-of the evidence's existence given such hypothesis: \(w(h|S) = p(S|h)\).
-Such probability comes from the test-observation pairs since they
-compose the set \(S\). Moreover, each test is independent of other
-tests, which gives \(p(S|h) = \prod_{(t,o) \in S} p((t,o) | h)\). In the
-noiseless setting, users will select an option that minimizes their cost
-function (selecting more preferred items), mathematically defined as:
-\[\begin{aligned}
-    p((t, o = x) | h) = 
-    \begin{cases}
-        1 & c_h(x) < c_h(y)\\
-        0 & else
-    \end{cases}
-\end{aligned}\]
-
-Users are not perfect evaluators. Prior work
-(\citeproc{ref-unnoisy_humans}{Amershi et al. 2014}) has shown that
-treating users as perfect can lead to poor performance. That gave rise
-to accounting for noise in users' responses, but a majority of such work
-applies the same noise to all queries and all responses. While those led
-to great performance results (\citeproc{ref-noisy_humans}{Guillory and
-Bilmes 2011}), they don't accurately reflect the real world, which gave
-rise to the idea of creating query-based noise. Effectively, for some of
-the queries it is important to incorporate the fact that the user is
-unsure and noisy, but for others, if the user is confident, noise in the
-response is not needed at all. For comparison-based learning, this means
-that the noise is related to the costs of the two items compared.
-Specifically for items \(x\) and \(y\), if
-\(c_{h^*}(x) \simeq c_{h^*}(y)\) then the items are hard to distinguish
-for the user, so here it is preferred to incorporate user uncertainty
-and noise. But if \(c_{h^*}(x) \gg c_{h^*}(y)\), the user will certainly
-select \(y\) and the other way around, which is where the noise is not
-needed. Query-dependent noise is also supported in the psychology
-literature, which means that such an approach is more related to the
-real world. In particular, psychologists talk about the Luce-Shepard
-choice rule (\citeproc{ref-lus-shep}{Shepard 1957}) when talking about
-comparisons. This rule previously gave rise to a logistic model based on
-the noise (\citeproc{ref-lus-log}{Viappiani and Boutilier 2010}) where
-the probability of observation for a given test is
-\(p((t, o = x) | h) \propto \exp(-\gamma * c_h(x))\).
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=1\linewidth,height=\textheight,keepaspectratio]{src/Figures/Noiseless probs.png}
-
-}
-
-\caption{\label{fig-noiseless_1}User response model in the noiseless
-setting}
-
-\end{figure}%
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=1\linewidth,height=\textheight,keepaspectratio]{src/Figures/Noise probs.png}
-
-}
-
-\caption{\label{fig-noiseless_2}User response with Luce-Shepard noise
-model}
-
-\end{figure}%
-
-Figure~\ref{fig-noiseless_1} and Figure~\ref{fig-noiseless_2} demonstrate
-the difference between the noiseless setting and incorporating the
-Luce-Shepard choice rule. GBS is the baseline model with only 2
-response options, and CLAUS is the model with the uncertainty option
-added. The figures show how incorporating such noise influences and
-smooths the probability distribution of the user's response.
-
-We will now discuss the functionality of CLAUS, which is an algorithm
-designed by (\citeproc{ref-claus}{Holladay et al. 2016}) that allows
-users to select an uncertain response about the two options that they
-need to rank. The authors model such uncertainty as \(\epsilon\) and it
-is associated with each \(c_h\), so now every hypothesis \(h\) is
-defined over a pair of \((c_h, \epsilon_h)\). It is important to note
-that the goal is to still learn and maintain our objective on \(c\),
-\(\epsilon\) is only necessary to model the users' responses. The
-uncertainty relates to the cost function as
-\(|c_h(x) - c_h(y)| < \epsilon_h\). This means that the user is
-uncertain between items \(x\) and \(y\) and their cost difference is
-negligible such that the user is not able to select which item is
-better. This in turn gives more information about the real value of the
-two items, as a binary response would indicate the user's preference
-towards one item, which will not be real and will skew the cost
-functions. This causes modifications of the problem set-up:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\item
-  For test \(t(x,y)\) the observation will be
-  \(o(x,y) \in \{x<y, x>y, \tilde{xy}\}\), where \(\tilde{xy}\) is the
-  uncertain response.
-\item
-  The probability distribution over the user's response
-  (\textbf{?@eq-prob\_base}) will now be defined as:
-\end{enumerate}
-
-\[\begin{aligned}
-    p((t, o = x) | h) = 
-    \begin{cases}
-        1 & c_h(x) < c_h(y) - \epsilon_h\\
-        0 & else
-    \end{cases}, \quad
-    p((t, o = \tilde{xy}) | h) = 
-    \begin{cases}
-        1 & |c_h(x) - c_h(y)|^2 < \epsilon_h^2\\
-        0 & else
-    \end{cases}
-\end{aligned}\]
-
-This means the user confidently selects \(x\) when its cost is lower
-than that of \(y\) by more than \(\epsilon_h\), but if the squared
-difference of the two items' costs is below \(\epsilon_h^2\), the user
-will choose the indifferent option.
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\setcounter{enumi}{2}
-\tightlist
-\item
-  Finally this also updates the noise model: \[\begin{aligned}
-  &p((t, o = x) | h) \propto \exp(-\gamma * [c_h(x) - c_h(y)]) \\
-  &p((t, o = \tilde{xy}) | h) \propto \exp(-1/\epsilon_h^2 * [c_h(x) - c_h(y)]^2)
-  \end{aligned}\]
-\end{enumerate}
-
-Rather than predicting a specific pair \((c_h, \epsilon_h)\), the
-algorithm focuses on predicting a group of pairs that are similar to one
-another, called an equivalence class (\textbf{?@fig-equiv\_c}), which
-collects hypotheses whose cost functions and uncertainties are not
-essentially different. That information is learned through each new
-test, as the algorithm updates the information about \(c\) and
-\(\epsilon\) that distinguishes between the distinct \(h\), finding the
-equivalence groups among them. Moreover, the authors tweaked the
-parameter responsible for the size of the equivalence class (how many
-hypotheses can be grouped together at a time).
-
-\begin{longtable}[]{@{}lcc@{}}
-\caption{Performance of GBS and CLAUS with different labels for the
-uncertainty}\label{tbl-claus_tab}\tabularnewline
-\toprule\noalign{}
-Category & Accuracy & Query Count \\
-\midrule\noalign{}
-\endfirsthead
-\toprule\noalign{}
-Category & Accuracy & Query Count \\
-\midrule\noalign{}
-\endhead
-\bottomrule\noalign{}
-\endlastfoot
-GBS - About Equal & \(94.15 \pm 0.52\) & \(36.02 \pm 0.03\) \\
-GBS - Not Sure & \(\textbf{94.66} \pm \textbf{0.55}\) &
-\(35.95 \pm 0.04\) \\
-CLAUS - About Equal & \(91.56 \pm 0.84\) &
-\(\textbf{25.93} \pm \textbf{0.41}\) \\
-CLAUS - Not Sure & \(90.86 \pm 0.74\) & \(26.98 \pm 0.47\) \\
-\end{longtable}
-
-The first performance evaluation is done on the number of queries and
-confirms that it decreases. The GBS model serves as the baseline, as it
-will do all of the comparison queries using the binary response options.
-The CLAUS model is measured over different values of \(\epsilon\) on the
-x-axis and over different sizes of the equivalence sets indicated by
-different shades of blue. The figure shows that all variants of CLAUS use
-approximately 10 fewer queries on average compared to GBS. Moreover,
-using bigger-sized equivalence classes can further decrease the number
-of needed queries. The optimal value is \(\epsilon \simeq 0.07\), after
-which higher \(\epsilon\) does not provide any benefit.
-
-Lastly, the authors considered the performance difference, which is
-indicated in Table~\ref{tbl-claus_tab}. For this, the authors used two
-different labels for the uncertainty button in CLAUS: it was labeled
-either ``About Equal'' or ``Not Sure'', as those can provoke
-different responses and feelings in users. Moreover, GBS and CLAUS-type
-responses were mixed in the same set of questions to the user, which
-splits the metrics for both in two as can be seen in
-Table~\ref{tbl-claus_tab}. The accuracy of CLAUS is lower by about \(3\%\)
-on average, showing that a smaller number of queries can still lead to a
-performance loss. However, the second column of
-Table~\ref{tbl-claus_tab} supports the earlier finding, as it also shows
-that about 10 fewer queries were conducted on average.
-
-AL can be essential in learning within dynamic systems and environments.
-Say we have an agent in an environment, and we want it to conform to a
-certain behavior as set by a human. How exactly do we go about doing
-this? In a traditional RL setting, this is solved by a class of
-algorithms under Inverse Reinforcement Learning. Techniques such as VICE
-and GAIL attempt to learn a reward function that can distinguish between
-states visited by the agent and states desired to be visited as defined
-by a human. In effect, a human will demonstrate what it would like the
-agent to do in the environment, and from there, learning is done.
-However, what if humans do not precisely know how an agent should
-optimally behave in an environment but still have some opinion on what
-trajectories would be better than others? This is where a paper like
-Active Preference-Based Learning of Reward Functions comes into the
-picture. The paper aims to use human preferences to aid an agent's
-learning within a dynamic system.
-
-A dynamic system contains human input, robotic input, and an environment
-state. The transition between states is defined by \(f_{HR}\), so that
-we have \(x^{t+1} = f_{HR}(x^t, u_R, u_H)\). At a given time step \(t\),
-we have \(x^t\), \(u_R^t\), and \(u_H^t\). This can be encapsulated into
-a single \(d\) dimensional feature vector that the authors denote as
-\(\phi\). The paper then assumes that the underlying reward model we are
-trying to learn can be represented linearly. If we have our human reward
-preference function defined as \(r_H\), this means we can write \(r_H\)
-as \(r_H(x^t, u_R^t, u_H^t) = w^{\intercal}\phi(x^t, u_R^t, u_H^t)\).
-Because the reward function is linear, we can take the weight vector out
-of the summation if we want to calculate the reward over an entire
-trajectory:
-
-\[R_{H}(x^0, u_R, u_H) = \sum_{t=0}^{N} r_{H}(x^t, u_R^t, u_H^t), \quad \Phi = \sum_{t=0}^{N} \phi(x^t, u_R^t, u_H^t), \quad R_H(traj) = w\cdot\Phi(traj)\]
-
-First, the scale of \(w\) does not matter because we only care about the
-relative rewards produced with \(w\) (given two different trajectories,
-we want to answer the question of which trajectory a human would prefer,
-i.e.~which one has a higher preference reward). This means we can
-constrain \(\|w\| \leq 1\), so the initial prior is uniform over a unit
-ball. From here, we can determine a probabilistic expression to assess
-whether we should prefer trajectory A or B (because it can be noisy with
-human input). Let \(I_t = +1\) if the human prefers trajectory \(A\)
-and \(I_t = -1\) otherwise. According to the Bradley-Terry model,
-\(p(A \succ B|w) = \sigma(R_H(traj_A) - R_H(traj_B))\). Let
-\(\psi = \Phi(traj_A) - \Phi(traj_B)\). Then \(f_{\psi}(w) = p(I_t|w) = \sigma(I_t w^{\intercal}\psi)\).
-We can update \(p(w)\) every time we get a result from a human preference
-query using Bayes' rule, \(p(w|I_t) \propto p(w) \cdot p(I_t|w)\), sampled
-via Markov chain Monte Carlo. This paper synthetically generates queries
-through an optimization process and then presents them to a human to
-pick between. The idea is that we want to generate a query that
-maximizes the conditional entropy \(H(I|w)\). We want to pick a query
-that we are most uncertain about given our current weights (thus having
-the highest conditional entropy given the weights):
-\[\max_{x^0, u_R, u_H^A, u_H^B} \min\{\mathbb{E}[1-f_{\psi}(w)], \mathbb{E}[1 - f_{-\psi}(w)]\}\]
-
-To do so, we sample \(w_1, \ldots, w_M\) from \(p(w)\), approximating the
-distribution as \(p(w) \approx \frac{1}{M} \sum_{i=1}^{M} \delta(w_i)\). We
-can now approximate the expectation expression as
-\(\mathbb{E}[1 - f_{\psi}(w)] \approx \frac{1}{M} \sum_{i=1}^{M} (1 - f_{\psi}(w_i))\), and now
-we can optimize the expression to generate a synthetic query. The
-algorithm itself works well; however, there ends up being a bottleneck:
-each query needs to be synthesized before being sent to the human
--- one at a time. There is no room for parallelization, so the
-authors proposed a second algorithm in a separate paper that allows for
-the batching of queries:
-
-\[\max_{\xi_{ib+1_A}, \xi_{ib+1_B}, ... , \xi_{ib+b_A}, \xi_{ib+b_B}} \mathbb{H}(I_{ib+1}, I_{ib+2}, .., I_{ib+b} | w)\]
-
-We could consider optimizing this in the greedy fashion. This would mean
-just synthetically generating \(b\) independent queries. The drawback of
-this method would be that the queries would likely be very similar to
-each other. The authors propose a few other heuristics that would help
-guide the algorithm away from generating very similar queries, such as
-medoid selection, where we cluster \(B\) greedy vectors into
-\(b < B\) groups and pick one vector from each group (the medoid). The
-authors also propose two other methods rooted in providing different
-queries: boundary medoids selection and successive elimination. The
-authors test both the non-batched and a variety of batched learning
-algorithms on multiple environments. When graphed over \(N\), the
-non-batched AL approach performs in the same ballpark as the
-batched approaches. However, measured over time, we see that learning is
-a much slower process when not batched.
-
-\section{Case Study 2: Performance Metric
-Elicitation}\label{sec-metric-elicitation}
-
-In binary classification problems, selecting an appropriate performance
-metric that aligns with the real-world task is crucial. The problem of
-\emph{metric elicitation} aims to characterize and discover the
-performance metric of a practitioner, reflecting the rewards or costs
-associated with correct or incorrect classification. For instance, in
-medical contexts such as diagnosing a disease or determining the
-appropriateness of a treatment, trade-offs are made for incorrect
-decisions. Not administering a treatment could lead to the worsening of
-a disease (a false negative), whereas delivering the wrong treatment
-could cause adverse side effects worse than not treating the condition
-(a false positive). Rather than choosing from a limited set of default
-choices like the F1-score or weighted accuracy, metric elicitation
-considers the process of devising a metric that best matches the
-preferences of practitioners or users. This is achieved by querying an
-``oracle'' who provides feedback on proposed potential metrics through
-pairwise comparisons. Since queries to humans are often expensive, the
-goal is to minimize the number of comparisons needed.
-
-The motivation for the pairwise comparison aspect of metric elicitation
-(\citeproc{ref-pmlr-v89-hiranandani19a}{Hiranandani et al. 2019a}) stems
-from a rich history of literature in psychology, economics, and computer
-science (\citeproc{ref-pref1}{Samuelson 1938};
-\citeproc{ref-pref2}{Mas-Colell 1977}; \citeproc{ref-pref3}{Varian
-2006}; \citeproc{ref-pref4}{Braziunas and Boutilier 2012};
-\citeproc{ref-ab}{Tamburrelli and Margara 2014}), demonstrating that
-humans are often ineffective at providing absolute feedback on aspects
-such as potential prices, user interfaces, or even ML model outputs
-(hence the comparison-based structure of RLHF, for instance).
-Additionally, confusion matrices accurately capture binary metrics such
-as accuracy, \(F_\beta\), and Jaccard similarity by recording the number
-of false positives, true positives, false negatives, and true negatives
-obtained by a classifier. The main goal of this chapter is to introduce
-two binary-search procedures that can approximate the oracle's
-performance metric for two types of metrics (linear and
-linear-fractional performance metrics) by presenting the oracle with
-confusion matrices generated by various classifiers. Essentially, we are
-learning an optimal threshold for classification given a decision
-boundary for a binary classification problem.
-
-First, we introduce some relevant notation that will later be used to
-formalize notions of oracle queries, classifiers, and metrics. In this
-context, \(X \in \mathcal{X}\) represents an input random variable,
-while \(Y \in \{0, 1\}\) denotes the output random variable. We learn
-from a dataset of size \(n\), denoted by \(\{(x, y)_i\}^n_{i=1}\), which
-is generated independently and identically distributed (i.i.d.) from
-some distribution \(\mathbb{P}(X, Y)\). The conditional probability of
-the positive class, given some sample \(x\), is denoted by
-\(\eta(x) = \mathbb{P}(Y=1 | X=x)\). The marginal probability of
-the positive class is represented by \(\zeta = \mathbb{P}(Y=1)\). The
-set of all potential classifiers is
-\(\mathcal{H} = \{h : \mathcal{X} \rightarrow \{0,1\}\}\). The confusion
-matrix for a classifier \(h\) is
-\(C(h, \mathbb{P}) \in \mathbb{R}^{2 \times 2}\), where
-\(C_{ij}(h, \mathbb{P}) = \mathbb{P}(Y=i, h=j)\) for
-\(i, j \in \{0,1\}\). These entries represent the false positives, true
-positives, false negatives, and true negatives, ensuring that
-\(\sum_{i,j}C_{ij}=1\). The set of all confusion matrices is denoted by
-\(\mathcal{C}\). Since \(FN(h, \mathbb{P}) = \zeta - TP(h, \mathbb{P})\)
-and \(FP(h, \mathbb{P}) = 1 - \zeta - TN(h, \mathbb{P})\),
-\(\mathcal{C}\) is actually a 2-dimensional space, not a 4-dimensional
-space.
-
-Any hyperplane in the \((tp, tn)\) space is given by
-\(\ell := a \cdot tp + b \cdot tn = c\), where
-\(a, b, c \in \mathbb{R}\). Given a classifier \(h\), we define a
-performance metric
-\(\phi : [0, 1]^{2 \times 2} \rightarrow \mathbb{R}\). The value
-\(\phi(C(h))\), which represents the performance of a classifier with
-respect to a certain metric, is referred to as the \emph{utility} of the
-classifier \(h\). We assume, without loss of generality, that a higher
-value of \(\phi\) indicates better performance of \(h\). Our
-focus is to recover some metric \(\phi\) using comparisons between
-confusion matrices \(C(h)\), determined by classifiers \(h\), which
-approximates the oracle's ``ground-truth'' metric \(\phi^*\). Next, we
-introduce two classes of performance metrics---\emph{Linear Performance
-Metrics (LPM)} and \emph{Linear-Fractional Performance Metrics
-(LFPM)}---for which we will present two elicitation algorithms.
-
-An LPM, given constants
-\(\{a_{11}, a_{01}, a_{10}, a_{00}\} \in \mathbb{R}^{4}\), is defined as
-\(\phi(C) = a_{11} TP + a_{01} FP + a_{10} FN + a_{00} TN = m_{11} TP + m_{00} TN + m_{0}\),
-where \(m_{11} = (a_{11} - a_{10})\), \(m_{00} = (a_{00} - a_{01})\),
-and \(m_{0} = a_{10} \zeta + a_{01} (1 - \zeta)\). This
-reparametrization simplifies the metric by reducing dimensionality,
-making it more tractable for elicitation. One example of an LPM is
-\emph{weighted accuracy}, defined as \(WA = w_1TP + w_2TN\), where
-adjusting \(w_1\) and \(w_2\) controls the relative importance of
-different types of misclassification. An LFPM, defined by constants
-\(\{a_{11}, a_{01}, a_{10}, a_{00}, b_{11}, b_{01}, b_{10}, b_{00}\} \in \mathbb{R}^{8}\),
-is given by:
-\[\phi(C) = \frac{a_{11} TP + a_{01} FP + a_{10} FN + a_{00} TN}{b_{11} TP + b_{01} FP + b_{10} FN + b_{00} TN} = \frac{p_{11} TP + p_{00} TN + p_{0}}{q_{11} TP + q_{00} TN + q_{0}},\]
-where \(p_{11} = (a_{11} - a_{10})\), \(p_{00} = (a_{00} - a_{01})\),
-\(q_{11} = (b_{11} - b_{10})\), \(q_{00} = (b_{00} - b_{01})\),
-\(p_{0} = a_{10} \zeta + a_{01} (1 - \zeta)\), and
-\(q_{0} = b_{10} \zeta + b_{01} (1 - \zeta)\). This parametrization also
-simplifies the elicitation process by reducing the number of variables.
-Common LFPMs include the \(F_\beta\) score and Jaccard similarity,
-defined as:
-
-\begin{equation}\phantomsection\label{eq-lfpm_metrics}{F_{\beta} = \frac{TP}{\frac{TP}{1+\beta^{2}} - \frac{TN}{1+\beta^{2}} + \frac{\beta^{2} \zeta + 1 - \zeta}{1+\beta^{2}}}, \quad JAC = \frac{TP}{1 - TN}.}\end{equation}
-
-Setting \(\beta = 1\) gives the F1 score, which is widely used as a
-classification metric. Since we are considering all possible metrics in
-the LPM and LFPM families, we need to make certain assumptions about
-\(\mathcal{C}\). Particularly, we will assume that
-\(g(t) = \mathbb{P}[\eta(X) \geq t]\) is continuous and strictly
-decreasing for \(t \in [0, 1]\); essentially, the event
-\(\eta(X) = t\) has positive density but zero probability mass.
-
-Additionally, \(\mathcal{C}\) is convex, closed, and contained within
-the rectangle \([0, \zeta] \times [0, 1-\zeta]\), and is rotationally
-symmetric around its center, \((\frac{\zeta}{2}, \frac{1-\zeta}{2})\),
-where the axes represent the proportion of true positives and negatives.
-The only vertices of \(\mathcal{C}\) are \((0, 1-\zeta)\) and
-\((\zeta, 0)\), corresponding to predicting all \(0\)'s or all \(1\)'s
-on a given dataset. Therefore, \(\mathcal{C}\) is strictly convex, and
-any line tangent to it is tangent at exactly one point, corresponding to
-one particular confusion matrix. Next, recall that an LPM is represented
-in terms of three parameters (\(\phi = m_{11}TP + m_{00}TN + m_0\)). We
-have just seen that this LPM and its corresponding confusion matrix
-correspond to a certain point on the boundary of \(\mathcal{C}\). We
-first note that this point is independent of \(m_0\). Additionally, we
-only care about the relative weightings of \(m_{11}\) and \(m_{00}\),
-not their actual values---they are scale invariant. Therefore, we can
-parametrize the space of LPMs as
-\(\varphi_{LPM} = \{\mathbf{m} = (\cos \theta, \sin \theta) : \theta \in [0, 2\pi]\}\),
-where \(\cos \theta\) corresponds to \(m_{00}\) and \(\sin \theta\)
-corresponds to \(m_{11}\). As we already know, we can recover the Bayes
-classifier given \(\mathbf{m}\), and it is unique, corresponding to one
-point on the boundary of \(\mathcal{C}\) due to its convexity. The
-supporting hyperplane at this point is defined as
-\(\bar{\ell}_{\mathbf{m}} := m_{11} \cdot tp + m_{00} \cdot tn = m_{11} \overline{TP}_{\mathbf{m}} + m_{00} \overline{TN}_{\mathbf{m}}\).
-We note that if \(m_{00}\) and \(m_{11}\) have opposite signs, then
-\(\bar{h}_m\) is the trivial classifier predicting all 1's or all 0's,
-since either predicting true positives or true negatives results in
-negative reward. This corresponds to a supporting hyperplane with a
-positive slope, so it can only be tangent at the vertices. Additionally,
-the boundary \(\partial \mathcal{C}\) can be split into upper and lower
-boundaries (\(\partial \mathcal{C}_{+}, \partial \mathcal{C}_{-}\)),
-corresponding to \(\theta \in (0, \pi/2)\) and
-\(\theta \in (\pi, 3\pi/2)\) respectively (and whether
-\(m_{00}, m_{11}\) are positive or negative). We also define the notions
-of Bayes optimal and inverse-optimal classifiers. Given a performance
-metric \(\phi\), we define:
-
-\begin{itemize}
-\tightlist
-\item
-  The \emph{Bayes utility} as
-  \(\bar{\tau} := \sup_{h \in \mathcal{H}} \phi(C(h)) = \sup_{C \in \mathcal{C}} \phi(C)\);
-  this is the highest achievable utility (using the metric \(\phi\))
-  over all classifiers \(h \in \mathcal{H}\) for a given problem.
-\item
-  The \emph{Bayes classifier} as
-  \(\bar{h} := \arg \max_{h \in \mathcal{H}} \phi(C(h))\); this is the
-  classifier \(h\) corresponding to the Bayes utility.
-\item
-  The \emph{Bayes confusion matrix} as
-  \(\bar{C} := \arg \max_{C \in \mathcal{C}} \phi(C)\); this is the
-  confusion matrix corresponding to the Bayes utility and classifier.
-\end{itemize}
-
-Similarly, the inverse Bayes utility, classifier, and confusion matrix
-can be defined by replacing ``\(\sup\)'' with ``\(\inf\)''; they
-represent the classifier and confusion matrix corresponding to the lower
-bound on utility for a given problem. We also have the following useful
-proposition:
-
-\begin{tcolorbox}[colframe=.grey, title=\faPenSquare \enspace Proposition]
-
-\begin{proposition}[]\protect\hypertarget{prp-prp3.1}{}\label{prp-prp3.1}
-
-Let \(\phi \in \varphi_{LPM}\). Then
-
-\begin{equation}\phantomsection\label{eq-eq3.46}{\bar{h}(x) = \left\{\begin{array}{lr}
-\mathbbm{1}\left[\eta(x) \geq \frac{m_{00}}{m_{11} + m_{00}}\right], & m_{11} + m_{00} \geq 0 \\
-\mathbbm{1}\left[\frac{m_{00}}{m_{11} + m_{00}} \geq \eta(x)\right], & \text { o.w. }
-\end{array}\right\}}\end{equation}
-
-is a Bayes optimal classifier with respect to \(\phi\). The inverse
-Bayes classifier is given by \(\underline{h} = 1 - \bar{h}\).
-
-\end{proposition}
-
-\end{tcolorbox}
-
-This is a simple derivation based on the fact that we only get rewards
-from true positives and true negatives. Essentially, if we recover an
-LPM, we can use it to determine the best-performing classifier, obtained
-by placing a threshold on the conditional probability of a given sample,
-that corresponds to a confusion matrix. Therefore, the three notions of
-Bayes utility, classifier, and confusion matrix are functionally
-equivalent in our setting.
-
-We will now formalize the problem of metric elicitation. Given two
-classifiers \(h\) and \(h'\) (or equivalently, two confusion matrices
-\(C\) and \(C'\)), we define an \emph{oracle query} as the function:
-
-\begin{equation}\phantomsection\label{eq-oracle}{\Gamma\left(h, h^{\prime}\right)=\Omega\left(C, C^{\prime}\right)=\mathbbm{1}\left[\phi(C)>\phi\left(C^{\prime}\right)\right]=: \mathbbm{1} \left[C \succ C^{\prime}\right],}\end{equation}
-
-which represents the classifier preferred by the practitioner. We can
-then define the metric elicitation problem for populations:
-
-\begin{tcolorbox}[colframe=.grey, title=\faPenSquare \enspace Definition]
-
-\begin{definition}[]\protect\hypertarget{def-def3.1}{}\label{def-def3.1}
-
-Suppose the true (oracle) performance metric is \(\phi\). The goal is to
-recover a metric \(\hat{\phi}\) by querying the oracle for as few
-pairwise comparisons of the form \(\Omega\left(C, C^{\prime}\right)\) as
-possible, so that \(\|\phi - \hat{\phi}\|_{--} < \kappa\) for a sufficiently
-small \(\kappa > 0\) and for any suitable norm \(\|\cdot\|_{--}\).
-
-\end{definition}
-
-\end{tcolorbox}
-
-In practice, we do not have access to the true probability distribution
-or the population, which would provide the true values of \(C\) and
-\(C'\). However, we can subtly alter this problem description to use
-\(\hat{C}\) and \(\hat{C}^{\prime}\), which are derived from our dataset
-of \(n\) samples:
-
-\begin{tcolorbox}[colframe=.grey, title=\faPenSquare \enspace Definition]
-
-\begin{definition}[]\protect\hypertarget{def-def3.2}{}\label{def-def3.2}
-
-Suppose the true (oracle) performance metric is \(\phi\). The aim is to
-recover a metric \(\hat{\phi}\) by querying the oracle for as few
-pairwise comparisons of the form
-\(\Omega\left(\hat{C}, \hat{C}^{\prime}\right)\) as possible, so that
-\(\|\phi - \hat{\phi}\|_{--} < \kappa\) for a sufficiently small
-\(\kappa > 0\) and for any suitable norm \(\|\cdot\|_{--}\).
-
-\end{definition}
-
-\end{tcolorbox}
-
-As is common in theoretical ML research, we solve the population problem
-and then consider ways to extend this to practical settings where we
-only have limited datasets of samples. In our case, this corresponds to
-calculating the confusion matrices from a portion of the dataset we have
-access to.
-
-\subsection{Linear Performance Metric Elicitation}\label{sec-orgb6dac4e}
-
-For LPM elicitation, we need one more proposition.
-
-\begin{tcolorbox}[colframe=.grey, title=\faPenSquare \enspace Proposition]
-
-\begin{proposition}[]\protect\hypertarget{prp-prp3.2}{}\label{prp-prp3.2}
-
-For a metric \(\psi\) (quasiconvex and monotone increasing in TP/TN) or
-\(\phi\) (quasiconcave and monotone increasing), and parametrization
-\(\rho^+\)/\(\rho^-\) of upper/lower boundary, composition
-\(\psi \circ \rho^-\) is quasiconvex and unimodal on {[}0, 1{]}, and
-\(\phi \circ \rho^+\) is quasiconcave and unimodal on {[}0, 1{]}.
-
-\end{proposition}
-
-\end{tcolorbox}
-
-Quasiconcavity and quasiconvexity are slightly more general variations
-on concavity and convexity. Their main useful property in our setting is
-that they are unimodal (they have a single extremum), so we can devise
-a binary-search-style algorithm for eliciting the Bayes optimal and
-inverse-optimal confusion matrices for a given setting, as well as the
-corresponding \(\phi\)'s. We first note that when maximizing a
-quasiconcave metric \(\phi\) that is monotonically increasing in \(TP\)
-and \(TN\), the resulting maximizer (and supporting hyperplane)
-will occur on the upper boundary of \(\mathcal{C}\). We thus set our
-initial search range to be \([0, \pi/2]\) and repeatedly divide it into
-four regions. Then, we calculate the resulting confusion matrix on the 5
-resulting boundaries of these regions and query the oracle \(4\) times.
-We repeat this in each iteration of the binary search until a maximizer
-is found.
-
-\begin{tcolorbox}[colframe=.grey, title=\faQuestion \enspace Remark]
-
-\begin{refremark}
-In the case of quasiconcave and quasiconvex search ranges, a slightly
-more sophisticated variation on typical binary search must be used. To
-illustrate this, consider the two distributions in
-Figure~\ref{fig-bsearch}:
-
-\begin{figure}[H]
-
-\begin{minipage}{0.50\linewidth}
-
-\includegraphics[width=0.45\linewidth,height=\textheight,keepaspectratio]{src/Figures/normaldistribution.png}
-
-\end{minipage}%
-
-\end{figure}%
-
-For both the symmetric and skewed distributions, if we were to divide
-the search range into two portions and compare \(A\), \(C\), and \(E\),
-we would find that \(C > A\) and \(C > E\). In both cases, this does not
-help us reduce our search range, since the true maximum could lie on
-either of the two intervals (as in the second case), or at \(C\) itself
-(as in the first case). Therefore, we must make comparisons between all
-five points \(A\), \(B\), \(C\), \(D\), and \(E\). This allows us to correctly restrict
-our search range to \([B, D]\) in the first case and \([C, E]\) in the
-second. These extra search requirements are due to the quasiconcavity of
-the search space we are considering, in which there exists a maximum but
-we need to make several comparisons at various points throughout the
-search space to be able to reduce its size in each iteration.
-
-\label{rem-explaination_binary_search}
-
-\end{refremark}
-
-\end{tcolorbox}
-
-\begin{algorithm}[H]
-    \caption{Quasiconcave Metric Maximization}
-    \label{alg-lpm}
-\begin{algorithmic}[1]
-        \State \textbf{input:} $\epsilon > 0$ and oracle $\Omega$
-        \State \textbf{initialize:} $\theta_a = 0, \theta_b = \frac{\pi}{2}$
-        \While{$|\theta_b - \theta_a| > \epsilon$}
-            \State set $\theta_c = \frac{3\theta_a+\theta_b}{4}$, $\theta_d = \frac{\theta_a+\theta_b}{2}$, and $\theta_e = \frac{\theta_a+3\theta_b}{4}$
-            
-            \State obtain $h_{\theta_a}, h_{\theta_c}, h_{\theta_d}, h_{\theta_e}, h_{\theta_b}$ using Proposition 1
-            
-            \State Compute $C_{\theta_a}, C_{\theta_c}, C_{\theta_d}, C_{\theta_e}, C_{\theta_b}$ using (1)
-            
-            \State Query $\Omega(C_{\theta_c}, C_{\theta_a}), \Omega(C_{\theta_d}, C_{\theta_c}), \Omega(C_{\theta_e}, C_{\theta_d})$, and $\Omega(C_{\theta_b}, C_{\theta_e})$
-
-            \If{$q_{i,j}$ is ambiguous}
-                \State request $q_{i,j}$'s label from reference
-            \Else
-                \State impute $q_{i,j}$'s label from previously labeled queries
-            \EndIf
-            
-            \If{$C_{\theta'} \succ C_{\theta''} \succ C_{\theta'''}$ for consecutive $\theta < \theta' < \theta''$}
-                \State assume the default order $C_{\theta} \prec C_{\theta'} \prec C_{\theta''}$
-            \EndIf
-            
-            \If{$C_{\theta_a} \succ C_{\theta_c}$} 
-                \State Set $\theta_b = \theta_d$ 
-            \ElsIf{$C_{\theta_a} \prec C_{\theta_c} \succ C_{\theta_d}$} 
-                \State Set $\theta_b = \theta_d$ 
-            \ElsIf{$C_{\theta_c} \prec C_{\theta_d} \succ C_{\theta_e}$} 
-                \State Set $\theta_a = \theta_c$ 
-                \State Set $\theta_b = \theta_e$ 
-            \ElsIf{$C_{\theta_d} \prec C_{\theta_e} \succ C_{\theta_b}$} 
-                \State Set $\theta_a = \theta_d$ 
-            \Else 
-                \State Set $\theta_a = \theta_d$ 
-            \EndIf
-        \EndWhile
-        \State \textbf{output:} $\vec{m}, C$, and $\vec{l}$, where $\vec{m} = m_l(\theta_d), C = C_{\theta_d}$, and $\vec{l} := (\vec{m}, (tp, tn)) = (\vec{m}, C)$
-    \end{algorithmic}
-\end{algorithm}
-
-To elicit LPMs, we run Algorithm~\ref{alg-lpm}, querying the oracle in
-each iteration, and set the elicited metric \(\hat{m}\) (which is the
-maximizer on \(\mathcal{C}\)) to be the slope of the resulting
-hyperplane, since the metric is linear.
-
-\begin{tcolorbox}[colframe=.grey, title=\faQuestion \enspace Remark]
-
-\begin{refremark}
-To find the minimum of a quasiconvex metric, we flip all instances of
-\(\prec\) and \(\succ\), and use an initial search range of
-\([\pi, 3\pi/2]\); we use this algorithm, which we refer to as
-Algorithm~\ref{alg-lfpm}, in our elicitation of LFPMs.
-
-\label{rem-explaination_lpm}
-
-\end{refremark}
-
-\end{tcolorbox}
-
-Next, we provide a Python implementation of Algorithm~\ref{alg-lpm}.
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\KeywordTok{def}\NormalTok{ get\_m(theta):}
-    \CommentTok{"""}
-\CommentTok{    Inputs: }
-\CommentTok{    {-} theta: the value that parametrizes m}
-\CommentTok{    Outputs:}
-\CommentTok{    {-} m\_0 and m\_1 for the LPM}
-\CommentTok{    """}
-
-    \ControlFlowTok{return}\NormalTok{ (math.cos(theta), math.sin(theta))}
-
-\KeywordTok{def}\NormalTok{ lpm\_elicitation(epsilon, oracle):}
-    \CommentTok{"""}
-\CommentTok{    Inputs:}
-\CommentTok{    {-} epsilon: some epsilon \textgreater{} 0 representing threshold of error}
-\CommentTok{    {-} oracle: some function that accepts 2 confusion matrices and}
-\CommentTok{        returns true if the first is preferred and false otherwise}
-\CommentTok{    Outputs:}
-\CommentTok{    {-} estimate for m, which is used to compute the LPM as described above}
-\CommentTok{    """}
-
-\NormalTok{    a }\OperatorTok{=} \DecValTok{0}
-\NormalTok{    b }\OperatorTok{=}\NormalTok{ math.pi}\OperatorTok{/}\DecValTok{2}
-    \ControlFlowTok{while}\NormalTok{ (b }\OperatorTok{{-}}\NormalTok{ a }\OperatorTok{\textgreater{}}\NormalTok{ epsilon):}
-\NormalTok{        c }\OperatorTok{=}\NormalTok{ (}\DecValTok{3} \OperatorTok{*}\NormalTok{ a }\OperatorTok{+}\NormalTok{ b) }\OperatorTok{/} \DecValTok{4}
-\NormalTok{        d }\OperatorTok{=}\NormalTok{ (a }\OperatorTok{+}\NormalTok{ b) }\OperatorTok{/} \DecValTok{2}
-\NormalTok{        e }\OperatorTok{=}\NormalTok{ (a }\OperatorTok{+} \DecValTok{3} \OperatorTok{*}\NormalTok{ b) }\OperatorTok{/} \DecValTok{4}
-
-\NormalTok{        m\_a, m\_b, m\_c, m\_d, m\_e }\OperatorTok{=}\NormalTok{ (get\_m(x) }\ControlFlowTok{for}\NormalTok{ x }\KeywordTok{in}\NormalTok{ [a,b,c,d,e]) }\CommentTok{\# using definition of m}
-\NormalTok{        c\_a, c\_b, c\_c, c\_d, c\_e }\OperatorTok{=}\NormalTok{ (get\_c(x) }\ControlFlowTok{for}\NormalTok{ x }\KeywordTok{in}\NormalTok{ [m\_a, m\_b, m\_c, m\_d, m\_e]) }\CommentTok{\# compute classifier from m\textquotesingle{}s then calculate confusion matrices}
-        
-\NormalTok{        response\_ac }\OperatorTok{=}\NormalTok{ oracle(c\_a, c\_c)}
-\NormalTok{        response\_cd }\OperatorTok{=}\NormalTok{ oracle(c\_c, c\_d)}
-\NormalTok{        response\_de }\OperatorTok{=}\NormalTok{ oracle(c\_d, c\_e)}
-\NormalTok{        response\_eb }\OperatorTok{=}\NormalTok{ oracle(c\_e, c\_b)}
-
-        \CommentTok{\# update ranges to keep the peak}
-        \ControlFlowTok{if}\NormalTok{ response\_ac:}
-\NormalTok{            b }\OperatorTok{=}\NormalTok{ d}
-        \ControlFlowTok{elif}\NormalTok{ response\_cd:}
-\NormalTok{            b }\OperatorTok{=}\NormalTok{ d}
-        \ControlFlowTok{elif}\NormalTok{ response\_de:}
-\NormalTok{            a }\OperatorTok{=}\NormalTok{ c}
-\NormalTok{            b }\OperatorTok{=}\NormalTok{ e}
-        \ControlFlowTok{elif}\NormalTok{ response\_eb:}
-\NormalTok{            a }\OperatorTok{=}\NormalTok{ d}
-        \ControlFlowTok{else}\NormalTok{:}
-\NormalTok{            a }\OperatorTok{=}\NormalTok{ d}
-    \ControlFlowTok{return}\NormalTok{ get\_m(d), get\_c(d)}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\subsection{Linear-Fractional Performance Metric
-Elicitation}\label{sec-lfpm-elicitation}
-
-Now, we present the next main result, which is an algorithm to elicit
-linear-fractional performance metrics. For this task, we will need the
-following assumption: Let \(\phi \in \varphi_{L F P M}\). We assume
-\(p_{11}, p_{00} \geq 0, p_{11} \geq q_{11}, p_{00} \geq q_{00},\)
-\(p_{0}=0, q_{0}=\)
-\(\left(p_{11}-q_{11}\right) \zeta+\left(p_{00}-q_{00}\right)(1-\zeta)\),
-and \(p_{11}+p_{00}=1\).
-
-These assumptions guarantee that the LFPM \(\phi\) which we are trying
-to elicit is monotonically increasing in \(TP\) and \(TN\), just as in
-the LPM elicitation case. We first provide motivation and an overview of
-the approach for LFPM elicitation and then present pseudocode for the
-algorithm.
-
-The general idea of the algorithm is to use  Algorithm~\ref{alg-lpm}  to
-obtain a maximizer and a minimizer for the given dataset; these result
-in two systems of equations involving the true LFPM \(\phi^*\) with 1
-degree of freedom. Then, we run a grid search that is independent of
-oracle queries to find the point where solutions to the systems match
-pointwise on the resulting confusion matrices; this occurs close to
-where the true metric lies.
-
-More formally, suppose that the true metric is
-\begin{equation}\phantomsection\label{eq-eq3.48}{\phi^{*}(C)=\frac{p_{11}^{*} T P+p_{00}^{*} T N}{q_{11}^{*} T P+q_{00}^{*} T N+q_{0}^{*}}.}\end{equation}
-Then, let \(\bar{\tau}^{*}\) and \(\underline{\tau}^{*}\) represent the
-maximum and minimum values of \(\phi^{*}\) over \(\mathcal{C}\), respectively.
-There exists a hyperplane \[\begin{aligned}
-\bar{\ell}_{f}^{*}:=\left(p_{11}^{*}-\bar{\tau}^{*} q_{11}^{*}\right) t p+\left(p_{00}^{*}-\bar{\tau}^{*} q_{00}^{*}\right) t n=\bar{\tau}^{*} q_{0}^{*},
-\end{aligned}\] which touches \(\mathcal{C}\) at
-\(\left(\overline{T P}^{*}, \overline{T N}^{*}\right)\) on
-\(\partial \mathcal{C}_{+}\). Correspondingly, there also exists a
-hyperplane
-\(\underline{\ell}_{f}^{*}:=\left(p_{11}^{*}-\underline{\tau}^{*} q_{11}^{*}\right) t p+\left(p_{00}^{*}-\underline{\tau}^{*} q_{00}^{*}\right) \operatorname{tn}=\underline{\tau}^{*} q_{0}^{*}\),
-which touches \(\mathcal{C}\) at
-\(\left(\underline{TP}^{*}, \underline{T N}^{*}\right)\) on
-\(\partial \mathcal{C}_{-}\). While we are unable to obtain
-\(\bar{\ell}_{f}^{*}\) and \(\underline{\ell}_{f}^{*}\) directly, we can
-use Algorithm~\ref{alg-lpm} to get a hyperplane
-\begin{equation}\phantomsection\label{eq-eq3.51}{\bar{\ell}:=\bar{m}_{11} t p+\bar{m}_{00} t n= \bar{m}_{11} \overline{T P}^{*}+\bar{m}_{00} \overline{T N}^{*} = \bar{C}_{0},}\end{equation}
-which is equivalent to the hyperplane \(\bar{\ell}_{f}^{*}\) defined above
-up to a constant multiple. From here, we can obtain the system of
-equations
-
-\begin{equation}\phantomsection\label{eq-eq3.52}{p_{11}^{*}-\bar{\tau}^{*} q_{11}^{*}=\alpha \bar{m}_{11}, p_{00}^{*}-\bar{\tau}^{*} q_{00}^{*}=\alpha \bar{m}_{00}, \bar{\tau}^{*} q_{0}^{*}=\alpha \bar{C}_{0},}\end{equation}
-where \(\alpha > 0\) (we know it is \(\geq0\) due to our assumptions
-earlier and because \(\bar{m}\) is positive, but if it is equal to \(0\)
-then \(\phi^*\) would be constant). Dividing through by \(\alpha\) and
-writing \(p^{\prime} = p^{*}/\alpha\) and \(q^{\prime} = q^{*}/\alpha\),
-our resulting system of equations
-is \begin{equation}\phantomsection\label{eq-eq3.53}{\begin{aligned}
-    p_{11}^{\prime}-\bar{\tau}^{*} q_{11}^{\prime}=\bar{m}_{11}, p_{00}^{\prime}-\bar{\tau}^{*} q_{00}^{\prime}=\bar{m}_{00}, \bar{\tau}^{*} q_{0}^{\prime}=\bar{C}_{0}.
-\end{aligned}}\end{equation}
-
-Now, similarly, we can approximate \(\underline{\ell}_{f}^{*}\) using the
-algorithm we defined for quasiconvex metrics
-(Algorithm~\ref{alg-lfpm}), where we altered the search range and
-comparisons. After finding the minimizer, we obtain the hyperplane
-\begin{equation}\phantomsection\label{eq-eq3.54}{\underline{\ell}:=\underline{m}_{11} t p+\underline{m}_{00} t n=\underline{m}_{11} \underline{TP}^{*}+\underline{m}_{00} \underline{TN}^{*} = \underline{C}_{0},}\end{equation}
-which is equivalent to \(\underline{\ell}_{f}^{*}\) up to a constant
-multiple. So then, our system of
-equations is
-\begin{equation}\phantomsection\label{eq-eq3.55}{p_{11}^{*}-\underline{\tau}^{*} q_{11}^{*}=\gamma \underline{m}_{11}, p_{00}^{*}-\underline{\tau}^{*} q_{00}^{*}=\gamma \underline{m}_{00}, \underline{\tau}^{*} q_{0}^{*}=\gamma \underline{C}_{0},}\end{equation}
-where \(\gamma <0\) (for a reason analogous to why we have
-\(\alpha >0\)), meaning our resulting system of equations is
-\begin{equation}\phantomsection\label{eq-eq3.56}{\begin{aligned}
-    p_{11}^{\prime \prime}-\underline{\tau}^{*} q_{11}^{\prime \prime}=\underline{m}_{11}, p_{00}^{\prime \prime}-\underline{\tau}^{*} q_{00}^{\prime \prime}=\underline{m}_{00}, \underline{\tau}^{*} q_{0}^{\prime \prime}=\underline{C}_{0}.
-\end{aligned}}\end{equation}
-
-Equation~\ref{eq-eq3.53} and Equation~\ref{eq-eq3.56} form the two
-systems of equations mentioned in our overview of the algorithm. Next,
-we demonstrate that they have only one degree of freedom. Note that if
-we know \(p_{11}'\), we could solve both systems of equations as
-follows:
-\begin{equation}\phantomsection\label{eq-eq3.57}{\begin{aligned}
-    p_{00}^{\prime}  &=1-p_{11}^{\prime}, q_{0}^{\prime}=\bar{C}_{0} \frac{P^{\prime}}{Q^{\prime}}\\
-    q_{11}^{\prime}  &=\left(p_{11}^{\prime}-\bar{m}_{11}\right) \frac{P^{\prime}}{Q^{\prime}} \\
-    q_{00}^{\prime}&=\left(p_{00}^{\prime}-\bar{m}_{00}\right) \frac{P^{\prime}}{Q^{\prime}},
-\end{aligned}}\end{equation} where
-\(P^{\prime}=p_{11}^{\prime} \zeta+p_{00}^{\prime}(1-\zeta)\) and
-\(Q^{\prime}=P^{\prime}+\bar{C}_{0}-\)
-\(\bar{m}_{11} \zeta-\bar{m}_{00}(1-\zeta).\)
-
-Now, suppose we know \(p_{11}'\). We could use this value to solve both
-systems Equation~\ref{eq-eq3.53} and Equation~\ref{eq-eq3.56}, yielding
-two metrics, \(\phi'\) and \(\phi''\), from the maximizer and minimizer,
-respectively. Importantly, when
-\(p_{11}^{*} / p_{00}^{*}=p_{11}^{\prime} / p_{00}^{\prime}=p_{11}^{\prime \prime} / p_{00}^{\prime \prime}\),
-then
-\(\phi^{*}(C)=\phi^{\prime}(C) / \alpha=-\phi^{\prime \prime}(C) / \gamma\).
-Essentially, when we find a value of \(p_{11}'\) that results in
-\(\phi'\) and \(\phi''\) having constant ratios at all points on the
-boundary of \(\mathcal{C}\), we can obtain \(\phi^*\), as it is
-derivable from \(\phi'\) and \(\alpha\) (or, alternatively, \(\phi''\)
-and \(\gamma\)).
-
-We will perform a grid search for \(p_{11}'\) on \([0,1]\). For each
-point in our search, we will compute \(\phi'\) and \(\phi''\). Then, we
-will generate several confusion matrices on the boundaries and calculate
-the ratio \(\phi'' / \phi'\) for each. We will select the value of
-\(p_{11}'\) for which the ratio \(\phi'' / \phi'\) is closest to
-constant and use it to compute the elicited metric \(\hat{\phi}\). The
-pseudocode for LFPM elicitation is given in Algorithm~\ref{alg-lfpm}.
-
-\begin{algorithm}[H]
-    \caption{Grid Search for Best Ratio}
-    \label{alg-lfpm}
-\begin{algorithmic}[1]
-        \State \textbf{Input:} $k, \Delta$.
-        \State \textbf{Initialize:} $\sigma_{\text{opt}} = \infty, p'_{11,\text{opt}} = 0$.
-        \State Generate $C_1, \dots, C_k$ on $\partial C_+$ and $\partial C_-$ (Section 3).
-        \For{$p'_{11} = 0; \; p'_{11} \leq 1; \; p'_{11} = p'_{11} + \Delta$}
-            \State Compute $\phi'$, $\phi''$ using Proposition 4. 
-            \State Compute array $r = \left[ \frac{\phi'(C_1)}{\phi''(C_1)}, \dots, \frac{\phi'(C_k)}{\phi''(C_k)} \right]$.
-            \State Set $\sigma = \text{std}(r)$.
-            \If{$\sigma < \sigma_{\text{opt}}$}
-                \State Set $\sigma_{\text{opt}} = \sigma$ and $p'_{11,\text{opt}} = p'_{11}$.
-            \EndIf
-        \EndFor
-        \State \textbf{Output:} $p'_{11,\text{opt}}$.
-    \end{algorithmic}
-\end{algorithm}
-
-We provide a Python implementation below.
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\KeywordTok{def}\NormalTok{ lfpm\_elicitation(k, delta):}
-    \CommentTok{"""}
-\CommentTok{    Inputs:}
-\CommentTok{    {-} k: the number of confusion matrices to evaluate on}
-\CommentTok{    {-} delta: the spacing for the grid search}
-\CommentTok{    Outputs:}
-\CommentTok{    {-} p\_11\textquotesingle{}, which will allow us to compute the elicited LFPM}
-\CommentTok{    """}
-
-\NormalTok{    sigma\_opt }\OperatorTok{=}\NormalTok{ np.inf}
-\NormalTok{    p11\_opt }\OperatorTok{=} \DecValTok{0}
-\NormalTok{    C }\OperatorTok{=}\NormalTok{ compute\_confusion\_matrices(k) }\CommentTok{\# generates k confusion matrices to evaluate on}
-
-    \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\BuiltInTok{int}\NormalTok{(}\DecValTok{1}\OperatorTok{/}\NormalTok{delta)):}
-\NormalTok{        p11 }\OperatorTok{=}\NormalTok{ i }\OperatorTok{*}\NormalTok{ delta}
-\NormalTok{        phi1 }\OperatorTok{=}\NormalTok{ compute\_upper\_metric(p11) }\CommentTok{\# solves the first system of equations with p11 }
-\NormalTok{        phi2 }\OperatorTok{=}\NormalTok{ compute\_lower\_metric(p11) }\CommentTok{\# solves the second system of equations with p11 }
-\NormalTok{        utility\_1 }\OperatorTok{=}\NormalTok{ [phi1(c) }\ControlFlowTok{for}\NormalTok{ c }\KeywordTok{in}\NormalTok{ C] }\CommentTok{\#calculate phi for both systems of equations}
-\NormalTok{        utility\_2 }\OperatorTok{=}\NormalTok{ [phi2(c) }\ControlFlowTok{for}\NormalTok{ c }\KeywordTok{in}\NormalTok{ C]}
-
-\NormalTok{        r }\OperatorTok{=}\NormalTok{ []}
-        \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(k):}
-\NormalTok{            r.append(utility\_1[i] }\OperatorTok{/}\NormalTok{ utility\_2[i])}
-\NormalTok{        sigma }\OperatorTok{=}\NormalTok{ np.std(r)}
-
-        \ControlFlowTok{if}\NormalTok{(sigma }\OperatorTok{\textless{}}\NormalTok{ sigma\_opt):}
-\NormalTok{            sigma\_opt }\OperatorTok{=}\NormalTok{ sigma}
-\NormalTok{            p11\_opt }\OperatorTok{=}\NormalTok{ p11}
-    \ControlFlowTok{return}\NormalTok{ p11\_opt}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-In summary, to elicit LFPMs, we utilize a special property of the LPM
-minimizer and maximizer on \(\mathcal{C}\)---namely, that we can use the
-corresponding supporting hyperplanes to form a system of equations that
-can be used to approximate \(\phi^*\) if one parameter (\(p_{11}'\)) is
-found, and that this parameter can be found using an oracle-independent
-grid search. Importantly, these algorithms can be shown to satisfy
-significant theoretical guarantees. We provide formal statements and
-intuitive interpretations of these guarantees here, with their proofs
-available in the appendix of the original paper. First, we define the
-oracle noise \(\epsilon_{\Omega}\), which arises from the oracle
-potentially flipping the comparison output on two confusion matrices
-that are close enough in utility.
-
-Given \(\epsilon, \epsilon_{\Omega} \geq 0\) and a metric \(\phi\)
-satisfying our assumptions,  Algorithm~\ref{alg-lpm}  or
- Algorithm~\ref{alg-lfpm}  finds an approximate maximizer/minimizer and
-supporting hyperplane. Additionally, the value of \(\phi\) at that point
-is within \(O\left(\sqrt{\epsilon_{\Omega}} + \epsilon\right)\) of the
-optimum, and the number of queries is
-\(O\left(\log \frac{1}{\epsilon}\right)\). Let \(\mathbf{m}^{*}\) be the
-true performance metric. Given \(\epsilon > 0\), LPM elicitation outputs
-a performance metric \(\hat{\mathbf{m}}\), such that
-\(\left\|\mathbf{m}^{*} - \hat{\mathbf{m}}\right\|_{\infty} \leq \sqrt{2} \epsilon + \frac{2}{k_{0}} \sqrt{2 k_{1} \epsilon_{\Omega}}\).
-These results ensure that  Algorithm~\ref{alg-lpm}  and
- Algorithm~\ref{alg-lfpm}  find an appropriate maximizer and minimizer
-in the search space, within a certain range of accuracy that depends on
-oracle and sample noise, and within a certain number of queries. Both of
-these statements are guaranteed by the binary search approach.
-
-Let \(h_{\theta}\) and \(\hat{h}_{\theta}\) be two classifiers estimated
-using \(\eta\) and \(\hat{\eta}\), respectively. Further, let
-\(\bar{\theta}\) be such that
-\(h_{\bar{\theta}} = \arg \max _{\theta} \phi\left(h_{\theta}\right)\).
-Then
-\(\|C(\hat{h}_{\bar{\theta}}) - C\left(h_{\bar{\theta}}\right)\|_{\infty} = O\left(\left\|\hat{\eta}_{n} - \eta\right\|_{\infty}\right)\).
-This result indicates that the drop in elicited metric quality caused by
-using a dataset of samples rather than population confusion matrices is
-bounded by the drop in performance of the decision boundary \(\eta\).
-These three guarantees together ensure that oracle noise and sample
-noise do not amplify drops in performance when using metric elicitation;
-rather, these drops in performance are bounded by the drops that would
-typically occur when using the standard machine learning paradigm of
-training a decision boundary and using a pre-established metric. For
-further interesting exploration of the types of problems that can be
-solved using the framework of metric elicitation, we refer the reader to
-(\citeproc{ref-nips}{Hiranandani, Narasimhan, and Koyejo 2020}), which
-performs metric elicitation to determine the oracle's ideal tradeoff
-between the classifier's overall performance and the discrepancy between
-its performance on certain protected groups.
-
-\subsection{Multiclass Performance Metric
-Elicitation}\label{multiclass-performance-metric-elicitation}
-
-Although the previous section only described metric elicitation for
-binary classification problems, the general framework can still be
-applied to multiclass classification problems
-(\citeproc{ref-NEURIPS2019_1fd09c5f}{Hiranandani et al. 2019b}).
-Consider the case of classifying subtypes of leukemia
-(\citeproc{ref-YangNaiman+2014+477+496}{Yang and Naiman 2014}). We can
-train a neural network to predict conditional probability of a certain
-leukemia subtype given certain gene expressions. However, it may not be
-appropriate to classify the subtype purely based on whichever one has
-the highest confidence. For instance, a treatment for leukemia subtype
-C1 may be perfect for cases of C1, but it may be ineffective or harmful
-for certain other subtypes. Therefore, the final response from the
-classifier may not be as simple as choosing the class with the
-highest conditional probability, just like how the threshold for binary
-classification may not always be 50\%. With multiclass metric
-elicitation, we can show confusion matrices to an oracle (like the
-doctor in the leukemia example) to determine which classifier has the
-best tradeoffs. In (\citeproc{ref-NEURIPS2019_1fd09c5f}{Hiranandani et
-al. 2019b}), the authors focus on eliciting linear performance metrics,
-which is what we will describe in this chapter. Most of the notation
-from Binary Metric Elicitation still persists, just modified to provide
-categorical responses. \(X \in \mathcal{X}\) is the input random
-variable. \(Y \in [k]\) is the output random variable, where \([k]\) is
-the index set \(\{1, 2, \dots, k\}\).
-
-The dataset of size \(n\) is denoted by \(\{(\vec{x}_i, y_i)\}_{i=1}^n\)
-generated independently and identically from \(\mathbb{P}(X, Y)\).
-\(\eta_i(\vec{x}) = \mathbb{P}(Y=i | X=\vec{x})\) gives the conditional
-probability of class \(i \in [k]\) given an observation.
-\(\xi_i = \mathbb{P}(Y=i)\) is the marginal probability of class
-\(i \in [k]\). The set of all classifiers is
-\(\mathcal{H} = \{h : \mathcal{X} \rightarrow \Delta_k\}\), where
-\(\Delta_k\) is the \((k-1)\)-dimensional simplex. In this case, the outputs of
-classifiers are 1-hot vectors of size \(k\) where the only index with
-value 1 is the predicted class and all other positions have a value of
-0. The confusion matrix for a classifier, \(h\), is
-\(C(h, \mathbb{P}) \in \mathbb{R}^{k \times k}\), where:
-
-\begin{equation}\phantomsection\label{eq-eq3.59}{C_{ij}(h, \mathbb{P}) = \mathbb{P}(Y=i, h=j) \text{\qquad for } i, j \in [k]}\end{equation}
-
-Note that the confusion matrices are \(k\times k\) and store the joint
-probabilities of each type of classification for each possible class.
-This means that the sum of row \(i\) in the confusion matrix equals
-\(\xi_i\), because this is equivalent to adding over all possible
-classifications. Since we know the sums of each row, all diagonal
-elements can be reconstructed from just the off-diagonal elements, so a
-confusion matrix \(C(h, \mathbb{P})\) can be expressed as a vector of
-off-diagonal elements,
-\(\vec{c}(h, \mathbb{P}) = \textit{off-diag}(C(h, \mathbb{P}))\), and
-\(\vec{c} \in \mathbb{R}^q\) where \(q := k^2 - k\). The vector
-\(\vec{c}\) is called the vector of \emph{`off-diagonal confusions.'}
-The space of off-diagonal confusions is
-\(\mathcal{C} = \{\vec{c}(h, \mathbb{P}) : h \in \mathcal{H}\}\).
-
-In cases where the oracle would care about the exact type of
-misclassification (e.g.~misclassifying an object from class 1 as class
-2), this off-diagonal confusion matrix is necessary. However, there are
-many cases where the performance of a classifier is determined by just
-the probability of correct prediction for each class, which just
-requires the diagonal elements. In these cases, we can define the vector
-of \emph{`diagonal confusions'} as
-\(\vec{d}(h, \mathbb{P}) = \textit{diag}(C(h, \mathbb{P})) \in \mathbb{R}^k\).
-The space of diagonal confusions is
-\(\mathcal{D} = \{\vec{d}(h, \mathbb{P}) : h \in \mathcal{H}\}\).
-
-Finally, the setup for metric elicitation is identical to the one
-examined in the previous chapter. We still assume access to an oracle
-that can choose between two classifiers or confusion matrices, using
-notation \(\Gamma\) for comparing two classifiers and \(\Omega\) for
-comparing confusion matrices, which returns 1 if the first classifier is
-better and 0 otherwise. We still assume that the oracle behaves
-according to some unknown performance metric, and we wish to recover
-this metric up to some small error tolerance (based on a suitable norm).
-The two different types of confusion vectors result in different
-algorithms for metric elicitation, which we will explore in later
-sections.
-
-A Diagonal Linear Performance Metric (DLPM) is a performance metric that
-only considers the diagonal elements in the confusion matrix. The metric
-is defined as \(\psi(\vec{d}) = \langle \vec{a}, \vec{d} \rangle\),
-where \(\vec{a} \in \mathbb{R}^k\) such that \(||\vec{a}||_1 = 1\). It
-is also called weighted accuracy
-(\citeproc{ref-pmlr-v37-narasimhanb15}{Narasimhan et al. 2015}). The
-family of DLPMs is denoted as \(\varphi_{DLPM}\). Since these only
-consider the diagonal elements, which we want to maximize, we can focus
-on only eliciting monotonically increasing DLPMs, meaning that all
-elements in \(\vec{a}\) are non-negative.
-
-Consider the trivial classifiers that only predict a single class at all
-times. The diagonal confusions when only predicting class \(i\) are
-\(\vec{v}_i \in \mathbb{R}^k\) with \(\xi_i\) at index \(i\) and zero
-elsewhere. Note that this is the maximum possible value in index \(i\),
-because this represents perfectly classifying all points that have a
-true class of \(i\). We can consider the space of diagonal confusions,
-visualized in Figure~\ref{fig-diag_geom} (taken from
-(\citeproc{ref-NEURIPS2019_1fd09c5f}{Hiranandani et al. 2019b})). The
-space of \(\mathcal{D}\) is strictly convex, closed, and contained in
-the box \([0, \xi_1] \times \dots \times [0, \xi_k]\). We also know that
-the only vertices are \(\vec{v}_i\) for each \(i \in [k]\).
-
-\begin{figure}
-
-\centering{
-
-\pandocbounded{\includegraphics[keepaspectratio]{src/Figures/diag_geometry.png}}
-
-}
-
-\caption{\label{fig-diag_geom}(a) Geometry of space of diagonal
-confusions for \(k=3\). This is a convex region with three flat areas
-representing confusions when restricted to only two classes. (b)
-Geometry of diagonal confusions when restricted to classes \(k_1\) and
-\(k_2\). Notice how this is identical to the space of confusion matrices
-examined in the previous chapter.}
-
-\end{figure}%
-
-We know that this is strictly convex under the assumption that an object
-from any class can be misclassified as any other class. Mathematically,
-the assumption is that
-\(g_{ij}(r) = \mathbb{P} \left[\frac{\eta_i(X)}{\eta_j(X)} \geq r \right]\)
-\(\forall i, j \in [k]\) are continuous and strictly decreasing for
-\(r \in [0, \infty)\).
-
-We can also define the space of binary classification confusion matrices
-confined to classes \(k_1\) and \(k_2\), which is the 2-D \((k_1, k_2)\)
-axis-aligned face of \(\mathcal{D}\), denoted as
-\(\mathcal{D}_{k_1, k_2}\). Note that this is strictly convex, since
-\(\mathcal{D}\) itself is strictly convex, and it has the same geometry
-as the space of binary confusion matrices examined in the previous
-chapter. Therefore, we can construct an RBO classifier for
-\(\psi \in \varphi_{DLPM}\), parameterized by \(\vec{a}\), as follows:
-\begin{equation}\phantomsection\label{eq-rbo_eq}{\begin{aligned}
-\bar{h}_{k_1, k_2}(\vec{x})= \left\{
-\begin{array}{ll}
-      k_1, \text{ if } a_{k_1} \eta_{k_1}(\vec{x}) \geq a_{k_2} \eta_{k_2}(\vec{x})\\
-k_2, \text{ o.w.}
-\end{array}
-\right\}.
-\end{aligned}}\end{equation}
-
-We can parameterize the upper boundary of \(\mathcal{D}_{k_1, k_2}\),
-denoted as \(\partial \mathcal{D}^{+}_{k_1, k_2}\), using a single
-parameter \(m \in [0, 1]\). Specifically, we can construct a DLPM by
-setting \(a_{k_1} = m\), \(a_{k_2} = 1 - m\), and all others to 0. Using
-Equation~\ref{eq-rbo_eq}, we can get the diagonal confusions, so varying
-\(m\) parameterizes \(\partial \mathcal{D}^{+}_{k_1, k_2}\). The
-parameterization is denoted as \(\nu(m; k_1, k_2)\).
-
-\subsubsection{Diagonal Linear Performance Metric
-Elicitation}\label{diagonal-linear-performance-metric-elicitation}
-
-Suppose the oracle follows a true metric, \(\psi\), that is linear and
-monotone increasing across all axes. If we consider the composition
-\(\psi \circ \nu(m; k_1, k_2): [0, 1] \rightarrow \mathbb{R}\), we know
-it must be concave and unimodal, because \(\mathcal{D}_{k_1, k_2}\) is a
-convex set. Therefore, we can find the value of \(m\) that maximizes
-\(\psi \circ \nu(m; k_1, k_2)\) for any given \(k_1\) and \(k_2\) using
-a binary search procedure.
-
-Since the RBO classifier for classes \(k_1\) and \(k_2\) relies only on
-the relative weights of the classes in the DLPM (see
-Equation~\ref{eq-rbo_eq}), finding the value of \(m\) that maximizes
-\(\psi \circ \nu(m; k_1, k_2)\) gives us the true relative ratio between
-\(a_{k_1}\) and \(a_{k_2}\). Specifically, from the definition of
-\(\nu\), we know that \(\frac{a_{k_2}}{a_{k_1}} = \frac{1-m}{m}\). We
-can therefore simply calculate the ratio between \(a_1\) and all other
-weights to reconstruct an estimate for the true metric. A Python
-implementation of this algorithm is provided below.
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-
-\KeywordTok{def}\NormalTok{ rbo\_dlpm(m, k1, k2, k):}
-    \CommentTok{"""}
-\CommentTok{    This constructs DLPM weights for the upper boundary of the}
-\CommentTok{    restricted diagonal confusions, given a parameter m.}
-\CommentTok{    This is equivalent to }\CharTok{\textbackslash{}n}\CommentTok{u(m; k1, k2)}
-\CommentTok{    }
-\CommentTok{    Inputs:}
-\CommentTok{    {-} m: parameter (between 0 and 1) for the upper boundary}
-\CommentTok{    {-} k1: first axis for this  face}
-\CommentTok{    {-} k2: second axis for this face}
-\CommentTok{    {-} k: number of classes}
-\CommentTok{    Outputs:}
-\CommentTok{    {-} DLPM weights for this point on the upper boundary}
-\CommentTok{    """}
-\NormalTok{    new\_a }\OperatorTok{=}\NormalTok{ np.zeros(k)}
-\NormalTok{    new\_a[k1] }\OperatorTok{=}\NormalTok{ m}
-\NormalTok{    new\_a[k2] }\OperatorTok{=} \DecValTok{1} \OperatorTok{{-}}\NormalTok{ m}
-    \ControlFlowTok{return}\NormalTok{ new\_a}
-
-\KeywordTok{def}\NormalTok{ dlpm\_elicitation(epsilon, oracle, get\_d, k):}
-    \CommentTok{"""}
-\CommentTok{    Inputs:}
-\CommentTok{    {-} epsilon: some epsilon \textgreater{} 0 representing threshold of error}
-\CommentTok{    {-} oracle: some function that accepts 2 confusion matrices and}
-\CommentTok{        returns true if the first is preferred and false otherwise}
-\CommentTok{    {-} get\_d: some function that accepts dlpm weights and returns }
-\CommentTok{        diagonal confusions}
-\CommentTok{    {-} k: number of classes}
-\CommentTok{    Outputs:}
-\CommentTok{    {-} estimate for true DLPM weights}
-\CommentTok{    """}
-\NormalTok{    a\_hat }\OperatorTok{=}\NormalTok{ np.zeros(k)}
-\NormalTok{    a\_hat[}\DecValTok{0}\NormalTok{] }\OperatorTok{=} \DecValTok{1}
-    \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\DecValTok{1}\NormalTok{, k):}
-        \CommentTok{\# iterate over each axis to find appropriate ratio}
-\NormalTok{        a }\OperatorTok{=} \DecValTok{0}  \CommentTok{\# lower bound of binary search}
-\NormalTok{        b }\OperatorTok{=} \DecValTok{1}  \CommentTok{\# upper bound of binary search}
-
-        \ControlFlowTok{while}\NormalTok{ (b }\OperatorTok{{-}}\NormalTok{ a }\OperatorTok{\textgreater{}}\NormalTok{ epsilon):}
-\NormalTok{            c }\OperatorTok{=}\NormalTok{ (}\DecValTok{3} \OperatorTok{*}\NormalTok{ a }\OperatorTok{+}\NormalTok{ b) }\OperatorTok{/} \DecValTok{4}
-\NormalTok{            d }\OperatorTok{=}\NormalTok{ (a }\OperatorTok{+}\NormalTok{ b) }\OperatorTok{/} \DecValTok{2}
-\NormalTok{            e }\OperatorTok{=}\NormalTok{ (a }\OperatorTok{+} \DecValTok{3} \OperatorTok{*}\NormalTok{ b) }\OperatorTok{/} \DecValTok{4}
-
-            \CommentTok{\# get diagonal confusions for each point}
-\NormalTok{            d\_a, d\_c, d\_d, d\_e, d\_b }\OperatorTok{=}\NormalTok{ (get\_d(rbo\_dlpm(x, }\DecValTok{0}\NormalTok{, i, k)) }
-                \ControlFlowTok{for}\NormalTok{ x }\KeywordTok{in}\NormalTok{ [a, c, d, e, b])}
-
-            \CommentTok{\# query oracle for each pair}
-\NormalTok{            response\_ac }\OperatorTok{=}\NormalTok{ oracle(d\_a, d\_c)}
-\NormalTok{            response\_cd }\OperatorTok{=}\NormalTok{ oracle(d\_c, d\_d)}
-\NormalTok{            response\_de }\OperatorTok{=}\NormalTok{ oracle(d\_d, d\_e)}
-\NormalTok{            response\_eb }\OperatorTok{=}\NormalTok{ oracle(d\_e, d\_b)}
-
-            \CommentTok{\# update ranges to keep the peak}
-            \ControlFlowTok{if}\NormalTok{ response\_ac:}
-\NormalTok{                b }\OperatorTok{=}\NormalTok{ d}
-            \ControlFlowTok{elif}\NormalTok{ response\_cd:}
-\NormalTok{                b }\OperatorTok{=}\NormalTok{ d}
-            \ControlFlowTok{elif}\NormalTok{ response\_de:}
-\NormalTok{                a }\OperatorTok{=}\NormalTok{ c}
-\NormalTok{                b }\OperatorTok{=}\NormalTok{ e}
-            \ControlFlowTok{elif}\NormalTok{ response\_eb:}
-\NormalTok{                a }\OperatorTok{=}\NormalTok{ d}
-            \ControlFlowTok{else}\NormalTok{:}
-\NormalTok{                a }\OperatorTok{=}\NormalTok{ d}
-
-\NormalTok{        midpt }\OperatorTok{=}\NormalTok{ (a }\OperatorTok{+}\NormalTok{ b) }\OperatorTok{/} \DecValTok{2}
-\NormalTok{        a\_hat[i] }\OperatorTok{=}\NormalTok{ (}\DecValTok{1} \OperatorTok{{-}}\NormalTok{ midpt) }\OperatorTok{/}\NormalTok{ midpt}
-    \ControlFlowTok{return}\NormalTok{ a\_hat }\OperatorTok{/}\NormalTok{ np.}\BuiltInTok{sum}\NormalTok{(a\_hat)}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-To use this algorithm for metric elicitation on a real dataset, we need
-to supply the ``oracle'' and ``get\_d'' functions. The oracle function
-is an interface to an expert who judges which of two confusion matrices
-is better. The get\_d function will need to construct a classifier given
-the DLPM weights, following the principles of the RBO classifier from
-Equation~\ref{eq-rbo_eq}, and calculate the confusion matrix from a
-validation set.
-
-Using the same oracle feedback noise model from the binary metric
-elicitation, we can make the following guarantees:
-
-\begin{tcolorbox}[colframe=.grey, title=\faPenSquare \enspace Proposition]
-
-\phantomsection\label{prop-prop_dlpm}
-Let \(\epsilon, \epsilon_\Omega \geq 0\), and let \(\varphi^*\) be a
-1-Lipschitz DLPM parameterized by \(\vec{a}^*\). Then the output
-\(\hat{a}\) of the DLPM elicitation algorithm after
-\(O((k-1)\log\frac{1}{\epsilon})\) queries to the oracle satisfies
-\(||\vec{a}^* - \hat{a}||_\infty \leq O(\epsilon + \sqrt{\epsilon_\Omega})\),
-which in turn implies
-\(||\vec{a}^* - \hat{a}||_2 \leq O(\sqrt{k}(\epsilon + \sqrt{\epsilon_\Omega}))\).
-
-\end{tcolorbox}
-
-In other words, the maximum difference between the estimate and true
-value along any component (indicated by the L-infinity norm) is linearly
-bounded by the sum of the epsilon specified by the algorithm and the
-square root of the oracle's correctness guarantee (\(\epsilon_\Omega\)).
-
-\section{Case Study 3: Active Preference Learning in
-Robotics}\label{case-study-3-active-preference-learning-in-robotics}
-
-How exactly do robots learn human preferences from just the pairwise
-comparisons, if they need to learn how to act in the environment itself?
-The comparisons in turn help robots learn the reward function of the
-human, which allows them to further take actions in real settings. Let's
-say there are two trajectories \(\xi_A\) and \(\xi_B\) that might be
-taken as the next course of action in any context, like choosing the
-next turn, or choosing the next ChatGPT response. The robot is offering
-both to a human for comparison. To answer which of them is better, the
-human would ask themselves if \(R(\xi_A)\) or \(R(\xi_B)\) is bigger,
-with \(R(\xi) = w * \phi(\xi)\) being the reward function. In this
-equation \(w\) and \(\phi(\xi)\) are vectors of weights and features of
-the trajectory, so alternatively, we can express this as:
-
-\begin{equation}\phantomsection\label{eq-reward_eq}{R(\xi) = \begin{bmatrix} w_1 \\ w_2 \\ ... \\ w_N \end{bmatrix} \cdot \begin{bmatrix} \phi_1(\xi) \\ \phi_2(\xi) \\ ... \\ \phi_N(\xi) \end{bmatrix}}\end{equation}
-
-If one says that they preferred \(\xi_2\) less than \(\xi_1\), then it
-means
-\(\xi_2 < \xi_1 \implies R(\xi_2) < R(\xi_1) \implies w * \phi(\xi_2) < w * \phi(\xi_1) \implies 0 < w * (\phi(\xi_1) - \phi(\xi_2)) \implies 0 < w * \Phi\),
-where \(\Phi = \phi(\xi_1) - \phi(\xi_2)\). Alternatively, if one preferred \(\xi_2\) more than \(\xi_1\), the signs
-would be flipped, resulting in \(0 > w * \Phi\). The two results can be
-represented in the N-dimensional space, where when it is split by the
-decision boundary, it creates half-spaces indicating preferences for
-each of the sides. For example we can see how a query between two items
-can split the plane into two halves, indicating preference towards one
-of the items. Such an image can be extended into bigger dimensions,
-where a line would become a separating hyperplane. If one is to truly
-believe the answers of one person, they would remove everything from the
-other side of the hyperplane that does not agree with the received human
-preference. But since humans are noisy, that approach is not optimal,
-thus most applications up-weight the indicated side of the plane to
-emphasize that points on that side are better, and down-weight the other
-side as they do not agree with the provided comparison.
-
-How should someone choose which queries to conduct? In other words, what is
-the most informative query sequence? After completing one query, the
-next query should be orthogonal to the previous one so that the
-potential space consistent with the preferences decreases in half. The
-intuition behind that is the potential space has all of the reward
-functions that agree with the provided answers, so to find a specific
-reward function for a human, decreasing the space narrows down the
-possible options. The original query created the blue space, and a new
-one created a red space, resulting in a purple intersection of the two
-which is still consistent with both of the queries' results. The image
-shows that the purple portion is exactly half of the blue portion.
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=0.4\linewidth,height=\textheight,keepaspectratio]{src/Figures/2D-space.jpg}
-
-}
-
-\caption{\label{fig-2dspace}Creating further comparisons limits the
-space that agrees with answers to all of them. The blue area
-demonstrates a preference for object 1 over object 2. The red area
-demonstrates a preference for object 3 over object 4. Combination
-(purple area) shows the space that is consistent with both of those
-preferences.}
-
-\end{figure}%
-
-Mathematically, from (\citeproc{ref-pmlr-v87-biyik18a}{Biyik and Sadigh
-2018}) this can be expressed as set \(F\) of potential queries \(\phi\),
-where
-\(F = \{\phi: \phi = \Phi(\xi_A) - \Phi(\xi_B), \xi_A, \xi_B \in \Xi\}\)
-(defining that a query is the difference between the features of two
-trajectories). Using that, the authors define a human update function
-\(f_{\phi}(w) = \min(1, \exp(I_t w^\top \phi))\) (with \(I_t \in \{-1, +1\}\) denoting the human's answer to the query) that accounts for how much of
-the space will still be consistent with the preferences. Finally, for a
-specific query, they define the minimum volume removed as
-\(\min\{\mathbb{E}[1 - f_{\phi}(w)], \mathbb{E}[1 - f_{-\phi}(w)]\}\)
-(expected size of the two sides of the remaining space after it is split
-by a query - purple area in Figure~\ref{fig-2dspace}), and the final
-goal is to maximize that amount over all possible queries since it is
-optimal to get rid of as much space as possible to narrow down the
-options for the reward function:
-\(\max_{\phi} \min\{ \mathbb{E}[1 - f_{\phi}(w)], \mathbb{E}[1 - f_{-\phi}(w)]\}\).
-Effectively this is finding such \(\phi\) that maximizes the information
-one can get by asking the next comparison query. While this approach
-uses minimum volume removed, there can be other metrics inside the
-\(\max\) function. Some applications like movie recommendations do not
-require extra constraints, however in robotics one might want to add
-more constraints that satisfy certain rules, so that the resulting query
-follows the dynamics of the physical world.
-
-The first real example of learning reward functions from pairwise
-comparisons is a 2D driving simulator from
-(\citeproc{ref-pmlr-v87-biyik18a}{Biyik and Sadigh 2018}). In
-\textbf{?@fig-car\_direct} you can see the setting of a 3-lane road with
-the orange car being controlled by the computer. The queries conducted
-for this problem are two different trajectories presented to the human,
-and they are asked to evaluate which one of them is better. For the
-features that contribute to the reward function, it is important to
-consider that robots might not find some of the information as
-informative for the learning process as a human would. For this example,
-the underlying features included the distance between lane boundaries,
-distance to other cars, and the heading and speed of the controlled car.
-The last feature was weighted the highest according
-to the authors, since it takes a lot of effort for the car to change or
-correct its direction.
-
-At the start of the learning process, the car had no direction learned
-and was moving all over the road. In the middle of learning after 30
-queries, the simulator learned to follow the direction of the road and
-go straight but still experienced collisions. After 70 queries, the
-simulator learned to avoid collisions, as well as keep the car within
-the lane without swerving.
-
-\subsubsection{Active Learning for Pairwise
-Comparisons}\label{active-learning-for-pairwise-comparisons}
-
-We have discussed that pairwise comparisons should be selected to
-maximize the minimum volume of remaining options removed. The question
-that can come out of the driving example is whether it really matters to
-follow that goal, or whether a random choice of queries performs as well. It
-turns out that most active learning (AL) algorithms (purposefully selecting
-queries) do converge over time to the performance of random query
-selection, so in the long term the performance is similar. However, what is
-different is that AL achieves better performance earlier, which in
-time-sensitive tasks can be a critical factor. One example of such a
-setting can be exoskeletons for humans as part of the rehabilitation
-after surgery (\citeproc{ref-Li_2021}{Li et al. 2021}). Different people
-have significantly different walking patterns as well as rehabilitation
-requirements, so the exoskeleton needs to adapt to the human as soon as
-possible for a more successful rehabilitation.
-Figure~\ref{fig-robotics} demonstrates the difference in the time needed
-between the two approaches. In general, in robotics, the time
-differences that might seem small to a human might be detrimental to the
-final performance.
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=0.6\linewidth,height=\textheight,keepaspectratio]{src/Figures/robo_graph.png}
-
-}
-
-\caption{\label{fig-robotics}Performance of AL and random query
-selection algorithms in the task of exoskeleton learning with human
-preferences. (\citeproc{ref-Li_2021}{Li et al. 2021})}
-
-\end{figure}%
-
-In conclusion, pairwise comparisons prove to be a great way of learning
-linear reward functions, but at times present challenges or
-incapabilities that can be further improved with additional
-incorporations of approaches like AL. That improves many applications in
-terms of time spent getting to the result in case of exoskeleton
-adjustments, as well as getting to a middle ground between polar
-behaviors in applications like negotiations.
-
-\subsection{Application: Guiding Human Demonstrations in
-Robotics}\label{application-guiding-human-demonstrations-in-robotics}
-
-A strong approach to learning policies for robotic manipulation is
-imitation learning, the technique of learning behaviors from human
-demonstrations. In particular, interactive imitation learning allows a
-group of humans to contribute their own demonstrations for a task,
-allowing for scalable learning. However, not all groups of demonstrators
-are equally helpful for interactive imitation learning.
-
-The ideal set of demonstrations for imitation learning would follow a
-single, optimal method for performing the task, which a robot could
-learn to mimic. Conversely, \emph{multimodality}, the presence of
-multiple optimal methods in the demonstration set, is challenging for
-imitation learning since it has to learn from contradicting information
-for how to accomplish a task. A common reason for multimodality is the
-fact that different people often subconsciously choose different paths
-for execution, as illustrated in Figure~\ref{fig-multimodalexecution}.
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=0.5\linewidth,height=\textheight,keepaspectratio]{src/Figures/multimodal_peg.png}
-
-}
-
-\caption{\label{fig-multimodalexecution}Examples of two different ways
-to insert a nut onto a round peg. The orange demonstration picks up the
-nut from the hole while the blue demonstration picks up the nut from the
-side (\citeproc{ref-gandhi2022eliciting}{Gandhi et al. 2022})}
-
-\end{figure}%
-
-Gandhi et al. (\citeproc{ref-gandhi2022eliciting}{Gandhi et al. 2022})
-identify whether demonstrations are compatible with one another and
-offer an active elicitation interface to guide humans to provide better
-demonstrations in interactive imitation learning. Their key motivation
-is to allow multiple users to contribute demonstrations over the course
-of data collection by guiding users towards compatible demonstrations.
-To identify whether a demonstration is ``compatible'' with a base policy
-trained with prior demonstrations, the researchers measure the
-\emph{likelihood} of demonstrated actions under the base policy, and the
-\emph{novelty} of the visited states. Intuitively, low likelihood and
-low novelty demonstrations should be excluded since they represent
-conflicting modes of behavior on states that the robot can already
-handle, and are therefore incompatible. This concept of compatibility is
-used for filtering a new set of demonstrations and actively eliciting
-compatible demonstrations. In the following subsections, we describe the
-process of estimating compatibility and active elicitation in more
-detail.
-
-\subsubsection{Estimating Compatibility}\label{estimating-compatiblity}
-
-We want to define a compatibility measure \(\mathcal{M}\), that
-estimates the performance of policy \(\pi_{base}\) that is retrained on
-a union of \(\mathcal{D}_{base}\), the known base dataset, and
-\(\mathcal{D}_{new}\), the newly collected dataset. To define this
-compatibility measure in a way that is easy to compute, we can use two
-interpretable metrics: likelihood and novelty. The likelihood of actions
-\(a_{new}\) in \(\mathcal{D}_{new}\) is measured as the negative mean
-squared error between actions predicted by the base policy and this
-proposed action:
-
-\begin{equation}\phantomsection\label{eq-eq3.61}{likelihood(s_{new}, a_{new}) = -\mathbb{E}[|| \pi_{base}(s_{new}) - a_{new} ||^2_2].}\end{equation}
-
-The novelty of the state \(s_{new}\) in \(\mathcal{D}_{new}\) is the
-variance of the predicted actions under the base policy:
-
-\begin{equation}\phantomsection\label{eq-eq3.62}{novelty(s_{new}) = \mathrm{Var}[\pi_{base}(s_{new})].}\end{equation}
-
-We can plot likelihood and novelty on a 2D plane, as shown in
-Figure~\ref{fig-likelihood_novelty}, and identify thresholds on
-likelihood and novelty, denoted as \(\lambda\) and \(\eta\)
-respectively. Intuitively, demonstrations with low likelihood in low
-novelty states should be excluded, because this indicates that there is
-a conflict between the base behavior and the new demonstration due to
-multimodality. Note that in high novelty states, the likelihood should
-be disregarded because the base policy does not have a concrete idea for
-how to handle these states anyways so more data is needed.
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=0.8\linewidth,height=\textheight,keepaspectratio]{src/Figures/likelihood_novelty.png}
-
-}
-
-\caption{\label{fig-likelihood_novelty}Examples of plots of likelihood
-and novelty for compatible and incompatible operators
-(\citeproc{ref-gandhi2022eliciting}{Gandhi et al. 2022})}
-
-\end{figure}%
-
-The final compatibility metric, parameterized by the likelihood and
-novelty thresholds \(\lambda\) and \(\eta\), is
-\(\mathcal{M}(\mathcal{D}_{base}, (s_{new}, a_{new})) \in [0, 1]\),
-defined as:
-
-\begin{equation}\phantomsection\label{eq-eq3.63}{\begin{aligned}
-    \mathcal{M} = \begin{cases} 
-        1 - \min(\frac{\mathbb{E}[|| \pi_{base}(s_{new}) - a_{new} ||^2_2]}{\lambda}, 1) & \text{ if } \text{novelty}(s_{new}) < \eta \\
-        1 & \text{ otherwise }
-       \end{cases}.
-\end{aligned}}\end{equation}
-
-Note that \(\lambda\) and \(\eta\) need to be specified by hand. This is
-accomplished by assuming the ability to collect \emph{a priori
-incompatible} demonstrations to identify reasonable thresholds that
-remove the most datapoints in the incompatible demonstrations while
-keeping the most datapoints in the compatible demonstrations.
-
-\subsubsection{Case Studies with Fixed
-Sets}\label{case-studies-with-fixed-sets}
-
-The researchers evaluate the utility of the compatibility metric on
-three tasks: placing a square nut on a square peg, placing a round nut
-on a round peg, and opening a drawer and placing a hammer inside. For
-each task, they train a base policy using a ``proficient'' operator's
-demonstration while sampling trajectories from other operators for the
-new set. The naive baseline is to use all datapoints while the
-\(\mathcal{M}\)-Filtered demonstrations use the compatibility metric to
-filter out incompatible demonstrations. The results are presented in
-Table~\ref{tbl-m_filter_table}. As you can see, M-filtering results in
-equal or greater performance despite using less data than the naive
-baseline, demonstrating the effectiveness of compatibility-based
-filtering.
-
-\begin{longtable}[]{@{}lclclcl@{}}
-\caption{Success rates (mean/std across 3 training runs) for policies
-trained on \(\mathcal{D}_{new}\) by using all the data (Naive) or
-filtering by compatibility (\(\mathcal{M}\)-Filtered)
-(\citeproc{ref-gandhi2022eliciting}{Gandhi et al.
-2022})}\label{tbl-m_filter_table}\tabularnewline
-\toprule\noalign{}
-\endfirsthead
-\endhead
-\bottomrule\noalign{}
-\endlastfoot
-& Square Nut & & Round Nut & & Hammer Placement & \\
-Operator & Naive & \(\mathcal{M}\)-Filtered & Naive &
-\(\mathcal{M}\)-Filtered & Naive & \(\mathcal{M}\)-Filtered \\
-Base Operator & 38.7 (2.1) & - & 13.3 (2.3) & - & 24.7 (6.1) & - \\
-Operator 1 & 54.3 (1.5) & 61.0 (4.4) & 26.7 (11.7) & 32.0 (12.2) & 38.0
-(2.0) & 39.7 (4.6) \\
-Operator 2 & 40.3 (5.1) & 42.0 (2.0) & 22.0 (7.2) & 26.7 (5.0) & 33.3
-(3.1) & 32.7 (6.4) \\
-Operator 3 & 37.3 (2.1) & 42.7 (0.6) & 17.3 (4.6) & 18.0 (13.9) & 8.0
-(0.0) & 12.0 (0.0) \\
-Operator 4 & 27.3 (3.5) & 37.3 (2.1) & 7.3 (4.6) & 13.3 (1.2) & 4.0
-(0.0) & 4.0 (0.0) \\
-\end{longtable}
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=0.8\linewidth,height=\textheight,keepaspectratio]{src/Figures/active_elicitation.png}
-
-}
-
-\caption{\label{fig-active_elicitation}The phases of the active
-elicitation interface: (a) initial prompting, (b) demonstrations with
-live feedback, and (c) corrective feedback
-(\citeproc{ref-gandhi2022eliciting}{Gandhi et al. 2022})}
-
-\end{figure}%
-
-\subsubsection{Actively Eliciting Compatible
-Demonstrations}\label{actively-eliciting-compatible-demonstrations}
-
-In the previous section, we assume access to a dataset that has already
-been collected, and we see how filtering out incompatible demonstrations
-helps improve performance. However, when collecting a new dataset, it
-would be better to ensure that operators collect compatible
-demonstrations from the start, allowing us to retain as much data as
-possible for training.
-
-To actively elicit compatible demonstrations, the researchers set up a
-pipeline for live feedback and examples. At the start, operators are
-given a task specification and some episodes to practice using the
-robot. Then, the active elicitation process begins, as shown in
-Figure~\ref{fig-active_elicitation}. Each operator is shown some
-rollouts of the base policy to understand the style of the base
-operator. Next, the operator provides a demonstration similar to the
-ones they were shown. As they record their demonstrations, the interface
-provides online feedback, with green indicating compatible actions and
-red indicating incompatible actions. If the number of incompatible
-state-action pairs (ones where \(\mathcal{M}\) is zero) exceeds 5\% of
-the demonstration length, the demonstration is rejected. However, to
-provide corrective feedback, the interface shows the areas of the
-demonstration with the highest average incompatibility and also provides
-an expert demo that shows what should actually be done. Demonstrators
-can use this feedback to provide more compatible demonstrations moving
-forward.
-
-This process helps improve the demonstration quality in both simulation
-and real experiments, as shown in
-Table~\ref{tbl-active_elicitation_results}. Specifically, on the real
-results, active elicitation outperformed the base policy by 25\% and
-naive data collection by 55\%. Overall, active elicitation is a powerful
-tool to ensure that data collected for imitation learning improves the
-quality of the learned policy.
-
-\begin{longtable}[]{@{}llrll@{}}
-\caption{Success rates (mean/std across users) for policies trained on
-\(\mathcal{D}_{new}\) by using all the data (Naive), filtering by
-compatibility (\(\mathcal{M}\)-Filtered), or using informed
-demonstration collection (\citeproc{ref-gandhi2022eliciting}{Gandhi et
-al. 2022})}\label{tbl-active_elicitation_results}\tabularnewline
-\toprule\noalign{}
-Task & Base & Naive & Naive + Filtered & Informed \\
-\midrule\noalign{}
-\endfirsthead
-\toprule\noalign{}
-Task & Base & Naive & Naive + Filtered & Informed \\
-\midrule\noalign{}
-\endhead
-\bottomrule\noalign{}
-\endlastfoot
-Round Nut & 13.3 (2.3) & 9.6 (4.6) & 9.7 (4.2) & 15.7 (6.0) \\
-Hammer Placement & 24.7 (6.1) & 20.8 (15.7) & 22.0 (15.5) &
-31.8 (16.3) \\
-\(\left[ \textup{Real} \right]\) Food Plating & 60.0 & 30.0 (17.3) & - &
-85.0 (9.6) \\
-\end{longtable}
-
-A fundamental limitation of eliciting compatible demonstrations is the
-fact that the ``base'' demonstrator is considered the ground truth. When
-the base demonstrator specifies a preference, all other demonstrators
-must abide by it, even if they have strong preferences against it. For
-instance, when pouring milk and cereal into a bowl, different people
-have different preferences for what is the correct order, but active
-elicitation forces all demonstrators to follow the initial preference of
-the base operator. The researchers hope that future work can enable
-users to override the default demonstration set and follow a base
-behavior that better aligns with their preferences. This could enable
-multiple modes of behavior to be collected in data while only following
-a user's specified preference instead of attempting to collapse all
-modes into a single policy.
-
-Looking forward, active elicitation provides a foundation for allowing
-robots to query humans about the type of data needed, enabling more
-efficient data collection through transparency.
-
-In summary, this chapter has explored the complexities and innovations
-in interactive learning as applied to large models within robotics. It begins by
-investigating pairwise comparisons and their role in efficiently
-learning linear reward functions from large datasets, overcoming
-limitations in supervised learning. When combined with active learning
-techniques, these comparisons supply timely, targeted, and
-context-appropriate feedback, enhancing performance in time-critical
-applications like exoskeleton adjustments during rehabilitation.
-
-We then shift to imitation learning or inverse reward learning from
-demonstrations, emphasizing the difficulties introduced by multimodal
-demonstration sets. However, active elicitation approaches that compile
-compatible demonstrations are incredibly promising for tackling this
-issue, streamlining the learning process by guiding users to provide
-more valuable, consistent examples. This method shows promise in refining
-the interactive imitation learning data collection pipeline, enabling
-more capable and effective robotic training.
-
-Additionally, the chapter examines the integration of foundation models
-into robotics, highlighting the transformative innovations of R3M and
-Voltron. R3M's pre-training on diverse human activities dramatically
-improves robotic manipulation with minimal supervision. Meanwhile,
-Voltron builds on these capabilities by incorporating language-driven
-representation learning for remarkably adaptable and nuanced robotic
-task performance. These models represent significant leaps in robotics
-while opening new frontiers for future research and applications.
-
-\section{Exercises}\label{exercises-1}
-
-\subsection*{Question 1: Uncertainty Quantification in Preference
-Learning (40
-points)}\label{sec-question-1-uncertainty-quantification-in-preference-learning-40-points}
-\addcontentsline{toc}{subsection}{Question 1: Uncertainty Quantification
-in Preference Learning (40 points)}
-
-In this question, we will explore Bayesian approaches to logistic
-regression in the context of preference learning using the Bradley-Terry
-model. We will compare different models and inference methods, including
-parametric linear models estimated using Metropolis-Hastings, parametric
-neural network models estimated using Hamiltonian Monte Carlo, and
-non-parametric models with Gaussian Processes. Finally, we will assess
-the uncertainty quantification in these models using the Expected
-Calibration Error (ECE).
-
-Assume we have a dataset of pairwise preferences
-\(\mathcal{D} = \{(x_i, y_i)\}_{i=1}^N\), where \(x_i \in \mathbb{R}^d\)
-represents the feature difference between two items (i.e.,
-\(x_i = e^{(i)}_1 - e^{(i)}_2\) for embeddings \(e^{(i)}_1\) and
-\(e^{(i)}_2\)), and \(y_i \in \{0, 1\}\) indicates the preference
-(\(y_i = 1\) if item 1 is preferred over item 2 in the \(i\)-th pair).
-
-The likelihood of observing \(y_i\) given \(x_i\) and model parameters
-\(\theta\) is given by the logistic function:
-
-\[P(y_i = 1 | x_i, \theta) = \sigma(x_i^\top \theta) = \frac{1}{1 + e^{-x_i^\top \theta}}.\]
-
-We will adopt a Bayesian approach by placing priors on the model
-parameters and using Markov Chain Monte Carlo (MCMC) methods to estimate
-the posterior distributions.
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\item
-  \textbf{Uncertainty Quantification and Expected Calibration Error (11
-  points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Written, 2 points)}. Spend some time reading
-    \url{https://tinyurl.com/m77mk9c}. Explain what the Expected
-    Calibration Error (ECE) measures and why it is important for
-    assessing uncertainty quantification in probabilistic models.
-  \item
-    \textbf{(Coding, 6 points)}. In
-    \texttt{uncertainty\_quantification/ece.py}, implement the ECE using
-    the formula
-    \[\text{ECE} = \sum_{k=1}^K \frac{n_k}{N} \left| \text{acc}(B_k) - \text{conf}(B_k) \right|,\]
-    where \(n_k\) is the number of samples in bin \(B_k\), \(N\) is the
-    total number of samples, \(\text{acc}(B_k)\) is the accuracy in bin
-    \(B_k\), and \(\text{conf}(B_k)\) is the average confidence in bin
-    \(B_k\).
-  \item
-    \textbf{(Written, 3 points)}. After doing parts (b), (c), and (d),
-    compare the ECE scores and reliability diagrams of the 3 models.
-    Which model(s) provide the best uncertainty quantification? Discuss
-    possible reasons for the observed differences.
-  \end{enumerate}
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-
-\KeywordTok{def}\NormalTok{ expected\_calibration\_error(probs, labels, model\_name, n\_bins}\OperatorTok{=}\DecValTok{20}\NormalTok{, n\_ticks}\OperatorTok{=}\DecValTok{10}\NormalTok{, plot}\OperatorTok{=}\VariableTok{True}\NormalTok{):}
-    \CommentTok{"""}
-\CommentTok{    Computes the Expected Calibration Error (ECE) for a model and plots a refined reliability diagram}
-\CommentTok{    with confidence histogram and additional calibration statistics.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} probs (np.array): Array of predicted probabilities for the positive class (for binary classification).}
-\CommentTok{    {-} labels (np.array): Array of true labels (0 or 1).}
-\CommentTok{    {-} model\_name (str): Name of the model for labeling the plot.}
-\CommentTok{    {-} n\_bins (int): Number of bins to divide the probability interval [0,1] into.}
-\CommentTok{    {-} n\_ticks (int): Number of ticks to show along the x{-}axis.}
-\CommentTok{    {-} plot (bool): If True, generates the reliability plot; otherwise, only computes ECE.}
-
-\CommentTok{    Returns:}
-\CommentTok{    {-} float: Computed ECE value.}
-\CommentTok{    """}
-    
-    \CommentTok{\# Ensure probabilities are in the range [0, 1]}
-    \ControlFlowTok{assert}\NormalTok{ np.}\BuiltInTok{all}\NormalTok{((probs }\OperatorTok{\textgreater{}=} \DecValTok{0}\NormalTok{) }\OperatorTok{\&}\NormalTok{ (probs }\OperatorTok{\textless{}=} \DecValTok{1}\NormalTok{)), }\StringTok{"Probabilities must be in the range [0, 1]"}
-    
-    \CommentTok{\# Initialize bin edges, centers, and storage for accuracy, confidence, and counts}
-\NormalTok{    bin\_edges }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, n\_bins }\OperatorTok{+} \DecValTok{1}\NormalTok{)}
-\NormalTok{    bin\_centers }\OperatorTok{=}\NormalTok{ (bin\_edges[:}\OperatorTok{{-}}\DecValTok{1}\NormalTok{] }\OperatorTok{+}\NormalTok{ bin\_edges[}\DecValTok{1}\NormalTok{:]) }\OperatorTok{/} \DecValTok{2}
-\NormalTok{    bar\_width }\OperatorTok{=} \FloatTok{1.0} \OperatorTok{/}\NormalTok{ n\_bins}
-
-\NormalTok{    accs }\OperatorTok{=}\NormalTok{ np.zeros(n\_bins)}
-\NormalTok{    confs }\OperatorTok{=}\NormalTok{ np.zeros(n\_bins)}
-\NormalTok{    bin\_counts }\OperatorTok{=}\NormalTok{ np.zeros(n\_bins)}
-
-    \CommentTok{\# Populate bin statistics: accuracy, confidence, and count}
-    \CommentTok{\# YOUR CODE HERE (\textasciitilde{}7 lines)}
-    \CommentTok{\# Loop over each bin and:}
-    \CommentTok{\# {-} Find indices of probabilities that fall within the bin.}
-    \CommentTok{\# {-} Count the number of items in the bin.}
-    \CommentTok{\# {-} Calculate the accuracy (average of true labels) within the bin.}
-    \CommentTok{\# {-} Calculate the confidence (average of predicted probabilities) within the bin.}
-    \ControlFlowTok{pass} 
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-    
-    \CommentTok{\# Compute ECE: weighted average of |accuracy {-} confidence| across bins}
-    \CommentTok{\# YOUR CODE HERE (1 line)}
-    \CommentTok{\# {-} Use the bin counts to calculate a weighted average of the differences between accuracy and confidence.}
-\NormalTok{    ece\_value }\OperatorTok{=} \VariableTok{None}
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-    
-    \CommentTok{\# Return only ECE if plot is not required}
-    \ControlFlowTok{if} \KeywordTok{not}\NormalTok{ plot:}
-        \ControlFlowTok{return}\NormalTok{ ece\_value}
-
-    \CommentTok{\# Compute average confidence and accuracy for reference lines}
-\NormalTok{    avg\_confidence }\OperatorTok{=}\NormalTok{ np.mean(probs)}
-\NormalTok{    avg\_accuracy }\OperatorTok{=}\NormalTok{ np.mean(labels)}
-    
-    \CommentTok{\# Create reliability diagram and histogram}
-\NormalTok{    fig, (ax1, ax2) }\OperatorTok{=}\NormalTok{ plt.subplots(}\DecValTok{2}\NormalTok{, }\DecValTok{1}\NormalTok{, gridspec\_kw}\OperatorTok{=}\NormalTok{\{}\StringTok{\textquotesingle{}height\_ratios\textquotesingle{}}\NormalTok{: [}\DecValTok{3}\NormalTok{, }\DecValTok{1}\NormalTok{]\}, figsize}\OperatorTok{=}\NormalTok{(}\DecValTok{8}\NormalTok{, }\DecValTok{10}\NormalTok{))}
-    
-    \CommentTok{\# Reliability diagram (top plot)}
-\NormalTok{    ax1.plot([}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{], [}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{], }\StringTok{\textquotesingle{}k{-}{-}\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Perfect Calibration\textquotesingle{}}\NormalTok{)}
-    \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(n\_bins):}
-        \CommentTok{\# Draw the gap bar starting from the diagonal line (perfect calibration)}
-\NormalTok{        ax1.bar(bin\_centers[i], }\BuiltInTok{abs}\NormalTok{(accs[i] }\OperatorTok{{-}}\NormalTok{ confs[i]), width}\OperatorTok{=}\NormalTok{bar\_width, bottom}\OperatorTok{=}\BuiltInTok{min}\NormalTok{(accs[i], confs[i]), }
-\NormalTok{                color}\OperatorTok{=}\StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.3}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Accuracy{-}Confidence Gap\textquotesingle{}} \ControlFlowTok{if}\NormalTok{ i }\OperatorTok{==} \DecValTok{0} \ControlFlowTok{else} \StringTok{""}\NormalTok{)}
-        \CommentTok{\# Draw the accuracy bar as a small black line on top of the gap bar}
-\NormalTok{        ax1.plot([bin\_centers[i] }\OperatorTok{{-}}\NormalTok{ bar\_width }\OperatorTok{/} \DecValTok{2}\NormalTok{, bin\_centers[i] }\OperatorTok{+}\NormalTok{ bar\_width }\OperatorTok{/} \DecValTok{2}\NormalTok{], }
-\NormalTok{                 [accs[i], accs[i]], color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{2}\NormalTok{)}
-
-    \CommentTok{\# Add a black line as a sample for accuracy in the legend}
-\NormalTok{    ax1.plot([], [], color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{2}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Accuracy Marker\textquotesingle{}}\NormalTok{)}
-
-\NormalTok{    ax1.set\_xlim(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{)}
-\NormalTok{    ax1.set\_ylim(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{)}
-\NormalTok{    ax1.set\_ylabel(}\StringTok{\textquotesingle{}Accuracy\textquotesingle{}}\NormalTok{)}
-\NormalTok{    ax1.set\_title(}\SpecialStringTok{f\textquotesingle{}}\SpecialCharTok{\{}\NormalTok{model\_name}\SpecialCharTok{\}}\CharTok{\textbackslash{}n}\SpecialStringTok{ECE=}\SpecialCharTok{\{}\NormalTok{ece\_value}\SpecialCharTok{:.2f\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-\NormalTok{    ax1.legend()}
-
-    \CommentTok{\# Set tick marks based on \textasciigrave{}n\_ticks\textasciigrave{} evenly spaced along the x{-}axis}
-\NormalTok{    tick\_positions }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, n\_ticks }\OperatorTok{+} \DecValTok{1}\NormalTok{)}
-\NormalTok{    ax1.set\_xticks(tick\_positions)}
-\NormalTok{    ax2.set\_xticks(tick\_positions)}
-\NormalTok{    ax1.set\_xticklabels([}\SpecialStringTok{f\textquotesingle{}}\SpecialCharTok{\{}\NormalTok{x}\SpecialCharTok{:.2f\}}\SpecialStringTok{\textquotesingle{}} \ControlFlowTok{for}\NormalTok{ x }\KeywordTok{in}\NormalTok{ tick\_positions])}
-\NormalTok{    ax2.set\_xticklabels([}\SpecialStringTok{f\textquotesingle{}}\SpecialCharTok{\{}\NormalTok{x}\SpecialCharTok{:.2f\}}\SpecialStringTok{\textquotesingle{}} \ControlFlowTok{for}\NormalTok{ x }\KeywordTok{in}\NormalTok{ tick\_positions])}
-
-    \CommentTok{\# Confidence histogram with average markers}
-\NormalTok{    ax2.bar(bin\_centers, bin\_counts, width}\OperatorTok{=}\NormalTok{bar\_width, color}\OperatorTok{=}\StringTok{\textquotesingle{}blue\textquotesingle{}}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.6}\NormalTok{)}
-\NormalTok{    ax2.axvline(x}\OperatorTok{=}\NormalTok{avg\_confidence, color}\OperatorTok{=}\StringTok{\textquotesingle{}gray\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}{-}{-}\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{2}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Avg. confidence\textquotesingle{}}\NormalTok{)}
-\NormalTok{    ax2.axvline(x}\OperatorTok{=}\NormalTok{avg\_accuracy, color}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}{-}\textquotesingle{}}\NormalTok{, linewidth}\OperatorTok{=}\DecValTok{2}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Avg. accuracy\textquotesingle{}}\NormalTok{)}
-\NormalTok{    ax2.set\_xlim(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{)}
-\NormalTok{    ax2.set\_xlabel(}\StringTok{\textquotesingle{}Confidence\textquotesingle{}}\NormalTok{)}
-\NormalTok{    ax2.set\_ylabel(}\StringTok{\textquotesingle{}Count\textquotesingle{}}\NormalTok{)}
-\NormalTok{    ax2.legend()}
-
-\NormalTok{    plt.tight\_layout()}
-\NormalTok{    plt.show()}
-    
-    \ControlFlowTok{return}\NormalTok{ ece\_value}
-
-\ControlFlowTok{if} \VariableTok{\_\_name\_\_} \OperatorTok{==} \StringTok{"\_\_main\_\_"}\NormalTok{:}
-    \CommentTok{\# Test with random probabilities and labels}
-\NormalTok{    probs }\OperatorTok{=}\NormalTok{ np.random.rand(}\DecValTok{10000}\NormalTok{)  }\CommentTok{\# Random probabilities between 0 and 1}
-\NormalTok{    labels }\OperatorTok{=}\NormalTok{ np.random.binomial(}\DecValTok{1}\NormalTok{, (probs }\OperatorTok{+} \DecValTok{1}\NormalTok{) }\OperatorTok{/} \DecValTok{2}\NormalTok{)}
-
-    \CommentTok{\# Run the function and display the result}
-\NormalTok{    ece\_value }\OperatorTok{=}\NormalTok{ expected\_calibration\_error(probs, labels, }\StringTok{"Test Model"}\NormalTok{, plot}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
-    \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f"ECE Value: }\SpecialCharTok{\{}\NormalTok{ece\_value}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\setcounter{enumi}{1}
-\item
-  \textbf{Parametric Linear Model Estimated Using Metropolis-Hastings
-  (11 points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Written, 3 points)}. Assume a prior on \(\theta\) such that
-    \(\theta \sim \mathcal{N}(0, \sigma^2 I)\), where \(\sigma^2\) is
-    the variance and \(I\) is the identity matrix. Derive the expression
-    for the posterior distribution \(P(\theta | \mathcal{D})\) up to a
-    normalization constant.
-  \item
-    \textbf{(Coding, 6 points)}. Implement the Metropolis-Hastings
-    algorithm to sample from the posterior distribution of \(\theta\) in
-    \texttt{uncertainty\_quantification/metropolis.py}.
-  \item
-    \textbf{(Written, 2 points)}. Discuss how you chose the proposal
-    variance \(\tau^2\) and the number of iterations \(T\) and
-    \(T_{\text{burn-in}}\). How did these choices affect the convergence
-    and mixing of your MCMC chain?
-  \end{enumerate}
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ torch}
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-\ImportTok{from}\NormalTok{ tqdm }\ImportTok{import}\NormalTok{ tqdm}
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-\ImportTok{from}\NormalTok{ ece }\ImportTok{import}\NormalTok{ expected\_calibration\_error}
-
-\CommentTok{\# Load training and testing data}
-\NormalTok{x\_train }\OperatorTok{=}\NormalTok{ torch.tensor(np.load(}\StringTok{\textquotesingle{}../data/differences\_train.npy\textquotesingle{}}\NormalTok{))}
-\NormalTok{x\_test }\OperatorTok{=}\NormalTok{ torch.tensor(np.load(}\StringTok{\textquotesingle{}../data/differences\_test.npy\textquotesingle{}}\NormalTok{))}
-\NormalTok{y\_train }\OperatorTok{=}\NormalTok{ torch.tensor(np.load(}\StringTok{\textquotesingle{}../data/labels\_train.npy\textquotesingle{}}\NormalTok{))}
-\NormalTok{y\_test }\OperatorTok{=}\NormalTok{ torch.tensor(np.load(}\StringTok{\textquotesingle{}../data/labels\_test.npy\textquotesingle{}}\NormalTok{))}
-
-\CommentTok{\# Likelihood function for logistic regression (per data point)}
-\KeywordTok{def}\NormalTok{ likelihood(theta, x, y):}
-    \CommentTok{"""}
-\CommentTok{    Computes the likelihood of the data given the logistic regression parameters.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} theta (torch.Tensor): Model parameters.}
-\CommentTok{    {-} x (torch.Tensor): Input data.}
-\CommentTok{    {-} y (torch.Tensor): True labels.}
-
-\CommentTok{    Returns:}
-\CommentTok{    {-} torch.Tensor: Likelihood values for each data point.}
-\CommentTok{    """}
-    \CommentTok{\# YOUR CODE HERE (\textasciitilde{}3 lines)}
-    \CommentTok{\# Calculate logits as the linear combination of inputs and parameters.}
-    \CommentTok{\# Use the sigmoid function to compute the probability of the positive class.}
-    \ControlFlowTok{pass}
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-\CommentTok{\# Prior probability (theta \textasciitilde{} N(0, sigma\^{}2 I)) {-} only depends on theta, not per sample}
-\KeywordTok{def}\NormalTok{ prior(theta, sigma):}
-    \CommentTok{"""}
-\CommentTok{    Computes the prior probability of theta under a Gaussian distribution with variance sigma\^{}2.}
-
-\CommentTok{    Args:}
-\CommentTok{    {-} theta (torch.Tensor): Model parameters.}
-\CommentTok{    {-} sigma (float): Standard deviation of the prior distribution.}
-
-\CommentTok{    Returns:}
-\CommentTok{    {-} torch.Tensor: Prior probability value.}
-\CommentTok{    """}
-    \CommentTok{\# YOUR CODE HERE (\textasciitilde{}2 lines)}
-    \CommentTok{\# Implement Gaussian prior with zero mean and covariance sigma\^{}2 I.}
-    \CommentTok{\# Note that the normalization constant is not needed for Metropolis{-}Hastings.}
-    \ControlFlowTok{pass}
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-\CommentTok{\# Metropolis{-}Hastings sampler}
-\KeywordTok{def}\NormalTok{ metropolis\_hastings(x, y, num\_samples, burn\_in, tau, sigma):}
-    \CommentTok{"""}
-\CommentTok{    Runs the Metropolis{-}Hastings algorithm to sample from the posterior distribution.}
-
-\CommentTok{    Args:}
-\CommentTok{    {-} x (torch.Tensor): Input data.}
-\CommentTok{    {-} y (torch.Tensor): True labels.}
-\CommentTok{    {-} num\_samples (int): Total number of samples to draw.}
-\CommentTok{    {-} burn\_in (int): Number of initial samples to discard.}
-\CommentTok{    {-} tau (float): Proposal standard deviation.}
-\CommentTok{    {-} sigma (float): Prior standard deviation.}
-
-\CommentTok{    Returns:}
-\CommentTok{    {-} torch.Tensor: Collected samples post burn{-}in.}
-\CommentTok{    {-} float: Acceptance ratio.}
-\CommentTok{    """}
-    \CommentTok{\# Initialize theta (starting point of the chain) and containers for samples and acceptance count}
-\NormalTok{    theta }\OperatorTok{=}\NormalTok{ torch.zeros(x.shape[}\DecValTok{1}\NormalTok{])}
-\NormalTok{    samples }\OperatorTok{=}\NormalTok{ []}
-\NormalTok{    acceptances }\OperatorTok{=} \DecValTok{0}
-    
-    \CommentTok{\# Run the Metropolis{-}Hastings algorithm}
-    \ControlFlowTok{for}\NormalTok{ t }\KeywordTok{in}\NormalTok{ tqdm(}\BuiltInTok{range}\NormalTok{(num\_samples), desc}\OperatorTok{=}\StringTok{"MCMC Iteration"}\NormalTok{):}
-        \CommentTok{\# YOUR CODE HERE (\textasciitilde{}12{-}16 lines)}
-        \CommentTok{\# 1. Propose new theta from the proposal distribution (e.g., Gaussian around current theta).}
-        \CommentTok{\# 2. Compute prior and likelihood for current and proposed theta}
-        \CommentTok{\# 3. Calculate the acceptance ratio as the product of likelihood and prior ratios.}
-        \CommentTok{\# 4. Accept or reject the proposal based on the acceptance probability.}
-        \CommentTok{\# 5. Store the sample after the burn{-}in period}
-        \ControlFlowTok{pass}
-        \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-    
-    \ControlFlowTok{return}\NormalTok{ torch.stack(samples), acceptances }\OperatorTok{/}\NormalTok{ num\_samples}
-
-\CommentTok{\# Run Metropolis{-}Hastings on training data}
-\NormalTok{num\_samples }\OperatorTok{=} \DecValTok{10000}
-\NormalTok{burn\_in }\OperatorTok{=} \DecValTok{1000}
-\NormalTok{tau }\OperatorTok{=} \FloatTok{0.01}  \CommentTok{\# Proposal standard deviation (tune this for convergence)}
-\NormalTok{sigma }\OperatorTok{=} \FloatTok{2.0}  \CommentTok{\# Prior standard deviation}
-
-\CommentTok{\# Collect samples and compute acceptance ratio}
-\NormalTok{samples, acceptance\_ratio }\OperatorTok{=}\NormalTok{ metropolis\_hastings(x\_train, y\_train, num\_samples}\OperatorTok{=}\NormalTok{num\_samples, burn\_in}\OperatorTok{=}\NormalTok{burn\_in, tau}\OperatorTok{=}\NormalTok{tau, sigma}\OperatorTok{=}\NormalTok{sigma)}
-\NormalTok{averaged\_weights }\OperatorTok{=}\NormalTok{ samples.mean(axis}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
-\BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Predicted weights: }\SpecialCharTok{\{}\NormalTok{averaged\_weights}\SpecialCharTok{\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-\BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Acceptance Ratio: }\SpecialCharTok{\{}\NormalTok{acceptance\_ratio}\SpecialCharTok{\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-
-\CommentTok{\# Evaluate accuracy on training set}
-\NormalTok{train\_predictions }\OperatorTok{=}\NormalTok{ (x\_train }\OperatorTok{@}\NormalTok{ averaged\_weights }\OperatorTok{\textgreater{}} \DecValTok{0}\NormalTok{).}\BuiltInTok{float}\NormalTok{()}
-\NormalTok{train\_acc }\OperatorTok{=}\NormalTok{ (train\_predictions }\OperatorTok{==}\NormalTok{ y\_train).}\BuiltInTok{float}\NormalTok{().mean()}
-\BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Train Accuracy: }\SpecialCharTok{\{}\NormalTok{train\_acc}\SpecialCharTok{\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-
-\CommentTok{\# Evaluate accuracy on testing set}
-\NormalTok{test\_predictions }\OperatorTok{=}\NormalTok{ (x\_test }\OperatorTok{@}\NormalTok{ averaged\_weights }\OperatorTok{\textgreater{}} \DecValTok{0}\NormalTok{).}\BuiltInTok{float}\NormalTok{()}
-\NormalTok{acc }\OperatorTok{=}\NormalTok{ (test\_predictions }\OperatorTok{==}\NormalTok{ y\_test).}\BuiltInTok{float}\NormalTok{().mean()}
-\BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Test Accuracy: }\SpecialCharTok{\{}\NormalTok{acc}\SpecialCharTok{\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-
-\CommentTok{\# Compute expected calibration error on testing set}
-\NormalTok{expected\_calibration\_error(torch.sigmoid(x\_test }\OperatorTok{@}\NormalTok{ averaged\_weights).numpy(), y\_test.numpy(), model\_name}\OperatorTok{=}\StringTok{"Metropolis{-}Hastings"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\setcounter{enumi}{2}
-\item
-  \textbf{Parametric Neural Network Model Estimated Using Hamiltonian
-  Monte Carlo (11 points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Written, 2 points)}. Explain why Hamiltonian Monte Carlo
-    (HMC) is suitable for sampling from the posterior distribution of
-    neural network parameters compared to Metropolis-Hastings.
-  \item
-    \textbf{(Coding, 7 points)}. Implement HMC to sample from the
-    posterior distribution of the parameters \(\theta\) of a neural
-    network \(f(x; \theta)\) used for preference prediction in
-    \texttt{uncertainty\_quantification/hmc\_nn.py}. This requires a
-    GPU and takes around 5 minutes to run.
-  \item
-    \textbf{(Written, 2 points)}. Briefly describe the performance of
-    the HMC and Metropolis-Hastings models and provide the accuracy
-    numbers.
-  \end{enumerate}
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\CommentTok{\# Use a GPU when running this file! JAX should automatically default to GPU.}
-\ImportTok{import}\NormalTok{ jax.numpy }\ImportTok{as}\NormalTok{ np}
-\ImportTok{import}\NormalTok{ numpyro}
-\ImportTok{import}\NormalTok{ numpyro.distributions }\ImportTok{as}\NormalTok{ dist}
-\ImportTok{from}\NormalTok{ numpyro.infer }\ImportTok{import}\NormalTok{ MCMC, NUTS}
-\ImportTok{from}\NormalTok{ jax }\ImportTok{import}\NormalTok{ random}
-\ImportTok{from}\NormalTok{ ece }\ImportTok{import}\NormalTok{ expected\_calibration\_error}
-
-\CommentTok{\# DO NOT CHANGE! This function can be ignored.}
-\KeywordTok{def}\NormalTok{ set\_numpyro(new\_sampler):}
-\NormalTok{    numpyro.sample }\OperatorTok{=}\NormalTok{ new\_sampler}
-
-\CommentTok{\# Define the neural network model with one hidden layer}
-\KeywordTok{def}\NormalTok{ nn\_model(x\_data, y\_data, hidden\_dim}\OperatorTok{=}\DecValTok{10}\NormalTok{):}
-    \CommentTok{"""}
-\CommentTok{    Defines a Bayesian neural network with one hidden layer.}
-
-\CommentTok{    Args:}
-\CommentTok{    {-} x\_data (np.array): Input data.}
-\CommentTok{    {-} y\_data (np.array): Target labels.}
-\CommentTok{    {-} hidden\_dim (int): Number of units in the hidden layer.}
-
-\CommentTok{    Returns:}
-\CommentTok{    {-} hidden\_activations: Activations from the hidden layer.}
-\CommentTok{    {-} logits: Logits for the output layer.}
-\CommentTok{    """}
-\NormalTok{    input\_dim }\OperatorTok{=}\NormalTok{ x\_data.shape[}\DecValTok{1}\NormalTok{]}
-    
-    \CommentTok{\# Prior over the weights and biases for the hidden layer}
-\NormalTok{    w\_hidden }\OperatorTok{=}\NormalTok{ numpyro.sample(}\StringTok{\textquotesingle{}w\_hidden\textquotesingle{}}\NormalTok{, dist.Normal(np.zeros((input\_dim, hidden\_dim)), np.ones((input\_dim, hidden\_dim))))}
-\NormalTok{    b\_hidden }\OperatorTok{=}\NormalTok{ numpyro.sample(}\StringTok{\textquotesingle{}b\_hidden\textquotesingle{}}\NormalTok{, dist.Normal(np.zeros(hidden\_dim), np.ones(hidden\_dim)))}
-    
-    \CommentTok{\# Compute the hidden layer activations using ReLU}
-    \CommentTok{\# YOUR CODE HERE (\textasciitilde{}1 line)}
-    \CommentTok{\# Implement the hidden layer computation, applying a ReLU activation.}
-    \ControlFlowTok{pass}
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE }
-    
-    \CommentTok{\# Prior over the weights and biases for the output layer}
-\NormalTok{    w\_output }\OperatorTok{=}\NormalTok{ numpyro.sample(}\StringTok{\textquotesingle{}w\_output\textquotesingle{}}\NormalTok{, dist.Normal(np.zeros(hidden\_dim), np.ones(hidden\_dim)))}
-\NormalTok{    b\_output }\OperatorTok{=}\NormalTok{ numpyro.sample(}\StringTok{\textquotesingle{}b\_output\textquotesingle{}}\NormalTok{, dist.Normal(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{))}
-    
-    \CommentTok{\# Compute the logits for the output layer}
-    \CommentTok{\# YOUR CODE HERE (\textasciitilde{}1 line)}
-    \CommentTok{\# Calculate the logits as the linear combination of hidden activations and output layer weights.}
-    \ControlFlowTok{pass}
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-    \CommentTok{\# Likelihood (Bernoulli likelihood with logits)}
-\NormalTok{    numpyro.sample(}\StringTok{\textquotesingle{}obs\textquotesingle{}}\NormalTok{, dist.Bernoulli(logits}\OperatorTok{=}\NormalTok{logits), obs}\OperatorTok{=}\NormalTok{y\_data)}
-    \ControlFlowTok{return}\NormalTok{ hidden\_activations, logits}
-
-\KeywordTok{def}\NormalTok{ sigmoid(x):}
-    \CommentTok{"""Helper function to compute the sigmoid of x."""}
-    \ControlFlowTok{return} \DecValTok{1} \OperatorTok{/}\NormalTok{ (}\DecValTok{1} \OperatorTok{+}\NormalTok{ np.exp(}\OperatorTok{{-}}\NormalTok{x))}
-
-\ControlFlowTok{if} \VariableTok{\_\_name\_\_} \OperatorTok{==} \StringTok{"\_\_main\_\_"}\NormalTok{:}
-    \CommentTok{\# Load training and testing data}
-\NormalTok{    x\_train }\OperatorTok{=}\NormalTok{ np.load(}\StringTok{\textquotesingle{}../data/differences\_train.npy\textquotesingle{}}\NormalTok{)}
-\NormalTok{    x\_test }\OperatorTok{=}\NormalTok{ np.load(}\StringTok{\textquotesingle{}../data/differences\_test.npy\textquotesingle{}}\NormalTok{)}
-\NormalTok{    y\_train }\OperatorTok{=}\NormalTok{ np.load(}\StringTok{\textquotesingle{}../data/labels\_train.npy\textquotesingle{}}\NormalTok{)}
-\NormalTok{    y\_test }\OperatorTok{=}\NormalTok{ np.load(}\StringTok{\textquotesingle{}../data/labels\_test.npy\textquotesingle{}}\NormalTok{)}
-
-    \CommentTok{\# HMC Sampler Configuration}
-\NormalTok{    hmc\_kernel }\OperatorTok{=}\NormalTok{ NUTS(nn\_model)}
-
-    \CommentTok{\# Running HMC with the MCMC interface in NumPyro}
-\NormalTok{    num\_samples }\OperatorTok{=} \DecValTok{200}  \CommentTok{\# Number of samples}
-\NormalTok{    warmup\_steps }\OperatorTok{=} \DecValTok{100}  \CommentTok{\# Number of burn{-}in steps}
-\NormalTok{    rng\_key }\OperatorTok{=}\NormalTok{ random.PRNGKey(}\DecValTok{0}\NormalTok{)  }\CommentTok{\# Random seed}
-
-    \CommentTok{\# MCMC object with HMC kernel}
-\NormalTok{    mcmc }\OperatorTok{=}\NormalTok{ MCMC(hmc\_kernel, num\_samples}\OperatorTok{=}\NormalTok{num\_samples, num\_warmup}\OperatorTok{=}\NormalTok{warmup\_steps)}
-\NormalTok{    mcmc.run(rng\_key, x\_train, y\_train)}
-
-    \CommentTok{\# Get the sampled weights (theta samples)}
-\NormalTok{    samples }\OperatorTok{=}\NormalTok{ mcmc.get\_samples()}
-
-    \CommentTok{\# Extract the weight samples}
-\NormalTok{    w\_hidden\_samples }\OperatorTok{=}\NormalTok{ samples[}\StringTok{\textquotesingle{}w\_hidden\textquotesingle{}}\NormalTok{]}
-\NormalTok{    b\_hidden\_samples }\OperatorTok{=}\NormalTok{ samples[}\StringTok{\textquotesingle{}b\_hidden\textquotesingle{}}\NormalTok{]}
-\NormalTok{    w\_output\_samples }\OperatorTok{=}\NormalTok{ samples[}\StringTok{\textquotesingle{}w\_output\textquotesingle{}}\NormalTok{]}
-\NormalTok{    b\_output\_samples }\OperatorTok{=}\NormalTok{ samples[}\StringTok{\textquotesingle{}b\_output\textquotesingle{}}\NormalTok{]}
-
-    \CommentTok{\# Compute the averaged weights and biases}
-\NormalTok{    w\_hidden\_mean }\OperatorTok{=}\NormalTok{ np.mean(w\_hidden\_samples, axis}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
-\NormalTok{    b\_hidden\_mean }\OperatorTok{=}\NormalTok{ np.mean(b\_hidden\_samples, axis}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
-\NormalTok{    w\_output\_mean }\OperatorTok{=}\NormalTok{ np.mean(w\_output\_samples, axis}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
-\NormalTok{    b\_output\_mean }\OperatorTok{=}\NormalTok{ np.mean(b\_output\_samples, axis}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
-
-    \CommentTok{\# Forward pass through the network for testing set}
-    \CommentTok{\# YOUR CODE HERE (\textasciitilde{}2 lines)}
-    \CommentTok{\# Compute hidden layer activations and logits for the test set using the mean weights and biases.}
-    \ControlFlowTok{pass}
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-\NormalTok{    test\_predictions }\OperatorTok{=}\NormalTok{ test\_logits }\OperatorTok{\textgreater{}} \DecValTok{0}
-\NormalTok{    test\_accuracy }\OperatorTok{=}\NormalTok{ np.mean(test\_predictions }\OperatorTok{==}\NormalTok{ y\_test)}
-    \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Test Accuracy: }\SpecialCharTok{\{}\NormalTok{test\_accuracy}\SpecialCharTok{\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-
-    \CommentTok{\# Forward pass through the network for training set}
-    \CommentTok{\# YOUR CODE HERE (\textasciitilde{}2 lines)}
-    \CommentTok{\# Compute hidden layer activations and logits for the training set.}
-    \ControlFlowTok{pass}
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-\NormalTok{    train\_predictions }\OperatorTok{=}\NormalTok{ train\_logits }\OperatorTok{\textgreater{}} \DecValTok{0}
-\NormalTok{    train\_accuracy }\OperatorTok{=}\NormalTok{ np.mean(train\_predictions }\OperatorTok{==}\NormalTok{ y\_train)}
-    \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Train Accuracy: }\SpecialCharTok{\{}\NormalTok{train\_accuracy}\SpecialCharTok{\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-
-    \CommentTok{\# Compute expected calibration error on testing set}
-\NormalTok{    expected\_calibration\_error(sigmoid(test\_logits), y\_test, model\_name}\OperatorTok{=}\StringTok{"HMC"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\setcounter{enumi}{3}
-\item
-  \textbf{Non-Parametric Model with Gaussian Process (GP) (7 points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Written, 2 points)}. Describe how a Gaussian Process can be
-    used for preference learning in this context (i.e., describe how the
-    latent function is used for classification).
-  \item
-    \textbf{(Coding, 2 points)}. Run the GP classification for
-    preference learning code in\\
-    \texttt{uncertainty\_quantification/gaussian\_process.py} and
-    provide the accuracy numbers. This can only be run on a CPU and may
-    take around 10 minutes to complete.
-  \item
-    \textbf{(Written, 3 points)}. Discuss the computational complexity of
-    the GP model compared to the parametric models. What are the
-    advantages and disadvantages of using a GP in this setting?
-  \end{enumerate}
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-\ImportTok{from}\NormalTok{ sklearn.gaussian\_process }\ImportTok{import}\NormalTok{ GaussianProcessClassifier}
-\ImportTok{from}\NormalTok{ sklearn.gaussian\_process.kernels }\ImportTok{import}\NormalTok{ RBF}
-\ImportTok{from}\NormalTok{ sklearn.metrics }\ImportTok{import}\NormalTok{ accuracy\_score}
-\ImportTok{from}\NormalTok{ ece }\ImportTok{import}\NormalTok{ expected\_calibration\_error}
-
-\NormalTok{x\_train }\OperatorTok{=}\NormalTok{ np.load(}\StringTok{\textquotesingle{}../data/differences\_train.npy\textquotesingle{}}\NormalTok{)}
-\NormalTok{x\_test }\OperatorTok{=}\NormalTok{ np.load(}\StringTok{\textquotesingle{}../data/differences\_test.npy\textquotesingle{}}\NormalTok{)}
-\NormalTok{y\_train }\OperatorTok{=}\NormalTok{ np.load(}\StringTok{\textquotesingle{}../data/labels\_train.npy\textquotesingle{}}\NormalTok{)}
-\NormalTok{y\_test }\OperatorTok{=}\NormalTok{ np.load(}\StringTok{\textquotesingle{}../data/labels\_test.npy\textquotesingle{}}\NormalTok{)}
-
-\NormalTok{kernel }\OperatorTok{=} \FloatTok{1.0} \OperatorTok{*}\NormalTok{ RBF(length\_scale}\OperatorTok{=}\FloatTok{1.0}\NormalTok{)}
-\NormalTok{gp\_classifier }\OperatorTok{=}\NormalTok{ GaussianProcessClassifier(kernel}\OperatorTok{=}\NormalTok{kernel, random\_state}\OperatorTok{=}\DecValTok{42}\NormalTok{, n\_jobs}\OperatorTok{={-}}\DecValTok{1}\NormalTok{)}
-\NormalTok{gp\_classifier.fit(x\_train, y\_train)}
-
-\NormalTok{y\_pred\_probs }\OperatorTok{=}\NormalTok{ gp\_classifier.predict\_proba(x\_test)[:, }\DecValTok{1}\NormalTok{]}
-\NormalTok{y\_pred\_labels }\OperatorTok{=}\NormalTok{ (y\_pred\_probs }\OperatorTok{\textgreater{}} \FloatTok{0.5}\NormalTok{)}
-
-\NormalTok{train\_accuracy }\OperatorTok{=}\NormalTok{ accuracy\_score(y\_train, gp\_classifier.predict(x\_train))}
-\BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Train Accuracy: }\SpecialCharTok{\{}\NormalTok{train\_accuracy}\SpecialCharTok{:.4f\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-
-\NormalTok{test\_accuracy }\OperatorTok{=}\NormalTok{ accuracy\_score(y\_test, y\_pred\_labels)}
-\BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Test Accuracy: }\SpecialCharTok{\{}\NormalTok{test\_accuracy}\SpecialCharTok{:.4f\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-
-\NormalTok{expected\_calibration\_error(y\_pred\_probs, y\_test, model\_name}\OperatorTok{=}\StringTok{"Gaussian Process Classifier"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\subsection*{Question 2: Active Learning for Preference Learning (40
-points)}\label{sec-question-2-active-learning-for-preference-learning-40-points}
-\addcontentsline{toc}{subsection}{Question 2: Active Learning for
-Preference Learning (40 points)}
-
-In this question, you will explore active learning strategies for
-preference learning using a linear model. We will use expected
-information gain as the acquisition function to select the most
-informative queries, where each query is a pair of items. Assume that we
-model the preferences using a simple linear model. Given feature vectors
-\(x_1\) and \(x_2\) corresponding to two items, the probability that
-\(x_1\) is preferred over \(x_2\) is modeled using a logistic regression
-model, i.e.,
-
-\[P(x_1 \succ x_2 | \theta) = \sigma(\theta^\top (x_1 - x_2)),\]
-
-where \(\theta \in \mathbb{R}^d\) is the model parameter vector, and
-\(\sigma(z)\) is the sigmoid function
-\(\sigma(z) = \frac{1}{1 + e^{-z}}\). The goal is to sequentially select
-pairs of items to maximize the information gained about \(\theta\)
-through preference queries.
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\item
-  \textbf{Expected Information Gain (15 points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{Derive the Expected Information Gain (Written, 3 points).}
-    Suppose that after observing a preference between two items \(x_1\)
-    and \(x_2\), the posterior distribution over \(\theta\) is updated.
-    The information gain from this observation is the reduction in
-    uncertainty about \(\theta\) measured using the Kullback-Leibler
-    (KL) divergence between the prior and posterior distributions. Given
-    the current posterior distribution \(P(\theta | \mathcal{D})\) and a
-    possible observation \(y \in \{0, 1\}\) (where \(y = 1\) if \(x_1\)
-    is preferred over \(x_2\), and \(y = 0\) otherwise), the expected
-    information gain is: \[\begin{aligned}
-    \mathbb{E}[\text{IG}(x_1, x_2)] = &P(y=1 | x_1, x_2, \theta) D_{\text{KL}}\left( P(\theta | y = 1, \mathcal{D}) \parallel P(\theta | \mathcal{D}) \right) \\+ 
-    &P(y=0 | x_1, x_2, \theta) D_{\text{KL}}\left( P(\theta | y = 0, \mathcal{D}) \parallel P(\theta | \mathcal{D}) \right)
-    \end{aligned}\]
-
-    Derive this expression for the expected information gain of
-    selecting the pair \((x_1, x_2)\) for a preference query. Start by
-    explaining how the KL divergence measures the information gain, and
-    break down the expectation over the possible outcomes of the query.
-  \item
-    \textbf{Simplifying the KL Divergence (Written, 4 points).} Assuming
-    the prior and posterior distributions over \(\theta\) are Gaussian
-    (i.e., \(P(\theta) \sim \mathcal{N}(\mu, \Sigma)\) and
-    \(P(\theta | \mathcal{D}) \sim \mathcal{N}(\mu', \Sigma')\)), show
-    that the KL divergence between the Gaussian posterior and prior
-    simplifies to: \[\begin{aligned}
-        D_{\text{KL}}\left( \mathcal{N}(\mu', \Sigma') \parallel \mathcal{N}(\mu, \Sigma) \right) &= \frac{1}{2} \left( \text{tr}(\Sigma^{-1} \Sigma') + (\mu' - \mu)^\top \Sigma^{-1} (\mu' - \mu)\right.\\
-        &\left.- d + \log\left( \frac{\det(\Sigma)}{\det(\Sigma')} \right) \right).
-        \end{aligned}\]
-  \item
-    \textbf{Approximate Information Gain for a Linear Model (Written, 4
-    points).} In the case of a linear model with Gaussian priors on
-    \(\theta\), assume that the posterior distribution
-    \(P(\theta | \mathcal{D}) \sim \mathcal{N}(\mu, \Sigma)\) is updated
-    using Bayes' rule after each observation. The likelihood of
-    observing a preference \(y\) is logistic, which is not conjugate
-    to the Gaussian prior. However, for the purposes of this question,
-    assume that after each query, the posterior mean \(\mu'\) and
-    covariance \(\Sigma'\) can be updated using an approximation method
-    such as Laplace's approximation.
-
-    Using these assumptions, compute the expected information gain for a
-    specific query \((x_1, x_2)\) in closed form. You may express the
-    information gain in terms of the updated mean \(\mu'\) and
-    covariance \(\Sigma'\) after observing the preference outcome.
-  \item
-    \textbf{Laplace Approximation for Posterior (Written, 4 points).}
-    The Laplace approximation for the posterior is given by
-    \[\begin{aligned}
-    \mu'=\arg \min_\theta -\log P(\theta | \mathcal{D})\\
-    \Sigma'^{-1}=\nabla_\theta\nabla_\theta -\log P(\theta|\mathcal{D})|_{\theta=\mu'}
-    \end{aligned}\] In our scenario with the Bradley-Terry model for
-    likelihood, simplify \(-\log P(\theta | \mathcal{D})\) and its
-    Hessian ignoring the normalization constant.
-  \end{enumerate}
-\item
-  \textbf{Active Learning Algorithm (25 points)} In this section, you
-  will implement an active learning algorithm for selecting the most
-  informative queries using the expected information gain criterion.
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Coding, 4 points).} Implement
-    \texttt{kl\_divergence\_gaussians} in
-    \texttt{active\_learning/main.py}.
-  \item
-    \textbf{(Coding, 4 points).} Following your derived Laplace
-    approximation, implement \texttt{negative\_log\_posterior}.
-  \item
-    \textbf{(Coding, 4 points).} Implement \texttt{compute\_hessian}
-    that is used to obtain the inverse of the covariance matrix.
-  \item
-    \textbf{(Coding, 3 points).} Implement
-    \texttt{expected\_information\_gain}.
-  \item
-    \textbf{(Coding, 4 points).} Finally, implement
-    \texttt{active\_learning}.
-  \item
-    \textbf{(Coding + Written, 6 points).} Plot the \(L^2\) norm of the
-    covariance matrix at each iteration of the active learning loop.
-    Additionally, on the same plot, implement a random baseline and plot
-    its \(L^2\) covariance matrix norm. The random baseline should
-    randomly select a point in the dataset and not use any acquisition
-    function. Interpret your plot and use it to compare the two methods.
-  \end{enumerate}
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ torch}
-\ImportTok{import}\NormalTok{ torch.nn.functional }\ImportTok{as}\NormalTok{ F}
-\ImportTok{from}\NormalTok{ torch.optim }\ImportTok{import}\NormalTok{ Adam}
-\ImportTok{from}\NormalTok{ tqdm }\ImportTok{import}\NormalTok{ tqdm}
-\ImportTok{from}\NormalTok{ sklearn.model\_selection }\ImportTok{import}\NormalTok{ train\_test\_split}
-\ImportTok{from}\NormalTok{ sklearn.datasets }\ImportTok{import}\NormalTok{ make\_classification}
-
-\KeywordTok{class}\NormalTok{ LogisticActiveLearning:}
-    \KeywordTok{def} \FunctionTok{\_\_init\_\_}\NormalTok{(}\VariableTok{self}\NormalTok{, test\_size}\OperatorTok{=}\FloatTok{0.2}\NormalTok{):}
-        \CommentTok{"""}
-\CommentTok{        Initializes LogisticActiveLearning model, sets device, and prepares data.}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} test\_size (float): Proportion of the dataset used for validation.}
-\CommentTok{        """}
-        \CommentTok{\# Make device customizable}
-        \VariableTok{self}\NormalTok{.device }\OperatorTok{=}\NormalTok{ torch.device(}\StringTok{"cpu"}\NormalTok{)}
-\NormalTok{        X, y }\OperatorTok{=}\NormalTok{ make\_classification(n\_samples}\OperatorTok{=}\DecValTok{10000}\NormalTok{, random\_state}\OperatorTok{=}\DecValTok{42}\NormalTok{)}
-
-        \CommentTok{\# Convert data and labels to tensors}
-\NormalTok{        x\_data }\OperatorTok{=}\NormalTok{ torch.tensor(X, dtype}\OperatorTok{=}\NormalTok{torch.float32).to(}\VariableTok{self}\NormalTok{.device)}
-\NormalTok{        y\_data }\OperatorTok{=}\NormalTok{ torch.tensor(y, dtype}\OperatorTok{=}\NormalTok{torch.float32).to(}\VariableTok{self}\NormalTok{.device)}
-        \VariableTok{self}\NormalTok{.N, }\VariableTok{self}\NormalTok{.D }\OperatorTok{=}\NormalTok{ x\_data.shape}
-
-        \CommentTok{\# Split into training and validation sets}
-\NormalTok{        train\_indices, val\_indices }\OperatorTok{=}\NormalTok{ train\_test\_split(}\BuiltInTok{range}\NormalTok{(}\VariableTok{self}\NormalTok{.N), test\_size}\OperatorTok{=}\NormalTok{test\_size, random\_state}\OperatorTok{=}\DecValTok{42}\NormalTok{)}
-        \VariableTok{self}\NormalTok{.x\_train }\OperatorTok{=}\NormalTok{ x\_data[train\_indices]}
-        \VariableTok{self}\NormalTok{.y\_train }\OperatorTok{=}\NormalTok{ y\_data[train\_indices]}
-        \VariableTok{self}\NormalTok{.x\_val }\OperatorTok{=}\NormalTok{ x\_data[val\_indices]}
-        \VariableTok{self}\NormalTok{.y\_val }\OperatorTok{=}\NormalTok{ y\_data[val\_indices]}
-
-        \CommentTok{\# Initialize mean and inverse covariance for the prior}
-        \VariableTok{self}\NormalTok{.weights\_mean }\OperatorTok{=}\NormalTok{ torch.zeros(}\VariableTok{self}\NormalTok{.D, requires\_grad}\OperatorTok{=}\VariableTok{True}\NormalTok{, device}\OperatorTok{=}\VariableTok{self}\NormalTok{.device)}
-        \VariableTok{self}\NormalTok{.weights\_inv\_cov }\OperatorTok{=}\NormalTok{ torch.eye(}\VariableTok{self}\NormalTok{.D).to(}\VariableTok{self}\NormalTok{.device)  }\CommentTok{\# Start with identity inverse covariance}
-
-    \KeywordTok{def}\NormalTok{ negative\_log\_posterior(}\VariableTok{self}\NormalTok{, w, x, y):}
-        \CommentTok{"""}
-\CommentTok{        Computes the negative log{-}posterior (negative log{-}prior + negative log{-}likelihood).}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} w (torch.Tensor): Model weights.}
-\CommentTok{        {-} x (torch.Tensor): Input data point.}
-\CommentTok{        {-} y (torch.Tensor): True label.}
-\CommentTok{        }
-\CommentTok{        Returns:}
-\CommentTok{        {-} torch.Tensor: Negative log{-}posterior value.}
-\CommentTok{        """}
-        \CommentTok{\# YOUR CODE HERE (\textasciitilde{}4{-}6 lines)}
-        \CommentTok{\# Compute log{-}prior term using inverse covariance}
-        \ControlFlowTok{pass}
-        \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-    \KeywordTok{def}\NormalTok{ optimize\_weights(}\VariableTok{self}\NormalTok{, w, x, y, num\_steps}\OperatorTok{=}\DecValTok{50}\NormalTok{, lr}\OperatorTok{=}\FloatTok{1e{-}2}\NormalTok{):}
-        \CommentTok{"""}
-\CommentTok{        Optimizes weights using Adam optimizer.}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} w (torch.Tensor): Initial weights.}
-\CommentTok{        {-} x (torch.Tensor): Input data point.}
-\CommentTok{        {-} y (torch.Tensor): True label.}
-\CommentTok{        {-} num\_steps (int): Number of optimization steps.}
-\CommentTok{        {-} lr (float): Learning rate.}
-\CommentTok{        }
-\CommentTok{        Returns:}
-\CommentTok{        {-} torch.Tensor: Updated weights.}
-\CommentTok{        {-} torch.Tensor: Inverse covariance (Hessian of the negative log{-}posterior at the updated weights).}
-\CommentTok{        """}
-\NormalTok{        optimizer }\OperatorTok{=}\NormalTok{ Adam([w], lr}\OperatorTok{=}\NormalTok{lr)}
-        
-        \ControlFlowTok{for}\NormalTok{ step }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(num\_steps):}
-\NormalTok{            optimizer.zero\_grad()}
-\NormalTok{            loss }\OperatorTok{=} \VariableTok{self}\NormalTok{.negative\_log\_posterior(w, x, y)}
-\NormalTok{            loss.backward()}
-\NormalTok{            optimizer.step()}
-
-        \CommentTok{\# Compute the Hessian of the negative log{-}posterior, serving as inverse covariance}
-\NormalTok{        inv\_cov }\OperatorTok{=} \VariableTok{self}\NormalTok{.compute\_hessian(w.detach(), x, y)}
-        \ControlFlowTok{return}\NormalTok{ w.detach().clone(), inv\_cov}
-
-    \KeywordTok{def}\NormalTok{ compute\_hessian(}\VariableTok{self}\NormalTok{, w, x, y):}
-        \CommentTok{"""}
-\CommentTok{        Computes the Hessian of the negative log{-}posterior, used as the inverse covariance.}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} w (torch.Tensor): Model weights.}
-\CommentTok{        {-} x (torch.Tensor): Input data point.}
-\CommentTok{        {-} y (torch.Tensor): True label.}
-\CommentTok{        }
-\CommentTok{        Returns:}
-\CommentTok{        {-} torch.Tensor: Hessian of the negative log{-}posterior.}
-\CommentTok{        """}
-        \CommentTok{\# YOUR CODE HERE (\textasciitilde{}5{-}8 lines)}
-        \CommentTok{\# Hessian of the prior term}
-        \ControlFlowTok{pass}
-        \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-    \KeywordTok{def}\NormalTok{ acquisition\_fn(}\VariableTok{self}\NormalTok{, x):}
-        \CommentTok{"""}
-\CommentTok{        Computes posterior means and inverse covariances for y=1 and y=0 without modifying original parameters.}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} x (torch.Tensor): Input data point.}
-\CommentTok{        }
-\CommentTok{        Returns:}
-\CommentTok{        {-} dict: Posterior properties for y=1 and y=0 cases.}
-\CommentTok{        """}
-\NormalTok{        weights\_y1 }\OperatorTok{=} \VariableTok{self}\NormalTok{.weights\_mean.clone().detach().requires\_grad\_(}\VariableTok{True}\NormalTok{)}
-\NormalTok{        weights\_y0 }\OperatorTok{=} \VariableTok{self}\NormalTok{.weights\_mean.clone().detach().requires\_grad\_(}\VariableTok{True}\NormalTok{)}
-
-        \CommentTok{\# Optimize weights and get Hessian for both y=1 and y=0 cases}
-\NormalTok{        posterior\_mean\_y1, inv\_cov\_y1 }\OperatorTok{=} \VariableTok{self}\NormalTok{.optimize\_weights(weights\_y1, x, }\DecValTok{1}\NormalTok{, num\_steps}\OperatorTok{=}\DecValTok{50}\NormalTok{)}
-\NormalTok{        posterior\_mean\_y0, inv\_cov\_y0 }\OperatorTok{=} \VariableTok{self}\NormalTok{.optimize\_weights(weights\_y0, x, }\DecValTok{0}\NormalTok{, num\_steps}\OperatorTok{=}\DecValTok{50}\NormalTok{)}
-
-        \CommentTok{\# Calculate probabilities for the acquisition function}
-\NormalTok{        prob\_y1 }\OperatorTok{=}\NormalTok{ torch.sigmoid(torch.dot(}\VariableTok{self}\NormalTok{.weights\_mean.detach(), x))}
-\NormalTok{        prob\_y0 }\OperatorTok{=} \DecValTok{1} \OperatorTok{{-}}\NormalTok{ prob\_y1}
-
-        \ControlFlowTok{return}\NormalTok{ \{}
-            \StringTok{\textquotesingle{}prob\_y1\textquotesingle{}}\NormalTok{: prob\_y1,}
-            \StringTok{\textquotesingle{}prob\_y0\textquotesingle{}}\NormalTok{: prob\_y0,}
-            \StringTok{\textquotesingle{}posterior\_mean\_y1\textquotesingle{}}\NormalTok{: posterior\_mean\_y1,}
-            \StringTok{\textquotesingle{}posterior\_inv\_cov\_y1\textquotesingle{}}\NormalTok{: inv\_cov\_y1,}
-            \StringTok{\textquotesingle{}posterior\_mean\_y0\textquotesingle{}}\NormalTok{: posterior\_mean\_y0,}
-            \StringTok{\textquotesingle{}posterior\_inv\_cov\_y0\textquotesingle{}}\NormalTok{: inv\_cov\_y0}
-\NormalTok{        \}}
-
-    \KeywordTok{def}\NormalTok{ expected\_information\_gain(}\VariableTok{self}\NormalTok{, x):}
-        \CommentTok{"""}
-\CommentTok{        Computes expected information gain for a given point \textasciigrave{}x\textasciigrave{}.}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} x (torch.Tensor): Input data point.}
-\CommentTok{        }
-\CommentTok{        Returns:}
-\CommentTok{        {-} torch.Tensor: Expected Information Gain (EIG) value.}
-\CommentTok{        """}
-\NormalTok{        acquisition }\OperatorTok{=} \VariableTok{self}\NormalTok{.acquisition\_fn(x)}
-
-        \CommentTok{\# Compute KL divergences for y=1 and y=0 using inverse covariances}
-\NormalTok{        kl\_y1 }\OperatorTok{=}\NormalTok{ kl\_divergence\_gaussians(}
-\NormalTok{            acquisition[}\StringTok{\textquotesingle{}posterior\_mean\_y1\textquotesingle{}}\NormalTok{],}
-\NormalTok{            acquisition[}\StringTok{\textquotesingle{}posterior\_inv\_cov\_y1\textquotesingle{}}\NormalTok{],}
-            \VariableTok{self}\NormalTok{.weights\_mean.detach(),}
-            \VariableTok{self}\NormalTok{.weights\_inv\_cov}
-\NormalTok{        )}
-
-\NormalTok{        kl\_y0 }\OperatorTok{=}\NormalTok{ kl\_divergence\_gaussians(}
-\NormalTok{            acquisition[}\StringTok{\textquotesingle{}posterior\_mean\_y0\textquotesingle{}}\NormalTok{],}
-\NormalTok{            acquisition[}\StringTok{\textquotesingle{}posterior\_inv\_cov\_y0\textquotesingle{}}\NormalTok{],}
-            \VariableTok{self}\NormalTok{.weights\_mean.detach(),}
-            \VariableTok{self}\NormalTok{.weights\_inv\_cov}
-\NormalTok{        )}
-
-        \CommentTok{\# Expected Information Gain (EIG)}
-\NormalTok{        eig }\OperatorTok{=} \VariableTok{None} \CommentTok{\# YOUR CODE HERE (1 line)}
-        \ControlFlowTok{return}\NormalTok{ eig}
-
-    \KeywordTok{def}\NormalTok{ active\_learning(}\VariableTok{self}\NormalTok{, selected\_indices, subset\_size}\OperatorTok{=}\DecValTok{50}\NormalTok{):}
-        \CommentTok{"""}
-\CommentTok{        Active learning loop that selects the most informative data point based on EIG.}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} selected\_indices (list): Indices of previously selected samples.}
-\CommentTok{        {-} subset\_size (int): Number of samples to consider in each subset.}
-
-\CommentTok{        Returns:}
-\CommentTok{        {-} best\_x, best\_x\_idx, best\_acquisition: Selected data point and acquisition details.}
-\CommentTok{        """}
-\NormalTok{        best\_eig }\OperatorTok{=} \OperatorTok{{-}}\BuiltInTok{float}\NormalTok{(}\StringTok{\textquotesingle{}inf\textquotesingle{}}\NormalTok{)}
-\NormalTok{        best\_x }\OperatorTok{=} \VariableTok{None}
-\NormalTok{        best\_x\_idx }\OperatorTok{=} \OperatorTok{{-}}\DecValTok{1}
-\NormalTok{        best\_acquisition }\OperatorTok{=} \VariableTok{None}
-
-\NormalTok{        subset\_indices }\OperatorTok{=}\NormalTok{ [i }\ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in}\NormalTok{ torch.randperm(}\BuiltInTok{len}\NormalTok{(}\VariableTok{self}\NormalTok{.x\_train)).tolist() }\ControlFlowTok{if}\NormalTok{ i }\KeywordTok{not} \KeywordTok{in}\NormalTok{ selected\_indices][:subset\_size]}
-
-        \CommentTok{\# YOUR CODE HERE (\textasciitilde{} 10 lines)}
-        \ControlFlowTok{pass}
-        \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-        \ControlFlowTok{return}\NormalTok{ best\_x, best\_x\_idx, best\_acquisition}
-
-    \KeywordTok{def}\NormalTok{ validate(}\VariableTok{self}\NormalTok{):}
-        \CommentTok{"""}
-\CommentTok{        Computes accuracy on the validation set by predicting labels and comparing to true labels.}
-\CommentTok{        }
-\CommentTok{        Returns:}
-\CommentTok{        {-} float: Validation accuracy.}
-\CommentTok{        """}
-        \ControlFlowTok{with}\NormalTok{ torch.no\_grad():}
-\NormalTok{            logits }\OperatorTok{=} \VariableTok{self}\NormalTok{.x\_val }\OperatorTok{@} \VariableTok{self}\NormalTok{.weights\_mean}
-\NormalTok{            predictions }\OperatorTok{=}\NormalTok{ torch.sigmoid(logits) }\OperatorTok{\textgreater{}=} \FloatTok{0.5}  \CommentTok{\# Convert logits to binary predictions}
-\NormalTok{            accuracy }\OperatorTok{=}\NormalTok{ (predictions }\OperatorTok{==} \VariableTok{self}\NormalTok{.y\_val).}\BuiltInTok{float}\NormalTok{().mean().item()}
-            \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f"Validation accuracy: }\SpecialCharTok{\{}\NormalTok{accuracy }\OperatorTok{*} \DecValTok{100}\SpecialCharTok{:.2f\}}\SpecialStringTok{\%"}\NormalTok{)}
-        \ControlFlowTok{return}\NormalTok{ accuracy}
-
-    \KeywordTok{def}\NormalTok{ train(}\VariableTok{self}\NormalTok{, num\_iterations}\OperatorTok{=}\DecValTok{10}\NormalTok{, subset\_size}\OperatorTok{=}\DecValTok{50}\NormalTok{):}
-        \CommentTok{"""}
-\CommentTok{        Train the model using active learning with subset sampling.}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} num\_iterations (int): Number of active learning iterations.}
-\CommentTok{        {-} subset\_size (int): Number of samples to consider in each subset.}
-\CommentTok{        """}
-\NormalTok{        selected\_indices }\OperatorTok{=}\NormalTok{ []}
-        \ControlFlowTok{for}\NormalTok{ iteration }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(num\_iterations):}
-            \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f"Iteration }\SpecialCharTok{\{}\NormalTok{iteration }\OperatorTok{+} \DecValTok{1}\SpecialCharTok{\}}\SpecialStringTok{/}\SpecialCharTok{\{}\NormalTok{num\_iterations}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{)}
-
-            \CommentTok{\# Select the most informative data point from a random subset}
-\NormalTok{            best\_x, best\_x\_idx, acquisition }\OperatorTok{=} \VariableTok{self}\NormalTok{.active\_learning(selected\_indices, subset\_size}\OperatorTok{=}\NormalTok{subset\_size)}
-\NormalTok{            selected\_indices.append(best\_x\_idx)}
-            \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f"Selected data point with EIG."}\NormalTok{)}
-
-            \CommentTok{\# Get the true label for the selected data point}
-\NormalTok{            y }\OperatorTok{=} \VariableTok{self}\NormalTok{.y\_train[best\_x\_idx].item()}
-
-            \CommentTok{\# Update posterior mean and inverse covariance based on true label}
-            \ControlFlowTok{if}\NormalTok{ y }\OperatorTok{==} \DecValTok{1}\NormalTok{:}
-                \VariableTok{self}\NormalTok{.weights\_mean }\OperatorTok{=}\NormalTok{ acquisition[}\StringTok{\textquotesingle{}posterior\_mean\_y1\textquotesingle{}}\NormalTok{]}
-                \VariableTok{self}\NormalTok{.weights\_inv\_cov }\OperatorTok{=}\NormalTok{ acquisition[}\StringTok{\textquotesingle{}posterior\_inv\_cov\_y1\textquotesingle{}}\NormalTok{]}
-            \ControlFlowTok{else}\NormalTok{:}
-                \VariableTok{self}\NormalTok{.weights\_mean }\OperatorTok{=}\NormalTok{ acquisition[}\StringTok{\textquotesingle{}posterior\_mean\_y0\textquotesingle{}}\NormalTok{]}
-                \VariableTok{self}\NormalTok{.weights\_inv\_cov }\OperatorTok{=}\NormalTok{ acquisition[}\StringTok{\textquotesingle{}posterior\_inv\_cov\_y0\textquotesingle{}}\NormalTok{]}
-
-            \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f"Covariance L2: }\SpecialCharTok{\{}\NormalTok{torch}\SpecialCharTok{.}\NormalTok{inverse(}\VariableTok{self}\NormalTok{.weights\_inv\_cov)}\SpecialCharTok{.}\NormalTok{norm()}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{)}
-
-            \CommentTok{\# Validate model performance on the validation set}
-            \VariableTok{self}\NormalTok{.validate()}
-
-\CommentTok{\# KL divergence between two multivariate normal distributions}
-\KeywordTok{def}\NormalTok{ kl\_divergence\_gaussians(mu1, sigma1\_inv, mu2, sigma2\_inv):}
-    \CommentTok{"""}
-\CommentTok{    Computes the KL divergence between two multivariate Gaussian distributions.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} mu1, mu2 (torch.Tensor): Mean vectors of the distributions.}
-\CommentTok{    {-} sigma1\_inv, sigma2\_inv (torch.Tensor): Inverse covariance matrices of the distributions. PLEASE }\AlertTok{NOTE}\CommentTok{ THE INVERSE!}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} torch.Tensor: KL divergence value.}
-\CommentTok{    """}
-    \CommentTok{\# YOUR CODE HERE (\textasciitilde{} 9{-}12 lines)}
-    \ControlFlowTok{pass}
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-\CommentTok{\# Example usage}
-\NormalTok{model }\OperatorTok{=}\NormalTok{ LogisticActiveLearning()}
-\NormalTok{model.train(num\_iterations}\OperatorTok{=}\DecValTok{100}\NormalTok{, subset\_size}\OperatorTok{=}\DecValTok{50}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\subsection*{Question 3: Linear Performance Metric Elicitation (30
-points)}\label{sec-question-3-linear-performance-metric-elicitation-30-points}
-\addcontentsline{toc}{subsection}{Question 3: Linear Performance Metric
-Elicitation (30 points)}
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\item
-  \textbf{(Written, 10 points).} For background on the problem setting,
-  read \url{https://tinyurl.com/3b92sufm}. Suppose we have a linear
-  performance metric given by \[p(C) = 1-\alpha (FP)-\beta (FN)\] where
-  \(C\) is a confusion matrix and \(FP, FN\) denote false positive and
-  false negative rates. We wish to find the optimal classifier w.r.t.
-  \(p\). That is, \[\phi^* = \arg \max_{\phi\in\Phi} p(C(\phi))\] where
-  \(\Phi\) is the space of all probabilistic binary classifiers from
-  \(X\to [0, 1]\). Note that these classifiers return probabilities
-  corresponding to the label \(1\). Show that \(\phi^*\) is in fact
-  deterministic and given by \[\phi^*(x)=\begin{cases}
-      1 & \text{if } p(y=1 \mid x) > f(\alpha,\beta) \\
-      0 & \text{otherwise}.
-  \end{cases}\] for a threshold function \(f\) that you must find.
-  (Hint: For a classifier \(\phi\), \(FP=P(\phi=1, y=0)\) and
-  \(FN=P(\phi=0, y=1)\). Marginalize these joint probabilities over
-  \(x\) and simplify.)
-\item
-  \textbf{(Written + Coding, 5 points).} Implement
-  \texttt{classifier\_metrics} in \texttt{lpme/main.py}. After doing so,
-  run \texttt{plot\_confusion\_region} and attach the plot. What do you
-  notice about the region of possible confusion matrices?
-\item
-  \textbf{(Coding, 15 points).} Implement \texttt{search\_theta} in
-  order to elicit the metric used by the oracle (which is parametrized
-  by \(\theta\)). Play around with the oracle's theta and run
-  \texttt{start\_search} to see how closely you can approximate it!
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ torch}
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-\ImportTok{from}\NormalTok{ tqdm }\ImportTok{import}\NormalTok{ tqdm}
-
-\KeywordTok{class}\NormalTok{ DataDistribution:}
-    \KeywordTok{def} \FunctionTok{\_\_init\_\_}\NormalTok{(}\VariableTok{self}\NormalTok{, N: }\BuiltInTok{int}\NormalTok{):}
-        \CommentTok{"""}
-\CommentTok{        Initializes the data distribution with a specified number of samples.}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} N (int): Number of data points.}
-\CommentTok{        """}
-        \VariableTok{self}\NormalTok{.weights }\OperatorTok{=}\NormalTok{ torch.tensor([}\OperatorTok{{-}}\FloatTok{0.3356}\NormalTok{, }\OperatorTok{{-}}\FloatTok{1.4104}\NormalTok{, }\FloatTok{0.3144}\NormalTok{, }\OperatorTok{{-}}\FloatTok{0.5591}\NormalTok{, }\FloatTok{1.0426}\NormalTok{, }\FloatTok{0.6036}\NormalTok{, }\OperatorTok{{-}}\FloatTok{0.7549}\NormalTok{, }\OperatorTok{{-}}\FloatTok{1.1909}\NormalTok{, }\FloatTok{1.4779}\NormalTok{, }\OperatorTok{{-}}\FloatTok{0.7513}\NormalTok{])}
-        \VariableTok{self}\NormalTok{.D }\OperatorTok{=} \BuiltInTok{len}\NormalTok{(}\VariableTok{self}\NormalTok{.weights)}
-
-\NormalTok{        gen }\OperatorTok{=}\NormalTok{ torch.Generator().manual\_seed(}\DecValTok{42}\NormalTok{)}
-        \VariableTok{self}\NormalTok{.data }\OperatorTok{=}\NormalTok{ torch.randn(N, }\VariableTok{self}\NormalTok{.D, generator}\OperatorTok{=}\NormalTok{gen)}
-        \VariableTok{self}\NormalTok{.probs }\OperatorTok{=}\NormalTok{ torch.sigmoid(}\VariableTok{self}\NormalTok{.data }\OperatorTok{@} \VariableTok{self}\NormalTok{.weights)}
-    
-\KeywordTok{def}\NormalTok{ classifier\_metrics(data\_dist, threshold, upper}\OperatorTok{=}\VariableTok{True}\NormalTok{):}
-    \CommentTok{"""}
-\CommentTok{    Computes the True Positive and True Negative rates based on a classifier threshold.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} data\_dist (DataDistribution): The data distribution instance.}
-\CommentTok{    {-} threshold (float): Threshold value for classification.}
-\CommentTok{    {-} upper (bool): If True, classifies as positive if above threshold; else, if below.}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} tuple (float, float): True Positive Rate (TP) and True Negative Rate (TN) in that order.}
-\CommentTok{    """}
-    \CommentTok{\# YOUR CODE HERE (\textasciitilde{}3{-}5 lines)}
-    \ControlFlowTok{pass}
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-\KeywordTok{def}\NormalTok{ sweep\_classifiers(data\_dist: DataDistribution):}
-    \CommentTok{"""}
-\CommentTok{    Sweeps through classifier thresholds and calculates True Positive and True Negative rates.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} data\_dist (DataDistribution): The data distribution instance.}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} tuple: Upper and lower boundary data for True Positive and True Negative rates.}
-\CommentTok{    """}
-\NormalTok{    thresholds }\OperatorTok{=}\NormalTok{ torch.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{100}\NormalTok{)}
-\NormalTok{    upper\_boundary }\OperatorTok{=}\NormalTok{ []}
-\NormalTok{    lower\_boundary }\OperatorTok{=}\NormalTok{ []}
-    
-    \ControlFlowTok{for}\NormalTok{ threshold }\KeywordTok{in}\NormalTok{ tqdm(thresholds, desc}\OperatorTok{=}\StringTok{"Thresholds"}\NormalTok{):}
-\NormalTok{        tp\_upper, tn\_upper }\OperatorTok{=}\NormalTok{ classifier\_metrics(data\_dist, threshold, upper}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
-\NormalTok{        upper\_boundary.append((tp\_upper, tn\_upper))}
-
-\NormalTok{        tp\_lower, tn\_lower }\OperatorTok{=}\NormalTok{ classifier\_metrics(data\_dist, threshold, upper}\OperatorTok{=}\VariableTok{False}\NormalTok{)}
-\NormalTok{        lower\_boundary.append((tp\_lower, tn\_lower))}
-
-    \ControlFlowTok{return}\NormalTok{ upper\_boundary, lower\_boundary}
-
-\KeywordTok{class}\NormalTok{ Oracle:}
-    \KeywordTok{def} \FunctionTok{\_\_init\_\_}\NormalTok{(}\VariableTok{self}\NormalTok{, theta: }\BuiltInTok{float}\NormalTok{):}
-        \CommentTok{"""}
-\CommentTok{        Initializes the oracle with a given theta for preference evaluation.}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} theta (float): Oracle angle in radians.}
-\CommentTok{        """}
-        \VariableTok{self}\NormalTok{.theta }\OperatorTok{=}\NormalTok{ torch.tensor(theta)}
-
-    \KeywordTok{def}\NormalTok{ evaluate\_lpm(}\VariableTok{self}\NormalTok{, tp, tn):}
-        \CommentTok{"""}
-\CommentTok{        Computes the linear performance metric (LPM) based on theta.}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} tp (float): True Positive rate.}
-\CommentTok{        {-} tn (float): True Negative rate.}
-\CommentTok{        }
-\CommentTok{        Returns:}
-\CommentTok{        {-} float: Linear performance metric evaluation.}
-\CommentTok{        """}
-        \ControlFlowTok{return}\NormalTok{ torch.cos(}\VariableTok{self}\NormalTok{.theta) }\OperatorTok{*}\NormalTok{ tp }\OperatorTok{+}\NormalTok{ torch.sin(}\VariableTok{self}\NormalTok{.theta) }\OperatorTok{*}\NormalTok{ tn}
-    
-    \KeywordTok{def}\NormalTok{ preferred\_classifier(}\VariableTok{self}\NormalTok{, tp\_1, tn\_1, tp\_2, tn\_2):}
-        \CommentTok{"""}
-\CommentTok{        Determines the preferred classifier based on LPM values.}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} tp\_1, tn\_1, tp\_2, tn\_2 (float): True Positive and True Negative rates for two classifiers.}
-\CommentTok{        }
-\CommentTok{        Returns:}
-\CommentTok{        {-} bool: True if first classifier is preferred, False otherwise.}
-\CommentTok{        """}
-\NormalTok{        lpm\_1 }\OperatorTok{=} \VariableTok{self}\NormalTok{.evaluate\_lpm(tp\_1, tn\_1)}
-\NormalTok{        lpm\_2 }\OperatorTok{=} \VariableTok{self}\NormalTok{.evaluate\_lpm(tp\_2, tn\_2)}
-        \ControlFlowTok{return}\NormalTok{ (lpm\_1 }\OperatorTok{\textgreater{}}\NormalTok{ lpm\_2).item()}
-    
-\KeywordTok{def}\NormalTok{ theta\_to\_threshold(theta):}
-    \CommentTok{"""Converts theta angle to classification threshold."""}
-    \ControlFlowTok{return} \DecValTok{1} \OperatorTok{/}\NormalTok{ (}\DecValTok{1} \OperatorTok{+}\NormalTok{ torch.tan(theta) }\OperatorTok{**} \OperatorTok{{-}}\DecValTok{1}\NormalTok{)}
-
-\KeywordTok{def}\NormalTok{ search\_theta(oracle: Oracle, data\_dist, lower\_bound, upper\_bound):}
-    \CommentTok{"""}
-\CommentTok{    Performs a search over theta values to optimize the classification threshold.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} oracle (Oracle): The oracle for LPM evaluation.}
-\CommentTok{    {-} data\_dist (DataDistribution): The data distribution instance.}
-\CommentTok{    {-} lower\_bound (float): Lower bound for theta.}
-\CommentTok{    {-} upper\_bound (float): Upper bound for theta.}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} tuple: Updated lower and upper bounds for theta.}
-\CommentTok{    """}
-\NormalTok{    left }\OperatorTok{=} \FloatTok{0.75} \OperatorTok{*}\NormalTok{ lower\_bound }\OperatorTok{+} \FloatTok{0.25} \OperatorTok{*}\NormalTok{ upper\_bound}
-\NormalTok{    middle }\OperatorTok{=} \FloatTok{0.5} \OperatorTok{*}\NormalTok{ lower\_bound }\OperatorTok{+} \FloatTok{0.5} \OperatorTok{*}\NormalTok{ upper\_bound}
-\NormalTok{    right }\OperatorTok{=} \FloatTok{0.25} \OperatorTok{*}\NormalTok{ lower\_bound }\OperatorTok{+} \FloatTok{0.75} \OperatorTok{*}\NormalTok{ upper\_bound}
-
-\NormalTok{    thetas }\OperatorTok{=}\NormalTok{ [lower\_bound, left, middle, right, upper\_bound]}
-\NormalTok{    thresholds }\OperatorTok{=}\NormalTok{ theta\_to\_threshold(torch.tensor(thetas))}
-\NormalTok{    new\_lower, new\_upper }\OperatorTok{=} \VariableTok{None}\NormalTok{, }\VariableTok{None}
-
-    \CommentTok{\# YOUR CODE HERE (\textasciitilde{}18{-}25 lines)}
-    \CommentTok{\# 1. Collect metrics for each threshold value.}
-    \CommentTok{\# 2. Determine if LPM increases as theta increases.}
-    \CommentTok{\# 3. Check for pattern of increases and decreases in LPM.}
-    \CommentTok{\# 4. Update bounds based on observed LPM patterns.}
-    \ControlFlowTok{pass}
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-    \ControlFlowTok{return}\NormalTok{ new\_lower, new\_upper}
-
-\CommentTok{\# Create instance and get upper \& lower boundary data}
-\NormalTok{data\_dist }\OperatorTok{=}\NormalTok{ DataDistribution(N}\OperatorTok{=}\DecValTok{10000000}\NormalTok{)}
-\NormalTok{oracle }\OperatorTok{=}\NormalTok{ Oracle(theta}\OperatorTok{=}\FloatTok{0.1}\NormalTok{)}
-
-\KeywordTok{def}\NormalTok{ plot\_confusion\_region():}
-    \CommentTok{"""}
-\CommentTok{    Plots the True Positive vs. True Negative rates for the upper and lower classifier boundaries.}
-\CommentTok{    """}
-\NormalTok{    upper\_boundary, lower\_boundary }\OperatorTok{=}\NormalTok{ sweep\_classifiers(data\_dist)}
-
-    \CommentTok{\# Prepare data for plotting for upper and lower boundaries}
-\NormalTok{    tp\_upper, tn\_upper }\OperatorTok{=} \BuiltInTok{zip}\NormalTok{(}\OperatorTok{*}\NormalTok{upper\_boundary)}
-\NormalTok{    tp\_lower, tn\_lower }\OperatorTok{=} \BuiltInTok{zip}\NormalTok{(}\OperatorTok{*}\NormalTok{lower\_boundary)}
-
-    \CommentTok{\# Plot the results for upper boundary}
-\NormalTok{    plt.figure(figsize}\OperatorTok{=}\NormalTok{(}\DecValTok{8}\NormalTok{, }\DecValTok{6}\NormalTok{))}
-\NormalTok{    plt.plot(tp\_upper, tn\_upper, marker}\OperatorTok{=}\StringTok{\textquotesingle{}o\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}{-}\textquotesingle{}}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.7}\NormalTok{, label}\OperatorTok{=}\StringTok{"Upper Boundary"}\NormalTok{)}
-\NormalTok{    plt.plot(tp\_lower, tn\_lower, marker}\OperatorTok{=}\StringTok{\textquotesingle{}o\textquotesingle{}}\NormalTok{, linestyle}\OperatorTok{=}\StringTok{\textquotesingle{}{-}{-}\textquotesingle{}}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.7}\NormalTok{, label}\OperatorTok{=}\StringTok{"Lower Boundary"}\NormalTok{)}
-\NormalTok{    plt.title(}\StringTok{"True Positive vs. True Negative Rates (Upper \& Lower Boundaries)"}\NormalTok{)}
-\NormalTok{    plt.xlabel(}\StringTok{"True Positive Rate (TP)"}\NormalTok{)}
-\NormalTok{    plt.ylabel(}\StringTok{"True Negative Rate (TN)"}\NormalTok{)}
-\NormalTok{    plt.legend()}
-\NormalTok{    plt.grid(}\VariableTok{True}\NormalTok{)}
-\NormalTok{    plt.show()}
-
-\KeywordTok{def}\NormalTok{ start\_search():}
-    \CommentTok{"""}
-\CommentTok{    Starts the theta search using the LPM{-}based oracle and prints the search range per iteration.}
-\CommentTok{    """}
-\NormalTok{    lower\_bound }\OperatorTok{=} \DecValTok{0}
-\NormalTok{    upper\_bound }\OperatorTok{=}\NormalTok{ torch.pi }\OperatorTok{/} \DecValTok{2}
-    \ControlFlowTok{for}\NormalTok{ \_ }\KeywordTok{in}\NormalTok{ tqdm(}\BuiltInTok{range}\NormalTok{(}\DecValTok{10}\NormalTok{), desc}\OperatorTok{=}\StringTok{"LPM Search"}\NormalTok{):}
-        \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f"Theta Search Space: [}\SpecialCharTok{\{}\NormalTok{lower\_bound}\SpecialCharTok{\}}\SpecialStringTok{, }\SpecialCharTok{\{}\NormalTok{upper\_bound}\SpecialCharTok{\}}\SpecialStringTok{]"}\NormalTok{)}
-\NormalTok{        lower\_bound, upper\_bound }\OperatorTok{=}\NormalTok{ search\_theta(oracle, data\_dist, lower\_bound}\OperatorTok{=}\NormalTok{lower\_bound, upper\_bound}\OperatorTok{=}\NormalTok{upper\_bound)}
-    \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f"Theta Search Space: [}\SpecialCharTok{\{}\NormalTok{lower\_bound}\SpecialCharTok{\}}\SpecialStringTok{, }\SpecialCharTok{\{}\NormalTok{upper\_bound}\SpecialCharTok{\}}\SpecialStringTok{]"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\subsection*{Question 4: D-optimal Design with Logistic Model (30
-points)}\label{sec-question-4-d-optimal-design-with-logistic-model-30-points}
-\addcontentsline{toc}{subsection}{Question 4: D-optimal Design with
-Logistic Model (30 points)}
-
-In this question, we explore D-optimal designs in the context of the
-Bradley-Terry model. The Bradley-Terry model is a logistic regression
-model used for paired comparison data. Given two items \(x_1\) and
-\(x_2\), the probability that item \(x_1\) is preferred over \(x_2\) is
-modeled as:
-
-\[P(x_1 \succ x_2 | \theta) = \frac{e^{\theta^\top x_1}}{e^{\theta^\top x_1} + e^{\theta^\top x_2}} = \frac{1}{1 + e^{\theta^\top (x_2 - x_1)}}\]
-
-where \(\theta \in \mathbb{R}^d\) represents the unknown model
-parameters, and \(x_1, x_2 \in \mathbb{R}^d\) are the feature vectors
-associated with the two items. D-optimal design aims to maximize the
-determinant of the Fisher information matrix, thus minimizing the volume
-of the confidence ellipsoid for the estimated parameters. In this
-exercise, you will analyze D-optimal designs for this model.
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\item
-  \textbf{Fisher Information Matrix for the Bradley-Terry Model (12
-  points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Written, 6 points).} Derive the Fisher information matrix
-    for the Bradley-Terry model at a design point \((x_1, x_2)\). Show
-    that the Fisher information matrix at a design point is:
-    \[I(x_1, x_2, \theta) = w(x_1, x_2, \theta) (x_1 - x_2)(x_1 - x_2)^\top,\]
-    where \(w(x_1, x_2, \theta)\) is a weight function given by:
-    \[w(x_1, x_2, \theta) = \frac{e^{\theta^\top x_1} e^{\theta^\top x_2}}{\left(e^{\theta^\top x_1} + e^{\theta^\top x_2}\right)^2} =\sigma'(\theta^\top (x_1-x_2)).\]
-    Here \(\sigma'\) denotes the derivative of the sigmoid function \(\sigma(z) = 1/(1 + e^{-z})\).
-  \item
-    \textbf{(Coding, 6 points).} Implement \texttt{fisher\_matrix} in
-    \texttt{d\_optimal/main.py} based on the derived expression.
-  \end{enumerate}
-\item
-  \textbf{D-optimal Design Criterion (18 points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Coding, 11 points).} In the context of the Bradley-Terry
-    model, a D-optimal design maximizes the determinant of the Fisher
-    information matrix. Suppose we have a set of candidate items
-    \(\{x_1, \dots, x_n\}\), and we can choose \(N\) comparisons to
-    make. Formally, the D-optimal design maximizes:
-    \[\det\left( \sum_{i=1}^N w(x_{i1}, x_{i2}, \theta) (x_{i1} - x_{i2})(x_{i1} - x_{i2})^\top \right),\]
-    where \((x_{i1}, x_{i2})\) denotes a pair of compared items in the
-    design. Implement a greedy algorithm to approximate the D-optimal
-    design. Given a set of \(n\) items and their feature vectors
-    \(\{x_1, \dots, x_n\}\), your task is to iteratively select the pair
-    of items \((x_{i1}, x_{i2})\) that maximizes the determinant of the
-    Fisher information matrix. Please implement \texttt{greedy\_fisher}.
-    Note that the setup in the code assumes we have a dataset of all
-    possible differences between pairs of items as opposed to directly
-    selecting the pairs.
-  \item
-    \textbf{(Written + Coding, 7 points).} Notice that
-    \texttt{posterior\_inv\_cov} uses a Laplace approximation for the
-    posterior centered around the ground truth weights after labeling
-    the chosen points. However, it turns out this approximation doesn't
-    actually depend on the labels when taking the Hessian. Please run
-    the file \texttt{d\_optimal/main.py} and attach a plot of the norm
-    of the covariance matrix of the posterior. What difference do you
-    observe between greedy and random sampling? What is the win rate of
-    greedy?
-  \end{enumerate}
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-\ImportTok{from}\NormalTok{ tqdm }\ImportTok{import}\NormalTok{ tqdm}
-
-\KeywordTok{def}\NormalTok{ sigmoid(x):}
-    \CommentTok{"""Helper function to compute the sigmoid of x."""}
-    \ControlFlowTok{return} \DecValTok{1} \OperatorTok{/}\NormalTok{ (}\DecValTok{1} \OperatorTok{+}\NormalTok{ np.exp(}\OperatorTok{{-}}\NormalTok{x))}
-
-\KeywordTok{class}\NormalTok{ LogisticData:}
-    \KeywordTok{def} \FunctionTok{\_\_init\_\_}\NormalTok{(}\VariableTok{self}\NormalTok{, weights, seed}\OperatorTok{=}\DecValTok{42}\NormalTok{):}
-        \CommentTok{"""}
-\CommentTok{        Initializes the LogisticData class with specified weights and seed.}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} weights (np.array): True weights for data generation.}
-\CommentTok{        {-} seed (int): Random seed for reproducibility.}
-\CommentTok{        """}
-        \VariableTok{self}\NormalTok{.rng }\OperatorTok{=}\NormalTok{ np.random.default\_rng(seed)}
-        \VariableTok{self}\NormalTok{.weights }\OperatorTok{=}\NormalTok{ weights}
-    
-    \KeywordTok{def}\NormalTok{ generate\_data(}\VariableTok{self}\NormalTok{, N):}
-        \CommentTok{"""}
-\CommentTok{        Generates synthetic data for logistic regression.}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} N (int): Number of data points.}
-\CommentTok{        }
-\CommentTok{        Returns:}
-\CommentTok{        {-} tuple: Generated data and labels.}
-\CommentTok{        """}
-\NormalTok{        data }\OperatorTok{=} \VariableTok{self}\NormalTok{.rng.standard\_normal((N, }\BuiltInTok{len}\NormalTok{(}\VariableTok{self}\NormalTok{.weights)))}
-\NormalTok{        probs }\OperatorTok{=}\NormalTok{ sigmoid(data }\OperatorTok{@} \VariableTok{self}\NormalTok{.weights)}
-\NormalTok{        labels }\OperatorTok{=}\NormalTok{ (}\VariableTok{self}\NormalTok{.rng.random(N) }\OperatorTok{\textless{}}\NormalTok{ probs).astype(}\BuiltInTok{int}\NormalTok{)}
-        \ControlFlowTok{return}\NormalTok{ data, labels}
-
-\KeywordTok{def}\NormalTok{ fisher\_matrix(difference\_vector, weights):}
-    \CommentTok{"""}
-\CommentTok{    Computes the Fisher information matrix for a single data point.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} difference\_vector (np.array): Difference vector (input data point).}
-\CommentTok{    {-} weights (np.array): Weights for the logistic model.}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} np.array: Fisher information matrix for the data point.}
-\CommentTok{    """}
-    \CommentTok{\# YOUR CODE HERE (\textasciitilde{}2{-}4 lines)}
-    \ControlFlowTok{pass}
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-\CommentTok{\# Initialization}
-\NormalTok{true\_weights }\OperatorTok{=}\NormalTok{ np.array([}\OperatorTok{{-}}\FloatTok{0.3356}\NormalTok{, }\OperatorTok{{-}}\FloatTok{1.4104}\NormalTok{, }\FloatTok{0.3144}\NormalTok{, }\OperatorTok{{-}}\FloatTok{0.5591}\NormalTok{, }\FloatTok{1.0426}\NormalTok{, }\FloatTok{0.6036}\NormalTok{, }\OperatorTok{{-}}\FloatTok{0.7549}\NormalTok{, }\OperatorTok{{-}}\FloatTok{1.1909}\NormalTok{, }\FloatTok{1.4779}\NormalTok{, }\OperatorTok{{-}}\FloatTok{0.7513}\NormalTok{])}
-\NormalTok{data\_dim }\OperatorTok{=} \BuiltInTok{len}\NormalTok{(true\_weights)}
-\NormalTok{dataset\_generator }\OperatorTok{=}\NormalTok{ LogisticData(weights}\OperatorTok{=}\NormalTok{true\_weights)}
-
-\CommentTok{\# Number of iterations for sampling 500 points}
-\NormalTok{num\_iterations }\OperatorTok{=} \DecValTok{200}
-
-\CommentTok{\# Store covariance matrix norms for comparison}
-\NormalTok{cov\_norms\_greedy }\OperatorTok{=}\NormalTok{ []}
-\NormalTok{cov\_norms\_random }\OperatorTok{=}\NormalTok{ []}
-
-\KeywordTok{def}\NormalTok{ greedy\_fisher(data, curr\_fisher\_matrix, selected\_indices):}
-    \CommentTok{"""}
-\CommentTok{    Selects the data point that maximizes the Fisher information determinant.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} data (np.array): The data matrix.}
-\CommentTok{    {-} curr\_fisher\_matrix (np.array): Fisher matrix of already selected indices.}
-\CommentTok{    {-} selected\_indices (list): List of already selected indices.}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} int: Index of the selected data point.}
-\CommentTok{    """}
-\NormalTok{    best\_det }\OperatorTok{=} \OperatorTok{{-}}\NormalTok{np.inf}
-\NormalTok{    best\_index }\OperatorTok{=} \OperatorTok{{-}}\DecValTok{1}
-    
-    \CommentTok{\# Iterate over data points to find the one maximizing Fisher determinant.}
-    \ControlFlowTok{for}\NormalTok{ i, difference\_vector }\KeywordTok{in} \BuiltInTok{enumerate}\NormalTok{(data):}
-        \CommentTok{\# YOUR CODE HERE (\textasciitilde{}5{-}10 lines)}
-        \CommentTok{\# Make sure to skip already selected data points!}
-        \ControlFlowTok{pass}
-        \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-    \ControlFlowTok{return}\NormalTok{ best\_index}
-
-\KeywordTok{def}\NormalTok{ posterior\_inv\_cov(X, laplace\_center):}
-    \CommentTok{"""}
-\CommentTok{    Computes the posterior inverse covariance matrix using Laplace approximation.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} X (np.array): Data matrix.}
-\CommentTok{    {-} laplace\_center (np.array): Center point (weights).}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} np.array: Posterior inverse covariance matrix.}
-\CommentTok{    """}
-    \CommentTok{\# Calculate probabilities for logistic regression model.}
-\NormalTok{    probs }\OperatorTok{=}\NormalTok{ sigmoid(X }\OperatorTok{@}\NormalTok{ laplace\_center)}
-\NormalTok{    W }\OperatorTok{=}\NormalTok{ np.diag(probs }\OperatorTok{*}\NormalTok{ (}\DecValTok{1} \OperatorTok{{-}}\NormalTok{ probs))}
-    
-    \CommentTok{\# Compute inverse covariance matrix assuming standard Gaussian prior.}
-\NormalTok{    inv\_cov }\OperatorTok{=}\NormalTok{ X.T }\OperatorTok{@}\NormalTok{ W }\OperatorTok{@}\NormalTok{ X }\OperatorTok{+}\NormalTok{ np.eye(}\BuiltInTok{len}\NormalTok{(true\_weights))}
-    \ControlFlowTok{return}\NormalTok{ inv\_cov}
-
-\ControlFlowTok{for}\NormalTok{ \_ }\KeywordTok{in}\NormalTok{ tqdm(}\BuiltInTok{range}\NormalTok{(num\_iterations)):}
-    \CommentTok{\# Generate a new sample of 500 data points}
-\NormalTok{    data, \_ }\OperatorTok{=}\NormalTok{ dataset\_generator.generate\_data(N}\OperatorTok{=}\DecValTok{500}\NormalTok{)}
-    
-    \CommentTok{\# Greedy selection of best 30 data points}
-\NormalTok{    selected\_indices }\OperatorTok{=}\NormalTok{ []}
-\NormalTok{    curr\_fisher\_matrix }\OperatorTok{=}\NormalTok{ np.zeros((data\_dim, data\_dim))}
-
-    \ControlFlowTok{for}\NormalTok{ \_ }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\DecValTok{30}\NormalTok{):}
-        \CommentTok{\# Select the data point maximizing Fisher information determinant.}
-\NormalTok{        best\_index }\OperatorTok{=}\NormalTok{ greedy\_fisher(data, curr\_fisher\_matrix, selected\_indices)}
-\NormalTok{        selected\_indices.append(best\_index)}
-\NormalTok{        curr\_fisher\_matrix }\OperatorTok{+=}\NormalTok{ fisher\_matrix(data[best\_index], true\_weights)}
-
-    \CommentTok{\# Prepare greedy and random samples}
-\NormalTok{    X\_greedy }\OperatorTok{=}\NormalTok{ data[selected\_indices]}
-
-    \CommentTok{\# Generate 30 random samples for comparison}
-\NormalTok{    random\_indices }\OperatorTok{=}\NormalTok{ np.random.choice(}\BuiltInTok{len}\NormalTok{(data), }\DecValTok{30}\NormalTok{, replace}\OperatorTok{=}\VariableTok{False}\NormalTok{)}
-\NormalTok{    X\_random }\OperatorTok{=}\NormalTok{ data[random\_indices]}
-
-    \CommentTok{\# Compute posterior inverse covariance matrices for both strategies}
-\NormalTok{    posterior\_inv\_cov\_greedy }\OperatorTok{=}\NormalTok{ posterior\_inv\_cov(X\_greedy, laplace\_center}\OperatorTok{=}\NormalTok{true\_weights) }
-\NormalTok{    posterior\_inv\_cov\_random }\OperatorTok{=}\NormalTok{ posterior\_inv\_cov(X\_random, laplace\_center}\OperatorTok{=}\NormalTok{true\_weights)}
-
-    \CommentTok{\# Calculate covariance matrices (inverse of posterior inverse covariance)}
-\NormalTok{    cov\_matrix\_greedy }\OperatorTok{=}\NormalTok{ np.linalg.inv(posterior\_inv\_cov\_greedy)}
-\NormalTok{    cov\_matrix\_random }\OperatorTok{=}\NormalTok{ np.linalg.inv(posterior\_inv\_cov\_random)}
-
-    \CommentTok{\# Measure the norm (Frobenius norm) of the covariance matrices}
-\NormalTok{    cov\_norm\_greedy }\OperatorTok{=}\NormalTok{ np.linalg.norm(cov\_matrix\_greedy, }\StringTok{\textquotesingle{}fro\textquotesingle{}}\NormalTok{)}
-\NormalTok{    cov\_norm\_random }\OperatorTok{=}\NormalTok{ np.linalg.norm(cov\_matrix\_random, }\StringTok{\textquotesingle{}fro\textquotesingle{}}\NormalTok{)}
-
-    \CommentTok{\# Store norms for analysis}
-\NormalTok{    cov\_norms\_greedy.append(cov\_norm\_greedy)}
-\NormalTok{    cov\_norms\_random.append(cov\_norm\_random)}
-
-\CommentTok{\# Display comparison results}
-\BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Greedy mean: }\SpecialCharTok{\{}\NormalTok{np}\SpecialCharTok{.}\NormalTok{mean(cov\_norms\_greedy)}\SpecialCharTok{\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-\BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Random mean: }\SpecialCharTok{\{}\NormalTok{np}\SpecialCharTok{.}\NormalTok{mean(cov\_norms\_random)}\SpecialCharTok{\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-\BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Greedy win rate: }\SpecialCharTok{\{}\NormalTok{(np.array(cov\_norms\_greedy) }\OperatorTok{\textless{}}\NormalTok{ np.array(cov\_norms\_random))}\SpecialCharTok{.}\NormalTok{mean()}\SpecialCharTok{\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-
-\CommentTok{\# Plot the distributions of covariance matrix norms}
-\NormalTok{plt.hist(cov\_norms\_greedy, bins}\OperatorTok{=}\DecValTok{30}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.7}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}blue\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Greedy\textquotesingle{}}\NormalTok{)}
-\NormalTok{plt.hist(cov\_norms\_random, bins}\OperatorTok{=}\DecValTok{30}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.7}\NormalTok{, color}\OperatorTok{=}\StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Random\textquotesingle{}}\NormalTok{)}
-\NormalTok{plt.xlabel(}\StringTok{\textquotesingle{}L2 Norm of Covariance Matrix\textquotesingle{}}\NormalTok{)}
-\NormalTok{plt.ylabel(}\StringTok{\textquotesingle{}Frequency\textquotesingle{}}\NormalTok{)}
-\NormalTok{plt.title(}\StringTok{\textquotesingle{}Comparison of Covariance Norms (Greedy vs. Random) Across Iterations\textquotesingle{}}\NormalTok{)}
-\NormalTok{plt.legend()}
-\NormalTok{plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\subsection*{Question 5: Nonparametric Metric Elicitation (30
-points)}\label{sec-question-5-nonparametric-metric-elicitation-30-points}
-\addcontentsline{toc}{subsection}{Question 5: Nonparametric Metric
-Elicitation (30 points)}
-
-In this question, we explore the problem of performance metric
-elicitation using a Gaussian Process (GP) to map the elements of the
-confusion matrix, specifically false positives (FP) and false negatives
-(FN), to an unknown performance metric. The goal is to learn a
-non-linear function that maps FP and FN to the metric, using relative
-preferences from pairwise classifier comparisons. We will use elliptical
-slice sampling for posterior inference.
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\item
-  \textbf{Gaussian Process for Metric Elicitation (10 points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Written, 2 points).} Assume that the performance metric
-    \(\phi(C)\) is a non-linear function of the confusion matrix \(C\).
-    For simplicity, assume that \(\phi\) depends only on FP and FN,
-    i.e.,
-    \[\phi(\text{FP}, \text{FN}) \sim \mathcal{GP}(0, k((\text{FP}, \text{FN}), (\text{FP}', \text{FN}'))),\]
-    where \(k\) is the covariance kernel function of the Gaussian
-    Process. Explain why using a GP allows for flexible modeling of the
-    metric \(\phi\) as a non-linear function of FP and FN. What are the
-    advantages of using a GP over a linear model in this context?
-  \item
-    \textbf{(Written, 2 points).} Suppose we observe pairwise
-    comparisons between classifiers, where a user provides feedback on
-    which classifier they prefer based on the unknown metric \(\phi\).
-    Given two classifiers with confusion matrices
-    \(C_1 = (\text{FP}_1, \text{FN}_1)\) and
-    \(C_2 = (\text{FP}_2, \text{FN}_2)\), the user indicates their
-    relative preference. Let the observed preference be modeled by the
-    Bradley-Terry model as:
-    \[\Pr(C_1 \succ C_2) = \sigma(\phi(\text{FP}_1, \text{FN}_1) - \phi(\text{FP}_2, \text{FN}_2)),\]
-    where we view \(\phi\) as the reward function. How does this
-    likelihood affect the posterior inference in the GP? Where does it
-    introduce additional complexity?
-  \item
-    \textbf{(Written + Coding, 6 points).} Given a set of observed
-    pairwise comparisons between confusion matrices, use Bayes' rule to
-    derive the posterior distribution over the latent function values
-    \(\phi\) given those preferences. Express the posterior distribution in
-    terms of the GP prior and the pairwise likelihood function. You do
-    not need to include the normalization constant. Implement the
-    likelihood function in \texttt{loglik\_from\_preferences}.
-  \end{enumerate}
-\item
-  \textbf{Elliptical Slice Sampling for Posterior Inference (20 points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Written, 3 points).} Read
-    \url{https://proceedings.mlr.press/v9/murray10a/murray10a.pdf}.
-    Elliptical slice sampling is a sampling method used to generate
-    samples from the posterior distribution of a Gaussian Process.
-    Explain the key idea behind elliptical slice sampling and why it is
-    well-suited for sampling from the GP posterior in this context.
-  \item
-    \textbf{(Coding, 10 points).} Implement elliptical slice sampling in
-    \texttt{npme/elliptical\_sampler.py} by following Figure 2 in the
-    paper.
-  \item
-    \textbf{(Written, 3 points).} Run the algorithm on a synthetic
-    preference dataset of confusion matrices with pairwise preferences.
-    The synthetic data will be constructed using the metric
-    \[\phi_{\text{true}}(\text{FP}, \text{FN}) = \log(1 + \text{FP}) + \log(1 + \text{FN}),\]
-    which captures the idea that the human oracle perceives both false
-    positives and false negatives in a way that flattens out as these
-    values increase (i.e., marginal increases in FP and FN have
-    diminishing effects on the performance metric). Explain the
-    psychological motivation behind this non-linear function. Why might
-    a logarithmic form be appropriate for modeling human perception of
-    classification errors?
-
-    Run the file \texttt{npme/main.py} and attach the plot of
-    \(\phi_{\text{true}}\) vs your elicited metric. What do you notice
-    in the plot?
-  \item
-    \textbf{(Written + Coding, 4 points).} Once the GP has been trained
-    and posterior samples of the function \(\phi(\text{FP}, \text{FN})\)
-    have been obtained, how can we evaluate the quality of the elicited
-    metric? Propose a method to evaluate how well the elicited metric
-    \(\phi\) aligns with the user's true preferences and implement it in
-    \texttt{evaluate\_elicited\_metric}, taking into account the plot you
-    saw in part (iii).
-  \end{enumerate}
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-\ImportTok{from}\NormalTok{ typing }\ImportTok{import}\NormalTok{ Callable}
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-\ImportTok{from}\NormalTok{ tqdm }\ImportTok{import}\NormalTok{ tqdm}
-
-\KeywordTok{class}\NormalTok{ EllipticalSliceSampler:}
-    \KeywordTok{def} \FunctionTok{\_\_init\_\_}\NormalTok{(}\VariableTok{self}\NormalTok{,}
-\NormalTok{                 prior\_cov: np.ndarray,}
-\NormalTok{                 loglik: Callable):}
-        \CommentTok{"""}
-\CommentTok{        Initializes the Elliptical Slice Sampler.}
-\CommentTok{        }
-\CommentTok{        Args:}
-\CommentTok{        {-} prior\_cov (np.ndarray): Prior covariance matrix.}
-\CommentTok{        {-} loglik (Callable): Log{-}likelihood function.}
-\CommentTok{        """}
-        \VariableTok{self}\NormalTok{.prior\_cov }\OperatorTok{=}\NormalTok{ prior\_cov}
-        \VariableTok{self}\NormalTok{.loglik }\OperatorTok{=}\NormalTok{ loglik}
-
-        \VariableTok{self}\NormalTok{.\_n }\OperatorTok{=}\NormalTok{ prior\_cov.shape[}\DecValTok{0}\NormalTok{]  }\CommentTok{\# Dimensionality of the space}
-        \VariableTok{self}\NormalTok{.\_chol }\OperatorTok{=}\NormalTok{ np.linalg.cholesky(prior\_cov)  }\CommentTok{\# Cache Cholesky decomposition}
-
-        \CommentTok{\# Initialize state by sampling from prior}
-        \VariableTok{self}\NormalTok{.\_state\_f }\OperatorTok{=} \VariableTok{self}\NormalTok{.\_chol }\OperatorTok{@}\NormalTok{ np.random.randn(}\VariableTok{self}\NormalTok{.\_n)}
-
-    \KeywordTok{def}\NormalTok{ \_indiv\_sample(}\VariableTok{self}\NormalTok{):}
-        \CommentTok{"""}
-\CommentTok{        Main algorithm for generating an individual sample using Elliptical Slice Sampling.}
-\CommentTok{        """}
-\NormalTok{        f }\OperatorTok{=} \VariableTok{self}\NormalTok{.\_state\_f  }\CommentTok{\# Previous state}
-\NormalTok{        nu }\OperatorTok{=} \VariableTok{self}\NormalTok{.\_chol }\OperatorTok{@}\NormalTok{ np.random.randn(}\VariableTok{self}\NormalTok{.\_n)  }\CommentTok{\# Sample from prior for the ellipse}
-\NormalTok{        log\_y }\OperatorTok{=} \VariableTok{self}\NormalTok{.loglik(f) }\OperatorTok{+}\NormalTok{ np.log(np.random.uniform())  }\CommentTok{\# Log{-}likelihood threshold}
-
-\NormalTok{        theta }\OperatorTok{=}\NormalTok{ np.random.uniform(}\FloatTok{0.}\NormalTok{, }\DecValTok{2} \OperatorTok{*}\NormalTok{ np.pi)  }\CommentTok{\# Initial proposal angle}
-\NormalTok{        theta\_min, theta\_max }\OperatorTok{=}\NormalTok{ theta }\OperatorTok{{-}} \DecValTok{2} \OperatorTok{*}\NormalTok{ np.pi, theta  }\CommentTok{\# Define bracketing interval}
-
-        \CommentTok{\# Main loop: Accept sample if it meets log{-}likelihood threshold; otherwise, shrink the bracket.}
-        \ControlFlowTok{while} \VariableTok{True}\NormalTok{:}
-            \CommentTok{\# YOUR CODE HERE (\textasciitilde{}10 lines)}
-            \CommentTok{\# 1. Generate a new sample point based on the current angle.}
-            \CommentTok{\# 2. Check if the proposed point meets the acceptance criterion.            }
-            \CommentTok{\# 3. If not accepted, adjust the bracket and select a new angle.}
-            \ControlFlowTok{break}
-            \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-    \KeywordTok{def}\NormalTok{ sample(}\VariableTok{self}\NormalTok{,}
-\NormalTok{               n\_samples: }\BuiltInTok{int}\NormalTok{,}
-\NormalTok{               n\_burn: }\BuiltInTok{int} \OperatorTok{=} \DecValTok{500}\NormalTok{) }\OperatorTok{{-}\textgreater{}}\NormalTok{ np.ndarray:}
-        \CommentTok{"""}
-\CommentTok{        Generates samples using Elliptical Slice Sampling.}
-
-\CommentTok{        Args:}
-\CommentTok{        {-} n\_samples (int): Total number of samples to return.}
-\CommentTok{        {-} n\_burn (int): Number of initial samples to discard (burn{-}in).}
-
-\CommentTok{        Returns:}
-\CommentTok{        {-} np.ndarray: Array of samples after burn{-}in.}
-\CommentTok{        """}
-\NormalTok{        samples }\OperatorTok{=}\NormalTok{ []}
-        \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in}\NormalTok{ tqdm(}\BuiltInTok{range}\NormalTok{(n\_samples), desc}\OperatorTok{=}\StringTok{"Sampling"}\NormalTok{):}
-            \VariableTok{self}\NormalTok{.\_indiv\_sample()}
-            \ControlFlowTok{if}\NormalTok{ i }\OperatorTok{\textgreater{}}\NormalTok{ n\_burn:}
-\NormalTok{                samples.append(}\VariableTok{self}\NormalTok{.\_state\_f.copy())  }\CommentTok{\# Store sample post burn{-}in}
-
-        \ControlFlowTok{return}\NormalTok{ np.stack(samples)}
-
-\KeywordTok{def}\NormalTok{ sigmoid(x):}
-    \CommentTok{"""Sigmoid function to map values between 0 and 1."""}
-    \ControlFlowTok{return} \DecValTok{1} \OperatorTok{/}\NormalTok{ (}\DecValTok{1} \OperatorTok{+}\NormalTok{ np.exp(}\OperatorTok{{-}}\NormalTok{x))}
-
-\CommentTok{\# Step 1: Define a New Two{-}Dimensional Non{-}linear Function}
-\KeywordTok{def}\NormalTok{ nonlinear\_function(x1, x2):}
-    \CommentTok{"""}
-\CommentTok{    Computes a non{-}linear function of x1 and x2.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} x1 (np.array): First input array.}
-\CommentTok{    {-} x2 (np.array): Second input array.}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} np.array: Computed function values.}
-\CommentTok{    """}
-    \ControlFlowTok{return}\NormalTok{ np.log(}\DecValTok{1} \OperatorTok{+}\NormalTok{ x1) }\OperatorTok{+}\NormalTok{ np.log(}\DecValTok{1} \OperatorTok{+}\NormalTok{ x2)}
-
-\CommentTok{\# Generate a 2D grid of points}
-\NormalTok{x1 }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{20}\NormalTok{)}
-\NormalTok{x2 }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{20}\NormalTok{)}
-\NormalTok{x1\_grid, x2\_grid }\OperatorTok{=}\NormalTok{ np.meshgrid(x1, x2)}
-\NormalTok{x\_grid\_points }\OperatorTok{=}\NormalTok{ np.vstack([x1\_grid.ravel(), x2\_grid.ravel()]).T}
-\NormalTok{f\_values }\OperatorTok{=}\NormalTok{ nonlinear\_function(x\_grid\_points[:, }\DecValTok{0}\NormalTok{], x\_grid\_points[:, }\DecValTok{1}\NormalTok{])}
-
-\CommentTok{\# Step 2: Generate Preferences Using Bradley{-}Terry Model Over the Grid}
-\KeywordTok{def}\NormalTok{ generate\_preferences(f\_vals, num\_prefs}\OperatorTok{=}\DecValTok{10000}\NormalTok{):}
-    \CommentTok{"""}
-\CommentTok{    Generates preferences based on the Bradley{-}Terry model.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} f\_vals (np.array): Function values at grid points.}
-\CommentTok{    {-} num\_prefs (int): Number of preference pairs to generate.}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} list of tuple: Generated preference pairs (i, j).}
-\CommentTok{    """}
-\NormalTok{    preferences }\OperatorTok{=}\NormalTok{ []}
-\NormalTok{    num\_points }\OperatorTok{=} \BuiltInTok{len}\NormalTok{(f\_vals)}
-    \ControlFlowTok{for}\NormalTok{ \_ }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(num\_prefs):}
-\NormalTok{        i, j }\OperatorTok{=}\NormalTok{ np.random.choice(num\_points, size}\OperatorTok{=}\DecValTok{2}\NormalTok{, replace}\OperatorTok{=}\VariableTok{False}\NormalTok{)}
-        \CommentTok{\# Probability of preference using Bradley{-}Terry model}
-\NormalTok{        p\_ij }\OperatorTok{=}\NormalTok{ sigmoid(f\_vals[i] }\OperatorTok{{-}}\NormalTok{ f\_vals[j])}
-        \CommentTok{\# Decide preference based on random draw}
-        \ControlFlowTok{if}\NormalTok{ np.random.rand() }\OperatorTok{\textless{}}\NormalTok{ p\_ij:}
-\NormalTok{            preferences.append((i, j))}
-        \ControlFlowTok{else}\NormalTok{:}
-\NormalTok{            preferences.append((j, i))}
-    \ControlFlowTok{return}\NormalTok{ preferences}
-
-\NormalTok{preferences }\OperatorTok{=}\NormalTok{ generate\_preferences(f\_values)}
-
-\CommentTok{\# Step 3: Define the Likelihood Function for Elliptical Slice Sampling}
-\KeywordTok{def}\NormalTok{ loglik\_from\_preferences(f):}
-    \CommentTok{"""}
-\CommentTok{    Log{-}likelihood function using Bradley{-}Terry model for preferences.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} f (np.array): Sampled function values.}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} float: Log{-}likelihood value.}
-\CommentTok{    """}
-\NormalTok{    log\_lik }\OperatorTok{=} \DecValTok{0}
-    \ControlFlowTok{for}\NormalTok{ idx\_i, idx\_j }\KeywordTok{in}\NormalTok{ preferences:}
-        \CommentTok{\# YOUR CODE HERE (\textasciitilde{}2 lines)}
-        \ControlFlowTok{pass}
-        \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-    \ControlFlowTok{return}\NormalTok{ log\_lik}
-
-\CommentTok{\# Step 4: Define the RBF Kernel to Compute Prior Covariance Matrix}
-\KeywordTok{def}\NormalTok{ rbf\_kernel(X1, X2, length\_scale}\OperatorTok{=}\FloatTok{1.0}\NormalTok{, sigma\_f}\OperatorTok{=}\FloatTok{1.0}\NormalTok{):}
-    \CommentTok{"""}
-\CommentTok{    Computes the Radial Basis Function (RBF) kernel between two sets of points.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} X1, X2 (np.array): Input data points.}
-\CommentTok{    {-} length\_scale (float): Kernel length scale parameter.}
-\CommentTok{    {-} sigma\_f (float): Kernel output scale.}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} np.array: RBF kernel matrix.}
-\CommentTok{    """}
-\NormalTok{    sqdist }\OperatorTok{=}\NormalTok{ np.}\BuiltInTok{sum}\NormalTok{(X1}\OperatorTok{**}\DecValTok{2}\NormalTok{, axis}\OperatorTok{=}\DecValTok{1}\NormalTok{).reshape(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{) }\OperatorTok{+}\NormalTok{ np.}\BuiltInTok{sum}\NormalTok{(X2}\OperatorTok{**}\DecValTok{2}\NormalTok{, axis}\OperatorTok{=}\DecValTok{1}\NormalTok{) }\OperatorTok{{-}} \DecValTok{2} \OperatorTok{*}\NormalTok{ np.dot(X1, X2.T)}
-    \ControlFlowTok{return}\NormalTok{ sigma\_f}\OperatorTok{**}\DecValTok{2} \OperatorTok{*}\NormalTok{ np.exp(}\OperatorTok{{-}}\FloatTok{0.5} \OperatorTok{/}\NormalTok{ length\_scale}\OperatorTok{**}\DecValTok{2} \OperatorTok{*}\NormalTok{ sqdist)}
-
-\CommentTok{\# Define prior covariance (prior mean is zero vector)}
-\NormalTok{sigma\_prior }\OperatorTok{=}\NormalTok{ rbf\_kernel(x\_grid\_points, x\_grid\_points, length\_scale}\OperatorTok{=}\FloatTok{1.0}\NormalTok{, sigma\_f}\OperatorTok{=}\FloatTok{1.0}\NormalTok{)}
-
-\CommentTok{\# Add small jitter to diagonal for numerical stability}
-\NormalTok{jitter }\OperatorTok{=} \FloatTok{1e{-}6}
-\NormalTok{sigma\_prior }\OperatorTok{+=}\NormalTok{ jitter }\OperatorTok{*}\NormalTok{ np.eye(sigma\_prior.shape[}\DecValTok{0}\NormalTok{])}
-
-\CommentTok{\# Ensure the matrix is symmetric to avoid numerical issues}
-\NormalTok{sigma\_prior }\OperatorTok{=}\NormalTok{ (sigma\_prior }\OperatorTok{+}\NormalTok{ sigma\_prior.T) }\OperatorTok{/} \DecValTok{2}
-
-\CommentTok{\# Step 5: Run Elliptical Slice Sampling}
-\NormalTok{sampler }\OperatorTok{=}\NormalTok{ EllipticalSliceSampler(sigma\_prior, loglik\_from\_preferences)}
-\NormalTok{samples }\OperatorTok{=}\NormalTok{ sampler.sample(}\DecValTok{1000}\NormalTok{, n\_burn}\OperatorTok{=}\DecValTok{500}\NormalTok{)}
-\NormalTok{average\_samples }\OperatorTok{=}\NormalTok{ np.mean(samples, axis}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
-
-\CommentTok{\# Generate true function values on grid points}
-\NormalTok{true\_values\_on\_grid }\OperatorTok{=}\NormalTok{ nonlinear\_function(x\_grid\_points[:, }\DecValTok{0}\NormalTok{], x\_grid\_points[:, }\DecValTok{1}\NormalTok{])}
-
-\KeywordTok{def}\NormalTok{ evaluate\_elicited\_metric(true\_metric, elicited\_metric):}
-    \CommentTok{"""}
-\CommentTok{    Evaluates and prints the mean and standard deviation of the difference}
-\CommentTok{    between true and elicited metrics.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} true\_metric (np.array): True values of the function.}
-\CommentTok{    {-} elicited\_metric (np.array): Elicited (estimated) function values.}
-\CommentTok{    """}
-    \CommentTok{\# YOUR CODE HERE}
-    \ControlFlowTok{pass}
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-\NormalTok{evaluate\_elicited\_metric(true\_values\_on\_grid, average\_samples)}
-
-\CommentTok{\# Step 6: Plot the True Non{-}linear Function and Elicited Metric in 3D}
-\NormalTok{fig }\OperatorTok{=}\NormalTok{ plt.figure(figsize}\OperatorTok{=}\NormalTok{(}\DecValTok{12}\NormalTok{, }\DecValTok{8}\NormalTok{))}
-\NormalTok{ax }\OperatorTok{=}\NormalTok{ fig.add\_subplot(}\DecValTok{111}\NormalTok{, projection}\OperatorTok{=}\StringTok{\textquotesingle{}3d\textquotesingle{}}\NormalTok{)}
-
-\CommentTok{\# Plot the true function}
-\NormalTok{x1\_fine }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{50}\NormalTok{)}
-\NormalTok{x2\_fine }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{50}\NormalTok{)}
-\NormalTok{x1\_fine\_grid, x2\_fine\_grid }\OperatorTok{=}\NormalTok{ np.meshgrid(x1\_fine, x2\_fine)}
-\NormalTok{true\_f\_values }\OperatorTok{=}\NormalTok{ nonlinear\_function(x1\_fine\_grid, x2\_fine\_grid)}
-\NormalTok{ax.plot\_surface(x1\_fine\_grid, x2\_fine\_grid, true\_f\_values, color}\OperatorTok{=}\StringTok{\textquotesingle{}blue\textquotesingle{}}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.5}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}True Function\textquotesingle{}}\NormalTok{)}
-
-\CommentTok{\# Plot the averaged samples as a surface}
-\NormalTok{x1\_avg }\OperatorTok{=}\NormalTok{ x\_grid\_points[:, }\DecValTok{0}\NormalTok{].reshape(}\DecValTok{20}\NormalTok{, }\DecValTok{20}\NormalTok{)}
-\NormalTok{x2\_avg }\OperatorTok{=}\NormalTok{ x\_grid\_points[:, }\DecValTok{1}\NormalTok{].reshape(}\DecValTok{20}\NormalTok{, }\DecValTok{20}\NormalTok{)}
-\NormalTok{avg\_values }\OperatorTok{=}\NormalTok{ average\_samples.reshape(}\DecValTok{20}\NormalTok{, }\DecValTok{20}\NormalTok{)}
-\NormalTok{ax.plot\_surface(x1\_avg, x2\_avg, avg\_values, color}\OperatorTok{=}\StringTok{\textquotesingle{}red\textquotesingle{}}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.5}\NormalTok{, label}\OperatorTok{=}\StringTok{\textquotesingle{}Estimated Function\textquotesingle{}}\NormalTok{)}
-
-\CommentTok{\# Customize plot}
-\NormalTok{ax.set\_xlabel(}\StringTok{\textquotesingle{}x1\textquotesingle{}}\NormalTok{)}
-\NormalTok{ax.set\_ylabel(}\StringTok{\textquotesingle{}x2\textquotesingle{}}\NormalTok{)}
-\NormalTok{ax.set\_zlabel(}\StringTok{\textquotesingle{}f(x1, x2)\textquotesingle{}}\NormalTok{)}
-\NormalTok{ax.set\_title(}\StringTok{\textquotesingle{}True Function vs. Averaged Estimated Function\textquotesingle{}}\NormalTok{)}
-\NormalTok{plt.legend()}
-\NormalTok{plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\section*{References}\label{bibliography-3}
-\addcontentsline{toc}{section}{References}
-
-\markright{References}
-
-\phantomsection\label{refs-3}
-\begin{CSLReferences}{1}{0}
-\bibitem[\citeproctext]{ref-unnoisy_humans}
-Amershi, Saleema, Maya Cakmak, W. Bradley Knox, and Todd Kulesza. 2014.
-{``Power to the People: The Role of Humans in Interactive Machine
-Learning.''} \emph{AI Magazine}.
-
-\bibitem[\citeproctext]{ref-AL_committee}
-Beluch, William H., Tim Genewein, A. Nürnberger, and Jan M. Köhler.
-2018. {``The Power of Ensembles for Active Learning in Image
-Classification.''} \emph{2018 IEEE/CVF Conference on Computer Vision and
-Pattern Recognition}, 9368--77.
-\url{https://api.semanticscholar.org/CorpusID:52838058}.
-
-\bibitem[\citeproctext]{ref-pmlr-v87-biyik18a}
-Biyik, Erdem, and Dorsa Sadigh. 2018. {``Batch Active Preference-Based
-Learning of Reward Functions.''} In \emph{Proceedings of the 2nd
-Conference on Robot Learning}, edited by Aude Billard, Anca Dragan, Jan
-Peters, and Jun Morimoto, 87:519--28. Proceedings of Machine Learning
-Research. PMLR. \url{https://proceedings.mlr.press/v87/biyik18a.html}.
-
-\bibitem[\citeproctext]{ref-pref4}
-Braziunas, Darius, and Craig Boutilier. 2012. {``Minimax Regret Based
-Elicitation of Generalized Additive Utilities.''}
-\url{https://arxiv.org/abs/1206.5255}.
-
-\bibitem[\citeproctext]{ref-AL_expmodelchange}
-Cai, Wenbin, Ya Zhang, and Jun Zhou. 2013. {``Maximizing Expected Model
-Change for Active Learning in Regression.''} In \emph{2013 IEEE 13th
-International Conference on Data Mining}, 51--60.
-\url{https://doi.org/10.1109/ICDM.2013.104}.
-
-\bibitem[\citeproctext]{ref-AL_variance}
-Cohn, David A., Zoubin Ghahramani, and Michael I. Jordan. 1996.
-{``Active Learning with Statistical Models.''} \emph{CoRR}
-cs.AI/9603104. \url{https://arxiv.org/abs/cs/9603104}.
-
-\bibitem[\citeproctext]{ref-geo_paper}
-Jamieson, Kevin G., and Robert Nowak. 2011. {``Active Ranking Using
-Pairwise Comparisons.''} \emph{Advances in Neural Information Processing
-Systems} 24.
-
-\bibitem[\citeproctext]{ref-gandhi2022eliciting}
-Gandhi, Kanishk, Siddharth Karamcheti, Madeline Liao, and Dorsa Sadigh.
-2022. {``Eliciting Compatible Demonstrations for Multi-Human Imitation
-Learning.''} In \emph{Proceedings of the 6th Conference on Robot
-Learning (CoRL)}.
-
-\bibitem[\citeproctext]{ref-bias_variance_orig_paper}
-Geman, Stuart, Elie Bienenstock, and René Doursat. 1992. {``Neural
-Networks and the Bias/Variance Dilemma.''} \emph{Neural Computation}
-4:1--58. \url{https://api.semanticscholar.org/CorpusID:14215320}.
-
-\bibitem[\citeproctext]{ref-noisy_humans}
-Guillory, Andrew, and Jeff Bilmes. 2011. {``Simultaneous Learning and
-Covering with Adversarial Noise.''} \emph{ICML}.
-
-\bibitem[\citeproctext]{ref-pmlr-v89-hiranandani19a}
-Hiranandani, Gaurush, Shant Boodaghians, Ruta Mehta, and Oluwasanmi
-Koyejo. 2019a. {``Performance Metric Elicitation from Pairwise
-Classifier Comparisons.''} In \emph{Proceedings of the Twenty-Second
-International Conference on Artificial Intelligence and Statistics},
-edited by Kamalika Chaudhuri and Masashi Sugiyama, 89:371--79.
-Proceedings of Machine Learning Research. PMLR.
-\url{https://proceedings.mlr.press/v89/hiranandani19a.html}.
-
-\bibitem[\citeproctext]{ref-NEURIPS2019_1fd09c5f}
-Hiranandani, Gaurush, Shant Boodaghians, Ruta Mehta, and Oluwasanmi O
-Koyejo. 2019b. {``Multiclass Performance Metric Elicitation.''} In
-\emph{Advances in Neural Information Processing Systems}, edited by H.
-Wallach, H. Larochelle, A. Beygelzimer, F. d'Alché-Buc, E. Fox, and R.
-Garnett. Vol. 32. Curran Associates, Inc.
-\url{https://proceedings.neurips.cc/paper_files/paper/2019/file/1fd09c5f59a8ff35d499c0ee25a1d47e-Paper.pdf}.
-
-\bibitem[\citeproctext]{ref-nips}
-Hiranandani, Gaurush, Harikrishna Narasimhan, and Sanmi Koyejo. 2020.
-{``Fair Performance Metric Elicitation.''} In \emph{Advances in Neural
-Information Processing Systems}, edited by H. Larochelle, M. Ranzato, R.
-Hadsell, M. F. Balcan, and H. Lin, 33:11083--95. Curran Associates, Inc.
-\url{https://proceedings.neurips.cc/paper_files/paper/2020/file/7ec2442aa04c157590b2fa1a7d093a33-Paper.pdf}.
-
-\bibitem[\citeproctext]{ref-claus}
-Holladay, Rachel, Shervin Javdani, Anca Dragan, and Siddhartha
-Srinivasa. 2016. {``Active Comparison Based Learning Incorporating User
-Uncertainty and Noise.''} \emph{Proceedings of RSS '16 Workshop on Model
-Learning for Human-Robot Communication}.
-
-\bibitem[\citeproctext]{ref-AL_BALD}
-Houlsby, Neil, Ferenc Huszár, Zoubin Ghahramani, and Máté Lengyel. 2011.
-{``Bayesian Active Learning for Classification and Preference
-Learning.''} \emph{arXiv Preprint arXiv:1112.5745}.
-
-\bibitem[\citeproctext]{ref-AL_app_autonomous}
-Jarl, Sanna, Linus Aronsson, Sadegh Rahrovani, and Morteza Haghir
-Chehreghani. 2021. {``Active Learning of Driving Scenario
-Trajectories.''} \emph{Eng. Appl. Artif. Intell.} 113:104972.
-\url{https://api.semanticscholar.org/CorpusID:249113683}.
-
-\bibitem[\citeproctext]{ref-Li_2021}
-Li, Kejun, Maegan Tucker, Erdem Biyik, Ellen Novoseller, Joel W.
-Burdick, Yanan Sui, Dorsa Sadigh, Yisong Yue, and Aaron D. Ames. 2021.
-{``ROIAL: Region of Interest Active Learning for Characterizing
-Exoskeleton Gait Preference Landscapes.''} In \emph{2021 IEEE
-International Conference on Robotics and Automation (ICRA)}. IEEE.
-\url{https://doi.org/10.1109/icra48506.2021.9560840}.
-
-\bibitem[\citeproctext]{ref-AL_app_LLMs}
-Margatina, Katerina, Timo Schick, Nikolaos Aletras, and Jane Dwivedi-Yu.
-2023. {``Active Learning Principles for in-Context Learning with Large
-Language Models.''} \emph{ArXiv} abs/2305.14264.
-\url{https://api.semanticscholar.org/CorpusID:258841313}.
-
-\bibitem[\citeproctext]{ref-pref2}
-Mas-Colell, Andreu. 1977. {``The Recoverability of Consumers'
-Preferences from Market Demand Behavior.''} \emph{Econometrica} 45 (6):
-1409--30. \url{http://www.jstor.org/stable/1912308}.
-
-\bibitem[\citeproctext]{ref-AL_experrorredn}
-Mussmann, Stephen, Julia Reisler, Daniel Tsai, Ehsan Mousavi, Shayne
-O'Brien, and Moises Goldszmidt. 2022. {``Active Learning with Expected
-Error Reduction.''} \url{https://arxiv.org/abs/2211.09283}.
-
-\bibitem[\citeproctext]{ref-pmlr-v37-narasimhanb15}
-Narasimhan, Harikrishna, Harish Ramaswamy, Aadirupa Saha, and Shivani
-Agarwal. 2015. {``Consistent Multiclass Algorithms for Complex
-Performance Measures.''} In \emph{Proceedings of the 32nd International
-Conference on Machine Learning}, edited by Francis Bach and David Blei,
-37:2398--2407. Proceedings of Machine Learning Research. Lille, France:
-PMLR. \url{https://proceedings.mlr.press/v37/narasimhanb15.html}.
-
-\bibitem[\citeproctext]{ref-pref1}
-Samuelson, P. A. 1938. {``A Note on the Pure Theory of Consumer's
-Behaviour.''} \emph{Economica} 5 (17): 61--71.
-\url{http://www.jstor.org/stable/2548836}.
-
-\bibitem[\citeproctext]{ref-lus-shep}
-Shepard, Roger N. 1957. {``Stimulus and Response Generalization: A
-Stochastic Model Relating Generalization to Distance in Psychological
-Space.''} \emph{Psychometrika} 22(4):325--345.
-
-\bibitem[\citeproctext]{ref-AL_app_sensors}
-Singh, Aarti, Robert D. Nowak, and Parameswaran Ramanathan. 2006.
-{``Active Learning for Adaptive Mobile Sensing Networks.''} \emph{2006
-5th International Conference on Information Processing in Sensor
-Networks}, 60--68.
-\url{https://api.semanticscholar.org/CorpusID:17590956}.
-
-\bibitem[\citeproctext]{ref-ab}
-Tamburrelli, Giordano, and Alessandro Margara. 2014. {``Towards
-Automated a/b Testing.''} In \emph{Search-Based Software Engineering}.
-\url{https://doi.org/10.1007/978-3-319-09940-8_13}.
-
-\bibitem[\citeproctext]{ref-AL_app_robotics}
-Taylor, Annalisa T., Thomas A. Berrueta, and Todd D. Murphey. 2021.
-{``Active Learning in Robotics: A Review of Control Principles.''}
-\emph{ArXiv} abs/2106.13697.
-\url{https://api.semanticscholar.org/CorpusID:235652039}.
-
-\bibitem[\citeproctext]{ref-pref3}
-Varian, Hal R. 2006. {``Revealed Preference.''} In \emph{The SAGE
-Encyclopedia of Business Ethics and Society}.
-\url{https://api.semanticscholar.org/CorpusID:1632873}.
-
-\bibitem[\citeproctext]{ref-lus-log}
-Viappiani, Paolo, and Craig Boutilier. 2010. {``Optimal Bayesian
-Recommendation Sets and Myopically Optimal Choice Query Sets.''}
-\emph{NIPS}, 2352--60.
-
-\bibitem[\citeproctext]{ref-YangNaiman+2014+477+496}
-Yang, Sitan, and Daniel Q. Naiman. 2014. {``Multiclass Cancer
-Classification Based on Gene Expression Comparison.''} \emph{Statistical
-Applications in Genetics and Molecular Biology} 13 (4): 477--96.
-\url{https://doi.org/10.1515/sagmb-2013-0053}.
-
-\bibitem[\citeproctext]{ref-AL_uncertainty}
-Zhu, Jingbo, Huizhen Wang, Benjamin Ka-Yin T'sou, and Matthew Y. Ma.
-2010. {``Active Learning with Sampling by Uncertainty and Density for
-Data Annotations.''} \emph{IEEE Transactions on Audio, Speech, and
-Language Processing} 18:1323--31.
-\url{https://api.semanticscholar.org/CorpusID:5777911}.
-
-\end{CSLReferences}
-
-\bookmarksetup{startatroot}
-
-\chapter{Decisions}\label{decisions}
-
-\section{Dueling Bandit}\label{dueling-bandit}
-
-The multi-armed bandit (MAB) problem involves a gambler deciding which
-lever to pull on an MAB machine to maximize the winning rate, despite
-not knowing which machine is the most rewarding. This scenario
-highlights the need to balance exploration (trying new machines to
-discover potential higher rewards) and exploitation (using current
-knowledge to maximize gains). MAB algorithms address this dilemma by
-making decisions under uncertainty to achieve the best possible outcomes
-based on gathered data. At the core of the MAB problem is a set of
-actions, or `arms,' denoted by \(\mathcal{A} = \{1, 2, \ldots, K\}\),
-where \(K\) signifies the total number of arms. For each round \(t\),
-the agent selects an arm \(a_t \in \mathcal{A}\) and receives a reward
-\(r_t\), sampled from an arm-specific, unknown probability distribution.
-The expected reward of pulling arm \(a\) is represented as
-\(\mu_a = \mathbb{E}[r_t | a]\).
-
-The multi-armed bandit framework can be extended in various ways to
-model more complex scenarios. In the infinite-armed bandit problem, the
-set of possible arms \(\mathcal{A}\) is either very large or infinite.
-This introduces significant challenges in exploration, as the agent
-cannot afford to explore each arm even once. Algorithms for
-infinite-armed bandits typically assume some regularity or structure of
-the reward function across arms to make the problem tractable. The
-contextual bandit problem extends the bandit framework by incorporating
-observable external states or contexts that influence the reward
-distributions of arms. The agent's task is to learn policies that map
-contexts to arms to maximize reward. This model is particularly powerful
-for personalized recommendations, where the context can include user
-features or historical interactions. In dueling bandit problems, the
-agent chooses two arms to pull simultaneously and receives feedback only
-on which of the two is better, not the actual reward values. This
-pairwise comparison model is especially useful in scenarios where
-absolute evaluations are difficult, but relative preferences are easier
-to determine, such as in ranking systems.
-
-Contextual bandits extend the multi-armed bandits by making decisions
-conditional on the state of the environment and previous observations.
-The benefit of such a model is that observing the environment can
-provide additional information, potentially leading to better rewards
-and outcomes. In each iteration, the agent is presented with the context
-of the environment, then decides on an action based on the context and
-previous observations. Finally, the agent observes the action's outcome
-and reward. Throughout this process, the agent aims to maximize the
-expected reward.
-
-In many real-world contexts, one may not have a real-valued reward (or
-at least a reliable one) associated with a decision. Instead, we may
-only have observations indicating which of a set of bandits was optimal
-in a given scenario. The assumption is that within these observations of
-preferred choices among a set of options, there is an implicit reward or
-payoff encapsulated in that decision. Consider the following examples:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\item
-  \textbf{Dietary preferences}: When providing food recommendations to
-  humans, it is often not possible to quantify an explicit reward from
-  recommending a specific food item. Instead, we can offer meal options
-  and observe which one the person selects.
-\item
-  \textbf{Video recommendation}: Websites like YouTube and TikTok
-  recommend specific videos to users. It is typically not feasible to
-  measure the reward a person gains from watching a video. However, we
-  can infer that a user preferred one video over another. From these
-  relative preference observations, we can develop a strategy to
-  recommend videos they are likely to enjoy.
-\item
-  \textbf{Exoskeleton gait optimization}: Tucker et al.~(2020) created a
-  framework that uses human-evaluated preferences for an exoskeleton
-  gait algorithm to develop an optimal strategy for the exoskeleton to
-  assist a human in walking. A human cannot reliably produce a numerical
-  value for how well the exoskeleton helped them walk but can reliably
-  indicate which option performed best according to their preferences.
-\end{enumerate}
-
-Generally, we assume access to a set of actions. A noteworthy assumption
-is that any observations we make are unbiased estimates of the payoff.
-This means that if we observe a human preferred one option over another
-(or several others), the preferred option had a higher implicit reward
-or payoff than the alternatives. In the case of dietary preferences,
-this may mean that a human liked the preferred option; in the case of
-video recommendations, a user was more entertained, satisfied, or
-educated by the video they selected than the other options.
-
-The overarching context is that we do not have direct or reliable access
-to rewards. We may not have a reward at all (for some decisions, it may
-be impossible to define a real value to the outcome), or it may be noisy
-(for example, if we ask a human to rate their satisfaction on a scale of
-1 to 10). We use relative comparisons to evaluate the best of multiple
-options in this case. Our goal is to minimize total regret in the face
-of noisy comparisons. Humans may not always provide consistent
-observations (since human decision-making is not guaranteed to be
-consistent). However, we can still determine an optimal strategy with
-the observed comparisons. We aim to minimize the frequency of
-sub-optimal decisions according to human preferences. In practice, many
-formulations of bandits can allow for infinitely many bandits (for
-example, in continuous-value and high-dimensional spaces). However, this
-situation can be intractable when determining an optimal decision
-strategy. With infinite options, how can we always ensure we have chosen
-the best? We will constrain our bandits to a discrete space to enable
-efficient exploration. We will assume that we have \(K\) bandits,
-\(b_i, i \in \{1, \ldots, K\}\), and our task is to choose the one that will
-minimize regret.
-
-With the framework outlined, we now define our approach more formally.
-This method was introduced by (\citeproc{ref-YUE20121538}{Yue et al.
-2012}), and proofs for the guarantees and derivations of parameters can
-be found in their work.
-
-To determine the optimal action, we will compare pairwise to ascertain
-the probability that an action \(b_i\) is preferred over another
-\(b_j\), where \(i \ne j\). Concretely, we assume access to a function
-\(\epsilon\) that helps determine this probability; in practice, this
-can be done with an oracle, such as asking a human which of two options
-they prefer: \[P(b_i > b_j) = \epsilon(b_i, b_j) + \frac{1}{2}.\]
-With this model, three basic properties govern the values provided by
-\(\epsilon\):
-\[\epsilon(b_i, b_j) = -\epsilon(b_j, b_i), \epsilon(b_i, b_i) = 0, \epsilon(b_i, b_j) \in \left(-\frac{1}{2}, \frac{1}{2} \right).\]
-
-We assume there is a total ordering of bandits, such that
-\(b_i \succ b_j\) implies \(\epsilon(b_i, b_j) > 0\). We impose two
-constraints to properly model comparisons:
-
-\begin{itemize}
-\item
-  \textbf{Strong Stochastic Transitivity}: We must maintain our total
-  ordering of bandits, and as such, the comparison model also respects
-  this ordering:
-  \begin{equation}\phantomsection\label{eq-stochastic-transitivity}{b_i \succ b_j \succ b_k \Rightarrow \epsilon(b_i, b_k) \ge \text{max}\{\epsilon(b_i, b_j), \epsilon(b_j, b_k)\}.}\end{equation}
-\item
-  \textbf{Stochastic Triangle Inequality}: We also impose a triangle
-  inequality, which captures the condition that the probability of a
-  bandit winning (or losing) a comparison will exhibit diminishing
-  returns as it becomes increasingly superior (or inferior) to the
-  competing bandit:
-  \begin{equation}\phantomsection\label{eq-triangle-inequality}{b_i \succ b_j \succ b_k \Rightarrow \epsilon(b_i, b_k) \le \epsilon(b_i, b_j) + \epsilon(b_j, b_k).}\end{equation}
-\end{itemize}
-
-These assumptions may initially seem limiting; however, common models
-for comparisons satisfy these constraints. For example, the
-Bradley-Terry Model follows
-\(P(b_i > b_j) = \frac{\mu_i}{\mu_i + \mu_j}\). The Gaussian model with
-unit variance also satisfies these constraints:
-\(P(b_i > b_j) = P(X_i - X_j > 0)\), where
-\(X_i - X_j \sim N(\mu_i - \mu_j, 2)\).
-
-To accurately model the preferences between bandits in our framework of
-pairwise bandit comparisons and regret, we must track certain parameters
-in our algorithm. First, we will maintain a running empirical estimate
-of the probability of bandit preferences based on our observations. It
-is important to note that we do not have direct access to an
-\(\epsilon\) function. Instead, we must present two bandits to a human,
-who selects a winner. To do this, we define:
-\[\hat{P}_{i, j} = \frac{\#\ b_i\ \text{wins}}{\#\ \text{comparisons between}\ i\ \text{and}\ j}.\]
-
-We will also compute confidence intervals at each timestep for each of
-the entries in \(\hat{P}\) as
-\[\hat{C}_t = \left( \hat{P}_t - c_t, \hat{P}_t + c_t \right),\] where
-\(c_t = \sqrt{\frac{4\log(\frac{1}{\delta})}{t}}\). Note that
-\(\delta = \frac{1}{TK^2}\), where \(T\) is the time horizon and \(K\)
-is the number of bandits.
-
-Previously, we discussed approaches for finding the best action in a
-specific context. Now, we consider changing contexts, which means there
-is no longer a static hidden preference matrix \(P\). Instead, at every
-time step, there is a preference matrix \(P_C\) depending on context
-\(C\). We consider a context \(C\) and a preference matrix \(P_C\) to be
-chosen by nature as a result of the given environment (Yue et al.,
-2012). The goal of a contextual bandits algorithm is to find a policy
-\(\pi\) that maps contexts to a Von Neumann winner distribution over our
-bandits. That is, our policy \(\pi\) should map any context to some
-distribution over our bandits such that sampling from that distribution
-is preferred to a random action for that context.
-
-\subsection{Regret}\label{regret}
-
-The agent aims to pick a sequence of arms \((a_1, a_2, \ldots, a_T)\)
-across a succession of time steps \(t = 1\) to \(t = T\) to maximize the
-total accumulated reward. Formally, the strategy seeks to maximize the
-sum of the expected rewards:
-\(\max_{a_1, \ldots, a_T} \mathbb{E} \left[\sum_{t=1}^{T} r_t\right]\).
-Regret is defined as the difference between the cumulative reward that
-could have been obtained by always pulling the best arm (in hindsight,
-after knowing the reward distributions) and the cumulative reward
-actually obtained by the algorithm. Formally, if \(\mu^*\) is the
-expected reward of the best arm and \(\mu_{a_t}\) is the expected reward
-of the arm chosen at time \(t\), the regret after \(T\) time steps is
-given by \(R(T) = T \cdot \mu^* - \sum_{t=1}^{T} \mu_{a_t}\). The
-objective of a bandit algorithm is to minimize this regret over time,
-effectively learning to make decisions that are as close as possible to
-the decisions of an oracle that knows the reward distributions
-beforehand. Low regret indicates an algorithm that has often learned to
-choose well-performing arms, balancing the exploration of unknown arms
-with the exploitation of arms that are already known to perform well.
-Thus, an efficient bandit algorithm exhibits sub-linear regret growth,
-meaning that the average regret per round tends to zero as the number of
-rounds \(T\) goes to infinity:
-\(\lim_{T \to \infty} \frac{R(T)}{T} = 0\). Minimizing regret is a
-cornerstone in the design of bandit algorithms, and its analysis helps
-in understanding the long-term efficiency and effectiveness of different
-bandit strategies.
-
-As previously discussed, our goal is to select the bandit that minimizes
-a quantity that reflects regret or the cost of not selecting the optimal
-bandit at all times. We can leverage our comparison model to define a
-quantity for regret over some time horizon \(T\), which is the number of
-decisions we make (selecting what we think is the best bandit at each
-iteration). Assuming we know the best bandit \(b^*\) (and we know that
-there \emph{is} a best bandit, since there is a total ordering of our
-discrete bandits), we can define two notions of regret:
-
-\begin{itemize}
-\item
-  Strong regret: aims to capture the fraction of users who would prefer
-  the optimal bandit \(b^*\) over the \emph{worse} of the options
-  \(b_1, b_2\) we provide at a given
-  step: \(R_T = \sum_{t = 1}^T \max \left\{ \epsilon(b^*, b_1^{(t)}), \epsilon(b^*, b_2^{(t)}) \right\}\)
-\item
-  Weak regret: aims to capture the fraction of users who would prefer
-  the optimal bandit \(b^*\) over the \emph{better} of the options
-  \(b_1, b_2\) we provide at a given
-  step: \(\tilde{R}_T = \sum_{t = 1}^T \min \left\{ \epsilon(b^*, b_1^{(t)}), \epsilon(b^*, b_2^{(t)}) \right\}\)
-\end{itemize}
-
-The best bandit described in our regret definition is called a
-\textbf{Condorcet Winner}. This is the strongest form of winner. It's
-the action \textbf{\(A_{i}\)} which is preferred to each other action
-\textbf{\(A_j\)} with \(p > 0.5\) in a head-to-head election. While the
-above introduced notions of regret assume an overall best bandit to
-exist, there might be settings, where no bandit wins more than half
-head-to-head duels. A set of actions without a Condorcet winner is
-described by the following preference matrix, where each entry
-\(\Delta_{jk}\) is \(p(j \succ k) - 0.5\), the probability that action
-\(j\) is preferred over action \(k\) minus 0.5. There is no Condorcet
-winner as there is no action that is preferred with \(p > 0.5\) over all
-other actions. Imagine, you want to find the best pizza to eat
-(=action). There may not be a pizza that wins more than half of the
-head-to-head duels against every other pizza.
-
-However, we might still have an intuition of the best pizza. Therefore
-Sui et al., 2018 introduce the concepts of different
-\(\textit{winners}\) in dueling bandit problems
-(\citeproc{ref-advancements_dueling}{Sui et al. 2018}). In this example,
-we might define the best pizza as the most popular one. We call the
-Pizza receiving the most votes in a public vote the \textbf{Borda
-Winner}, or formally, Borda winner
-\(j^* = \arg\max_{j \in A} \sum_{i \in A, i \neq j} p(j \succ i)\). In
-contrast to the Condorcet Winner setting, there is always guaranteed to
-be one or more (in the case of a tie) Borda winners for a set of
-actions. However - if there is a Condorcet Winner, this might not
-necessarily be the same as a Borda Winner: In our Pizza example, a
-Pepperoni Pizza might win more than half of its head-to-head duels,
-while the Cheese-Pizza is still the most popular in a public poll.
-
-A more generic concept of winner is the \textbf{Von Neumann Winner},
-which describes a probability distribution rather than a single bandit
-winner. A Von Neumann winner simply prescribes a probability
-distribution \(W\) such that sampling from this distribution `beats' an
-action from the random uniform distribution with \(p > 0.5\). In our
-pizza example, this would correspond to trusting a friend to order
-whichever Pizza he likes, because this may still be preferred to
-ordering randomly. Formally, \(W\) is a Von Neumann winner if
-\(\Pr_{j \sim W, k \sim R}\left[p(j \succ k) > 0.5\right] > 0.5\), where \(R\)
-describes the uniform probability distribution over our actions. The
-concept of a Von Neumann winner is useful in contextual bandits, which
-will be introduced later. In these settings, the preference matrix
-depends on different context, which may have different Borda winners,
-just as different parties may vote for different pizzas.
-
-\begin{figure}
-
-\centering{
-
-\begin{longtable*}[]{@{}lcccccc@{}}
-\toprule\noalign{}
-& A & B & C & D & E & F \\
-\midrule\noalign{}
-\endhead
-\bottomrule\noalign{}
-\endlastfoot
-A & 0 & \textbf{0.03} & \textbf{-0.02} & 0.06 & 0.10 & 0.11 \\
-B & -0.03 & 0 & \textbf{0.03} & 0.05 & 0.08 & 0.11 \\
-C & 0.02 & -0.03 & 0 & 0.04 & 0.07 & 0.09 \\
-D & -0.06 & -0.05 & -0.04 & 0 & 0.05 & 0.07 \\
-E & -0.10 & -0.08 & -0.07 & -0.05 & 0 & 0.03 \\
-F & -0.11 & -0.11 & -0.09 & -0.07 & -0.03 & 0 \\
-\end{longtable*}
-
-}
-
-\caption{\label{fig-condorcet_violation}Violation of Condorcet Winner.
-Highlighted entries are different from Table 1. No Condorcet winner
-exists as no arm could beat every other arm.}
-
-\end{figure}%
-
-Next, we introduce two performance measures for the planner. The
-\textbf{asymptotic ex-post regret} is defined as
-\[\text{Regret}(\mu_1, \ldots, \mu_K) = T\cdot \max_i \mu_i - \sum_{t=1}^T E[\mu_{I_t}].\]
-
-Intuitively, this represents the difference between the reward achieved
-by always taking the action with the highest possible reward and the
-expected welfare of the recommendation algorithm (based on the actions
-it recommends at each timestep).
-
-We also define a weaker performance measure, the \textbf{Bayesian
-regret}, which is defined as
-\[\text {Bayesian regret}=E_{\mu_1, \ldots, \mu_K \sim \text {Prior}}\left[\operatorname{Regret}\left(\mu_1, \ldots, \mu_K\right)\right]\]
-
-With a Bayesian optimal policy, we would like either definition of
-regret to vanish as \(T\to \infty\); we are considering ``large-market
-optimal'' settings where there are many short-lived, rather than a few
-long-term, users. Note the fact that ex-post regret is prior-free makes
-it robust to inaccuracies on the prior.
-
-\subsection{Acquisition Functions}\label{acquisition-functions}
-
-Various strategies have been developed to balance the
-exploration-exploitation trade-off. These strategies differ in selecting
-arms based on past experiences and rewards.
-
-\subsubsection{Classical Acquisition
-Functions}\label{classical-acquisition-functions}
-
-\textbf{Uniform} acquisition function is the most straightforward
-approach where each arm is selected uniformly randomly over time. This
-strategy does not consider the past rewards and treats each arm equally
-promising regardless of the observed outcomes. It is a purely
-explorative strategy that ensures each arm is sampled enough to estimate
-its expected reward, but it does not exploit the information to optimize
-rewards. In mathematical terms, if \(N_t(a)\) denotes the number of
-times arm \(a\) has been selected up to time \(t\), the Uniform Strategy
-would ensure that \(N_t(a) \approx \frac{t}{K}\) for all arms \(a\) as
-\(t\) grows large: \(P(a_t = a) = \frac{1}{K}\)
-
-The \textbf{Epsilon Greedy} is a popular method that introduces a
-balance between exploration and exploitation. With a small probability
-\(\epsilon\), it explores by choosing an arm at random, and with a
-probability \(1 - \epsilon\), it exploits by selecting the arm with the
-highest estimated reward so far. This strategy incrementally favors
-actions that have historically yielded higher rewards, but still allows
-for occasional exploration to discover better options potentially. The
-parameter \(\epsilon\) is chosen based on the desired exploration level,
-often set between 0.01 and 0.1. \[P(a_t = a) =
-\begin{cases} 
-\frac{\epsilon}{K} + 1 - \epsilon & \text{if } a = \arg\max_{a'} \hat{\mu}_{a'} \\
-\frac{\epsilon}{K} & \text{otherwise}
-\end{cases}\]
-
-\textbf{Upper Confidence Bound} (UCB) acquisition function takes a more
-sophisticated approach to the exploration-exploitation dilemma. It
-selects arms based on both the estimated rewards and the uncertainty or
-variance associated with those estimates. Specifically, it favors arms
-with high upper confidence bounds on the estimated rewards, which is a
-sum of the estimated mean and a confidence interval that decreases with
-the number of times the arm has been played. This ensures that arms with
-less certainty (those played less often) are considered more often,
-naturally balancing exploration with exploitation as the uncertainty is
-reduced over time.
-
-\[P(a_t = a) =
-\begin{cases} 
-1 & \text{if } a = \arg\max_{a'} \left( \hat{\mu}_{a'} + \sqrt{\frac{2 \ln t}{N_t(a')}} \right) \\
-0 & \text{otherwise}
-\end{cases}\]
-
-\subsubsection{Interleaved Filter}\label{interleaved-filter}
-
-This algorithm tries to find the best bandit (Condorcet Winner) in a
-discrete, limited bandit-space via pairwise comparisons of the bandits.
-We will now introduce the algorithm for the Interleaved Filter as
-provided in (\citeproc{ref-YUE20121538}{Yue et al. 2012}) to solve a
-dueling bandit setup. It starts with a randomly defined \emph{best
-bandit} \(\hat{b}\) and iteratively compares it to set \(W\) containing
-the remaining bandits \(b\) resulting in winning probabilities
-\(\hat{P}_{\hat{b},b}\) and confidence interval \(\hat{C}_{\hat{b},b}\).
-If a bandit \(b\) is \emph{confidently worse} than \(\hat{b}\), it is
-removed from \(W\). If a bandit \(b'\) is \emph{confidently better} than
-\(\hat{b}\), it is set as new \emph{best bandit} \(\hat{b}\) and bandit
-\(\hat{b}\) as well as every other bandit \(b\) \emph{worse} than
-\(\hat{b}\) are removed from \(W\). This is done, until \(W\) is empty,
-leaving the final \(\hat{b}\) as the predicted best bandit.
-
-\textbf{input:} \(T\), \(B=\{b_1, \dots, b_K\}\)
-\(\delta \gets 1/(TK^2)\) Choose \(\hat{b} \in B\) randomly
-\(W \gets \{b_1, \dots, b_K\} \backslash \{\hat{b}\}\)
-\(\forall b \in W\), maintain estimate \(\hat{P}_{\hat{b},b}\) of
-\(P(\hat{b} > b)\) according to (6) \(\forall b \in W\), maintain
-\(1 - \delta\) confidence interval \(\hat{C}_{\hat{b},b}\) of
-\(\hat{P}_{\hat{b},b}\) according to (7), (8) compare \(\hat{b}\) and
-\(b\) update \(\hat{P}_{\hat{b},b}\), \(\hat{C}_{\hat{b},b}\)
-\(W \gets W \backslash \{b\}\)
-
-\(W \gets W \backslash \{b\}\) \(\hat{b} \gets b'\),
-\(W \gets W \backslash \{b'\}\) \(\forall b \in W\), reset
-\(\hat{P}_{\hat{b},b}\) and \(\hat{C}_{\hat{b},b}\) \(\hat{T} \gets\)
-Total Comparisons Made \((\hat{b}, \hat{T})\)
-
-\begin{description}
-\item[Parameter Initialization]
-In lines 1-6 of the algorithm, we take the inputs and first compute the
-value \(\delta\) which is used to compute our confidence intervals. We
-select an initial guess of an optimal bandit \(\hat{b}\) by uniformly
-sampling from all bandits \(\mathcal{B}\). We also keep a running set of
-bandit candidates \(W\), which is initialized to be
-\(\mathcal{B} \setminus \{\hat{b}\}\). At this point, we also initialize
-our empirical estimates for \(\hat{P}, \hat{C}\).
-
-Next, we will repeat several steps until our working set of bandit
-candidates \(W\) is empty.
-\item[Update Estimates Based on Comparisons]
-The first step at each iteration (lines 8-11) is to look at all
-candidates in \(W\), and compare them to our current guess \(\hat{b}\)
-using an oracle (e.g.~by asking a human which of \(\hat{b}\) or
-\(b \in W\) is preferred). With this new set of wins and comparisons, we
-update our estimates of \(\hat{P}, \hat{C}\).
-\item[Prune Suboptimal Bandits]
-In lines 12-13, with updated comparison win probabilities and
-corresponding confidence intervals, we can remove bandit candidates from
-\(W\) that we are \emph{confident} \(\hat{b}\) is better than. The
-intuition here is that we are mostly sure that our current best guess is
-better than some of the candidates, and we don't need to consider those
-candidates in future iterations.
-\item[Check for Better Bandits from Candidate Set]
-Now that our candidate set of bandits may be smaller, in lines 15-21 we
-check if there are any bandits \(b'\) that we are \emph{confident} are
-better than our current best guess. If we do find such a candidate, we
-remove bandits which \(\hat{P}\) indicates \(b\) is \emph{likely} worse
-than \(\hat{b}\). Note that in this step, we do not require the
-probability to be outside the confidence interval, since we already
-found one we believe to be significantly closer to optimal than our
-current best guess.
-
-Once we remove the candidates \emph{likely} worse than \(\hat{b}\), we
-crown \(b'\) as the new best guess, i.e.~\(\hat{b} := b'\).
-Consequently, we remove \(b'\) from \(W\) and reset our empirical win
-counters \(\hat{P}, \hat{C}\).
-\end{description}
-
-With this algorithm defined, let us look at some provisions of the
-method with respect to identifying the optimal strategy. Note that the
-proofs and derivations for these quantities are provided in
-(\citeproc{ref-YUE20121538}{Yue et al. 2012}).
-
-First, the method guarantees that for the provided time horizon \(T\),
-the algorithm returns the correct bandit with probability
-\(P \ge 1 - \frac{1}{T}\). It is interesting and useful to note that if
-one has a strict requirement for the probability of identifying the
-correct bandit, one can compute the time horizon \(T\) that guarantees
-this outcome at that probability. Furthermore, a time horizon of 1
-leaves no probabilistic guarantee of a successful outcome, and
-increasing \(T\) has diminishing returns. Second, in the event that the
-algorithm returns an incorrect bandit, the maximal regret incurred is
-linear with respect to \(T\), i.e.~\(\mathcal{O}(T)\). This is also a
-useful provision as it allows us to estimate the overall cost in the
-worst case outcome. Based on these two provisions, we can compute the
-expected cumulative regret from running the Interleaved Filter
-algorithm, which is:
-\[\mathbb{E}\left[R_T\right] \le \left(1 - \frac{1}{T}\right) \mathbb{E}\left[ R_T^{IF} \right] + \frac{1}{T}\mathcal{O}(T) \\
-= \mathcal{O}\left(\mathbb{E}\left[ R_T^{IF} \right] + 1\right)\]
-
-Interestingly, the original work shows that these bounds hold for both
-strong and weak regret. As demonstrated, the Interleaved Filter
-algorithm \hyperref[fig-if]{{[}fig-if{]}} provides a robust method to
-ascertain the optimal bandit or strategy given a set of options and only
-noisy comparisons. In most real-world scenarios for modeling human
-preferences, it is not possible to observe a real-world reward value, or
-at least a reliable one and as such this method is a useful way to
-properly model human preferences.
-
-Furthermore, the algorithm provides strong guarantees for the
-probability of selecting the correct bandit, maximal regret, and the
-number of comparisons required. It is even more impressive that the
-method can do so without severely limiting constraints; as demonstrated,
-the most commonly used models satisfy the imposed constraints.
-
-As we look to model human preferences, we can certainly leverage this
-method for k-armed dueling bandits to identify the best strategy to
-solve human-centric challenges, from video recommendation to meal
-selection and exoskeleton-assisted walking.
-
-\subsubsection{Dueling Bandit Gradient
-Descent}\label{dueling-bandit-gradient-descent}
-
-This algorithm tries to find the best bandit in a continuous
-bandit-space. Here, the set of all bandits is regarded as an
-Information-Retrieval (IR) system with infinite bandits uniquely defined
-by \(w\). We will cover the \emph{Dueling Bandit Gradient Descent}
-algorithm from Yue and Joachims 2009 (\citeproc{ref-IR}{Yue and Joachims
-2009}). Yue and Joachims use the dueling bandits formulation for online
-IR optimization. They propose a retrieval system parameterized by a set
-of continuous variables lying in \(W\), a \(d\)-dimensional unit-sphere.
-The DBGD algorithm adapts the current parameters \(w_t\) of the IR system
-by comparing them with slightly altered parameters \(w_t'\), both evaluated
-on the same query \(q_t\). Only if the IR outcome using \(w_t'\) is
-preferred are the parameters moved in that direction. We will now discuss
-the algorithm in more detail.
-
-\textbf{input:} \(\gamma\), \(\delta\), \(w_1\)
-
-Sample unit vector \(u_t\) uniformly
-
-\(w_t' \gets P_W(w_t + \delta u_t)\)
-
-Compare \(w_t\) and \(w_t'\)
-
-\(w_{t+1} \gets P_W(w_t + \gamma u_t)\)
-
-\(w_{t+1} \gets w_t\)
-
-We first choose exploration step length \(\delta\), exploitation step
-length \(\gamma\), and starting point (in unit-sphere) \(w_1\). Choose a
-query and sample a random unit vector \(u_t\). We duel \(w_t\) and
-\(w_t'\), where \(w_t\) is our current point in the sphere, and \(w_t'\)
-is our exploratory comparison, which is generated by taking a random
-step of length \(\delta\), such that \(w_t' = w_t + \delta u_t\). The
-objective of this duel is to ascertain the binary preference of users
-with respect to the results yielded by the IR systems parameterized by
-\(w_t\) and \(w_t'\) respectively, taking query \(q_t\) as an input. The
-parameters that get the majority of the votes in the head to head win.
-If \(w_t\) wins, then we keep the parameters for the next iteration. If
-\(w_t'\) wins the duel, we update our parameters in the direction of
-\(u_t\) by taking a step of length \(\gamma\). Note that the algorithm
-describes projection operation \(P_W(\overrightarrow{v})\). Since
-\(u_t\) is chosen randomly, \(w_t + \delta u_t\) or \(w_t + \gamma u_t\)
-could exist outside of the unit sphere where all possible parameter
-configurations lie. In this case, we simply project the point back onto
-the sphere using said projection \(P_W(\overrightarrow{v})\).
-
-Yue and Joachims show that this algorithm has sublinear regret in \(T\),
-the number of iterations. We note that the algorithm assumes that there
-exists a hidden reward function \(R(w)\) that maps system parameters
-\(w_t\) to a reward value which is smooth and strictly concave over the
-input space \(W\).
-
-Lastly, we would also like to give motivation behind \(\delta\) and
-\(\gamma\) being different values. We need a \(\delta\) that is
-sufficiently large that the comparison between a system parameterized by
-\(w_t\) and \(w_t'\) is meaningful. On the other hand, we may wish to
-take a smaller step in the direction of \(w_t'\) during our update step,
-as during a duel, we only score \(w_t\) against \(w_t'\) over the
-results on one query \(q_t\). Having \(\delta > \gamma\) allows us to
-get reward signal from meaningfully different points while also updating
-our belief of the best point \(w_{\text{best}}\) gradually.
-
-\subsubsection*{Sparring EXP4}\label{sparring-exp4}
-\addcontentsline{toc}{subsubsection}{Sparring EXP4}
-
-Zoghi et al.~2015 propose one algorithm for this problem --- sparring
-EXP4, which duels two traditional EXP4 algorithms. The (traditional)
-EXP4 algorithm solves the traditional contextual bandits --- the case
-where we can directly observe a reward for a choice of bandit given a
-context. The EXP4 algorithm embeds each bandit as a vector. When the
-algorithm sees the context (called `advice' in this formulation), it
-produces a probability distribution over the choices based on an
-adjusted softmax function on the inner product between the context and
-the bandit vectors. The probability function is different from a softmax
-as we assign some minimum probability that any action gets chosen to
-enforce exploration. A reward is then observed for the choice and
-propagated back through the embedding of the chosen bandit.
-
-Sparring EXP4 runs two instances of the EXP4 algorithm against each
-other. Each EXP4 instance samples an action given a context, and then
-these choices are `dueled' against each other. Instead of directly
-observing a reward, as for traditional EXP4, we instead observe two
-converse rewards --- a positive reward for the choice that won the duel
-and a negative reward for the choice that lost. The reward is
-proportional to the degree to which the bandit wins the duel, i.e.~how
-likely the bandit is to be preferred over the other when users are
-queried for binary preferences. Like in traditional EXP4, the reward or
-negative reward is then propagated back through the representations of
-the bandits.
-
-\subsubsection{Feel-good Thompson
-sampling}\label{feel-good-thompson-sampling}
-
-This algorithm is a solution for the contextual dueling bandit setting,
-and tries to minimize the cumulative average regret of the two selected
-actions relative to the best action in each context:
-\[\text{Regret}(T) := \sum_{t=1}^{T} \left[ r_{*}(x_t, a_{t}^{*}) - \frac{r_{*}(x_t, a_{t}^{1}) + r_{*}(x_t, a_{t}^{2})}{2} \right],\]
-where \(r_{*}(x_t, a_{t})\) is the true, hidden reward function of a
-context \(x_t\) and action \(a_t\). Thompson sampling is an iterative
-process of receiving preference over two actions, each maximizing a
-different approximation of the reward function based on past data and
-adding this new information to the data.
-
-Finding good approximations of the reward function at time \(t\) is done
-by sampling two reward function parameters \(\theta_t^{j=1}\) and
-\(\theta_t^{j=2}\) from a posterior distribution based on all previous
-data \(p^j(\cdot \mid S_{t-1})\). This posterior distribution is
-proportional to the multiplication of the prior and the likelihood
-function, which is a Gaussian in standard Thompson sampling. In
-Feel-Good Thompson sampling, an additional term called ``Feel-good
-exploration'' encourages parameters \(\theta\) with a large maximum
-reward in previous rounds. This change to the likelihood function may
-increase probabilities in uncertain areas, thus exploring those regions.
-All that's left is to select an action maximizing each reward function
-approximation and receive a preference \(y_t\) on one of them to add the
-new information to the dataset (\citeproc{ref-fgts_cdb}{Zhang 2021}).
-
-Initialize \(S_0 = \varnothing\). Receive prompt \(x_t\) and action
-space \(\mathcal{A}_t\). Sample model parameter \(\theta_t^j\) from the
-posterior distribution \(p^j(\cdot \mid S_{t-1})\). Select response
-\(a_t^j = \arg\max_{a \in \mathcal{A}_t} \langle \theta_t^j, \phi(x_t, a) \rangle\).
-Receive preference \(y_t\). Update dataset
-\(S_t \leftarrow S_{t-1} \cup \{(x_t, a_t^1, a_t^2, y_t)\}\).
-
-\subsection{Applications}\label{applications}
-
-There are many applications where contextual bandits are used. Many of
-these applications can utilize human preferences. One particular
-application illustrates the benefits a contextual bandit would have over
-a multi-armed bandit: a website deciding which ad to show someone
-visiting the website. A multi-armed bandit might decide to show someone
-an ad for a swimsuit because the swimsuit ads have gotten the most user
-clicks (which indicates human preference). A contextual bandit might
-choose differently, however. A contextual bandit will also take into
-account the context, which in this case might mean information about the
-user (location, previously visited pages, and device information). If it
-discovers the user lives in a cold environment, for example, it might
-suggest a sweater ad for the user instead and get a better chance of a
-click. There are many more examples of where contextual bandits can be
-applied. They can be applied in other web applications, such as to
-optimize search results, medical applications, such as how much of a
-medication to prescribe based on a patient's history, and gaming
-applications, such as basing moves off of the state of a chess board to
-try to win. In each of the above examples, human feedback could have
-been introduced during training and leveraged to learn a reward
-function.
-
-We explored different versions of bandits that address the
-exploration-exploitation trade-off in various real-world scenarios.
-These models have been employed across various fields, including but not
-limited to healthcare, finance, dynamic pricing, and anomaly detection.
-This section provides a deep dive into some real-world applications,
-emphasizing the value and advancements achieved by incorporating bandit
-methodologies. The content of this section draws upon the findings from
-the survey cited in reference
-(\citeproc{ref-bouneffouf2020survey}{Bouneffouf, Rish, and Aggarwal
-2020}).
-
-In healthcare, researchers have been applying bandits to address
-challenges in clinical trials and behavioral modeling
-(\citeproc{ref-bouneffouf2017bandit}{Bouneffouf, Rish, and Cecchi 2017};
-\citeproc{ref-bastani2020online}{Bastani and Bayati 2020}). One of the
-examples is drug dosing. Warfarin, an oral anticoagulant, has
-traditionally been administered using fixed dosing protocols. Physicians
-would then make subsequent adjustments based on the patient's emerging
-symptoms. Nonetheless, inaccuracies in the initial dosage---whether too
-low or too high---can lead to serious complications like strokes and
-internal bleeding. In a pivotal study, researchers in
-(\citeproc{ref-bastani2020online}{Bastani and Bayati 2020}) modeled the
-Warfarin initial dosing as a contextual bandit problem to assign dosages
-to individual patients appropriately based on their medication history.
-Their contributions include the adaptation of the LASSO estimator to the
-bandit setting, achieving a theoretical regret bound of
-\(O(s_0^2 \log^2(dT))\), where \(d\) represents the number of
-covariates, \(s_0 \ll d\) signifies the number of pertinent covariates,
-and \(T\) indicates the total number of users. Additionally, they
-conducted empirical experiments to validate the robustness of their
-methodology.
-
-Within the finance sector, bandits have been instrumental in reshaping
-the landscape of portfolio optimization. Portfolio optimization is an
-approach to designing a portfolio based on the investor's return and
-risk criteria, which fits the exploration-exploitation nature of the
-bandit problems. (\citeproc{ref-shen2015portfolio}{Shen et al. 2015})
-utilized multi-armed bandits to exploit correlations between the
-instruments. They constructed orthogonal portfolios and integrated them
-with the UCB policy to achieve a cumulative regret bound of
-\(\frac{8n}{\Delta^*} \ln(m) + 5n\), where \(n\), \(m\), and \(\Delta^*\)
-denote the number of available assets, the total number of time steps, and the gap
-between the best-expected reward and the expected reward, respectively. On the other
-hand, (\citeproc{ref-huo2017risk}{Huo and Fu 2017}) focused on
-risk-aware online portfolio optimization by incorporating the computation
-of the minimum spanning tree in the bipartite graph, which encodes a
-combination of financial institutions and assets that helps diversify
-and reduce exposure to systematic risk during the financial crisis.
-
-Dynamic pricing, also known as demand-based pricing, refers to the
-strategy of setting flexible prices for products or services based on
-current market demands. The application of bandits in dynamic pricing
-offers a systematic approach to making real-time pricing decisions while
-balancing the trade-off between exploring new price points and
-exploiting known optimal prices. (\citeproc{ref-misra2019dynamic}{Misra,
-Schwartz, and Abernethy 2019}) proposed a policy where the company has
-only incomplete demand information. They derived an algorithm that
-balances immediate and future profits by combining multi-armed bandits
-with partial identification of consumer demand from economic theory.
-
-Recommendation systems are essential components of numerous online platforms, guiding users
-through vast content landscapes to deliver tailored suggestions. These
-systems are instrumental in platforms like e-commerce sites, streaming
-platforms, and social media networks. However, the challenge of
-effectively recommending items to users is non-trivial, given the
-dynamic nature of user preferences and the vast amount of content
-available.
-
-One of the most significant challenges in recommendation systems is the
-``cold start'' problem. This issue arises when a new user joins a
-platform, and the system has limited or no information about the user's
-preferences. Traditional recommendation algorithms struggle in such
-scenarios since they rely on historical user-item interactions. As
-discussed in (\citeproc{ref-zhou2017large}{Zhou et al. 2017}), the
-bandit setting is particularly suitable for large-scale recommender
-systems with a vast number of items. By continuously exploring user
-preferences and exploiting known interactions, bandit-based recommender
-systems can quickly adapt to new users, ensuring relevant
-recommendations in a few interactions. The continuous exploration
-inherent in bandit approaches also means that as a user's preferences
-evolve, the system can adapt, ensuring that recommendations remain
-relevant. Recommending content that is up to date is also another
-important aspect of a recommendation system. In
-(\citeproc{ref-bouneffouf2012a}{Bouneffouf, Bouzeghoub, and Gançarski
-2012}), the concept of ``freshness'' in content is explored through the
-lens of the bandit problem. The Freshness-Aware Thompson Sampling
-algorithm introduced in this study aims to manage the recommendation of
-fresh documents according to the risk of the user's situation.
-
-Dialogue systems, often termed conversational agents or chatbots, aim to
-simulate human-like conversations with users. These systems are deployed
-across various platforms, including customer support, virtual
-assistants, and entertainment applications, and they are crucial for
-enhancing user experience and engagement. Response selection is
-fundamental to creating a natural and coherent dialogue flow.
-Traditional dialogue systems rely on a predefined set of responses or
-rules, which can make interactions feel scripted and inauthentic. In
-(\citeproc{ref-liu2018customized}{Liu et al. 2018}), the authors
-proposed a contextual multi-armed bandit model for online learning of
-response selection. Specifically, they utilized bidirectional LSTM to
-produce the distributed representations of a dialogue context and
-responses and customized the Thompson sampling method.
-
-To create a more engaging and dynamic interaction, there's a growing
-interest in developing pro-active dialogue systems that can initiate
-conversations without user initiation.
-(\citeproc{ref-perez2018contextual}{Perez and Silander 2018}) proposed a
-novel approach to this challenge with contextual bandits. By introducing
-memory models into the bandit framework, the system can recall past
-interactions, making its proactive responses more contextually relevant.
-Their contributions include the Contextual Attentive Memory Network,
-which implements a differentiable attention mechanism over past
-interactions.
-
-(\citeproc{ref-upadhyay2019a}{Upadhyay et al. 2019}) addressed the
-challenge of orchestrating multiple independently trained dialogue
-agents or skills in a unified system. They attempted online posterior
-dialogue orchestration, defining it as selecting the most suitable
-subset of skills in response to a user's input, and studied a
-context-attentive bandit model that operates under a skill execution
-budget, ensuring efficient and accurate response selection.
-
-Anomaly detection refers to the task of identifying samples that behave
-differently from the majority. In
-(\citeproc{ref-ding2019interactive}{Ding, Li, and Liu 2019}), the
-authors delve into anomaly detection in an interactive setting, allowing
-the system to actively engage with human experts through a limited
-number of queries about genuine anomalies. The goal is to present as
-many true anomalies to the human expert as possible after a fixed query
-budget is used up. They applied the multi-armed contextual bandit
-framework to address this issue. This algorithm adeptly integrates both
-nodal attributes and node dependencies into a unified model, efficiently
-managing the exploration-exploitation trade-off during anomaly queries.
-
-There are many challenges associated with contextual bandits. The first
-challenge is that each action only reveals the reward for that
-particular action. Therefore, the algorithm has to work with incomplete
-information. This leads to the dilemma of exploitation versus
-exploration: when should the algorithm choose the best-known option
-versus trying new options for potentially better outcomes? Another
-significant challenge for contextual bandits is using context
-effectively. The context the environment gives needs to be explored to
-figure out which action is best for each context.
-
-The overarching goal in systems designed for recommending options of
-high value to users is to achieve an optimal balance between exploration
-and exploitation. This dual approach is crucial in environments where
-user preferences and needs are dynamic and diverse. Exploration refers
-to the process of seeking out new options, learning about untried
-possibilities, and gathering fresh information that could lead to
-high-value recommendations. In contrast, exploitation involves utilizing
-existing knowledge and past experiences to recommend the best options
-currently known. This balance is key to maintaining a system that
-continuously adapts to changing user preferences while ensuring the
-reliability of its recommendations.
-
-A key observation in such systems is the dual role of users as both
-producers and consumers of information. Each user's experience
-contributes valuable data that informs future recommendations for
-others. For instance, platforms like Waze, Netflix, and Trip Advisor
-rely heavily on user input and feedback. Waze uses real-time traffic
-data from drivers to recommend optimal routes; Netflix suggests movies
-and shows based on viewing histories and ratings; Trip Advisor relies on
-traveler reviews to guide future tourists. In these examples, the
-balance between gathering new information (exploration) and recommending
-the best-known options (exploitation) is dynamically managed to enhance
-user experience and satisfaction. This approach underscores the
-importance of user engagement in systems where monetary incentives are
-not (or cannot be) the primary driver.
-
-Recommendation systems often face the challenge of overcoming user
-biases that can lead to a narrow exploration of options. Users come with
-preconceived notions and preferences, which can cause them to overlook
-potentially valuable options that initially appear inferior or unaligned
-with their interests. This predisposition can significantly limit the
-effectiveness of recommendation systems, as users might miss out on
-high-value choices simply due to their existing biases.
-
-To counteract this, it is crucial for recommendation systems to actively
-incentivize exploration among users. One innovative approach to achieve
-this is through the strategic use of \textbf{information asymmetry}. By
-controlling and selectively presenting information, these systems can
-guide users to explore options they might not typically consider. This
-method aims to reveal the true potential of various options by nudging
-users out of their comfort zones and encouraging a broader exploration
-of available choices. An important note here is that the system is not
-lying to users - it only selectively reveals information it has.
-
-The concept of incentivizing exploration becomes even more complex when
-considering different types of users. For instance, systems often
-encounter short-lived users who have little to gain from contributing to
-the system's learning process, as their interactions are infrequent or
-based on immediate needs. Similarly, some users may operate under a
-`greedy' principle, primarily seeking immediate gratification rather
-than contributing to the long-term accuracy and effectiveness of the
-system. In such scenarios, managing information asymmetry can be a
-powerful tool. By selectively revealing information, recommendation
-systems can create a sense of novelty and interest, prompting even the
-most transient or self-interested users to engage in exploration,
-thereby enhancing the system's overall knowledge base and recommendation
-quality.
-
-\section{Preferential Bayesian
-Optimization}\label{preferential-bayesian-optimization}
-
-The traditional Bayesian optimization (BO) problem is described as
-follows. There is a black-box objective function
-\(g: \mathcal{X} \rightarrow \Re\) defined on a bounded subset
-\(\mathcal{X} \subseteq \Re^q\) such that direct queries to the function
-are expensive or not possible. However, we would like to solve the
-global optimization problem of finding
-\(\mathbf{x}_{\min }=\arg \min _{\mathbf{x} \in \mathcal{X}} g(\mathbf{x})\).
-This is highly analogous to modeling human preferences, since it is the
-case that direct access to a human's latent preference function is not
-possible but we would still like to find its optimum, such as in A/B
-tests or recommender systems.
-
-We approach this problem for human preferences with \emph{Preferential
-Bayesian Optimization} (PBO), as the key difference is that we are able
-to query the preference function through pairwise comparisons of data
-points, i.e.~\emph{duels}. This is a form of indirect observation of the
-objective function, which models real-world scenarios closely: we
-commonly need to optimize a function via data about preferences. With
-humans, it has been demonstrated that we are better at evaluating
-differences rather than absolute magnitudes
-(\citeproc{ref-kahneman_tversky_1979}{Kahneman and Tversky 1979}) and
-therefore PBO models can be applied in various contexts.
-
-\subsection{Problem statement}\label{problem-statement}
-
-The problem of finding the optimum of a latent preference function
-defined on \(\mathcal{X}\) can be reduced to determining a sequence of
-duels on \(\mathcal{X} \times \mathcal{X}\). From each duel
-\(\left[\mathbf{x}, \mathbf{x}^{\prime}\right] \in\)
-\(\mathcal{X} \times \mathcal{X}\) we obtain binary feedback \(\{0,1\}\)
-indicating whether or not \(\mathbf{x}\) is preferred over
-\(\mathbf{x}^{\prime}\) (\(g(\mathbf{x}) < g(\mathbf{x}^{\prime})\)). We
-consider that \(\mathbf{x}\) is the winner of the duel if the output is
-\(\{1\}\) and that \(\mathbf{x}^{\prime}\) wins the duel if the output
-is \(\{0\}\). The aim is to find \(\mathbf{x}_{\min }\) by reducing as
-much as possible the number of queried duels.
-
-The key idea in PBO is to learn a preference function in the space of
-duels using a Gaussian process. We define a joint reward
-\(f\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right)\) on each
-duel which is never directly observed. Instead, the feedback we obtain
-after each pair is a binary output \(y \in\) \(\{0,1\}\) indicating
-which of the two inputs is preferred. One definition of \(f\) we will use
-(though others are possible) is
-\(f\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right)=g\left(\mathbf{x}^{\prime}\right)-g(\mathbf{x})\).
-The more \(\mathbf{x}\) is preferred over \(\mathbf{x}^{\prime}\), the
-bigger the reward.
-
-We define the model of preference using a Bernoulli likelihood, where
-\(p\left(y=1 \mid\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right)=\pi_f\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right)\)
-and
-\(p\left(y=0 \mid\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right)=\pi_f\left(\left[\mathbf{x}^{\prime}, \mathbf{x}\right]\right)\)
-for some inverse link function \(\pi: \Re \times \Re \rightarrow[0,1]\).
-\(\pi_f\) has the property that
-\(\pi_f\left(\left[\mathbf{x}^{\prime}, \mathbf{x}\right]\right)=1-\pi_f\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right)\).
-A natural choice for \(\pi_f\) is the logistic function
-\[\label{eq:bernoulli_pref}
-\pi_f\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right)=\sigma\left(f\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right)\right)=\frac{1}{1+e^{-f\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right)}},\]
-but others are possible. Therefore we have that for any duel
-\(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\) in which
-\(g(\mathbf{x}) \leq g\left(\mathbf{x}^{\prime}\right)\) it holds that
-\(\pi_f\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right) \geq 0.5\).
-\(\pi_f\) is a preference function that maps each query
-\(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\) to the probability of
-having a preference on the left input \(\mathbf{x}\) over the right
-input \(\mathbf{x}^{\prime}\).
-
-When we marginalize over the right input \(\mathbf{x}^{\prime}\) of
-\(f\) (is this correct?), the global minimum of \(f\) in \(\mathcal{X}\)
-coincides with \(\mathbf{x}_{\min }\). We also introduce the definition
-of the \emph{Copeland score function} for a point \(\mathbf{x}\) as
-\[S(\mathbf{x})=\operatorname{Vol}(\mathcal{X})^{-1} \int_{\mathcal{X}} \mathbb{I}_{\left\{\pi_f\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right) \geq 0.5\right\}} d \mathbf{x}^{\prime}\]
-where
-\(\operatorname{Vol}(\mathcal{X})=\int_{\mathcal{X}} d \mathbf{x}^{\prime}\)
-is a normalizing constant that bounds \(S(\mathbf{x})\) in the interval
-\([0,1]\). If \(\mathcal{X}\) is a finite set, the Copeland score is
-simply the proportion of duels that a certain element \(\mathbf{x}\)
-will win with probability larger than 0.5. A soft variant we will use
-instead of the Copeland score is the \emph{soft-Copeland score}, defined
-as \[\label{eq:soft-copeland}
-C(\mathbf{x})=\operatorname{Vol}(\mathcal{X})^{-1} \int_{\mathcal{X}} \pi_f\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right) d \mathbf{x}^{\prime}\]
-where the probability function \(\pi_f\) is integrated over
-\(\mathcal{X}\). This score aims to capture the average probability of
-\(\mathbf{x}\) being the winner of a duel.
-
-We define the \emph{Condorcet winner} \(\mathbf{x}_c\) as the point with
-maximal soft-Copeland score. Note that this corresponds to the global
-minimum of \(f\), since the defining integral takes maximum value for
-points \(\mathbf{x} \in \mathcal{X}\) where
-\(f\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right)=\)
-\(g\left(\mathbf{x}^{\prime}\right)-g(\mathbf{x})>0\) for all
-\(\mathbf{x}^{\prime}\), occurring only if \(\mathbf{x}_c\) is a minimum
-of \(f\). Therefore, if the preference function \(\pi_f\) can be learned
-by observing the results of duels then our optimization problem of
-finding the minimum of \(f\) can be solved by finding the Condorcet
-winner of the Copeland score.
-
-\subsection{Acquisition Functions}\label{acquisition-functions-1}
-
-We describe several acquisition functions for sequential learning of the
-Condorcet winner. Our dataset
-\(\mathcal{D}=\left\{\left[\mathbf{x}_i, \mathbf{x}_i^{\prime}\right], y_i\right\}_{i=1}^N\)
-represents the \(N\) duels that have been performed so far. We aim to
-define a sequential policy
-\(\alpha\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right] ; \mathcal{D}_j, \theta\right)\)
-for querying duels, where \(\theta\) is a vector of model
-hyper-parameters, in order to find the minimum of the latent function
-\(g\) as quickly as possible. Using Gaussian processes (GP) for
-classification with our dataset \(\mathcal{D}\) allows us to perform
-inference over \(f\) and \(\pi_f\).
-
-\subsubsection*{Pure Exploration}\label{pure-exploration}
-\addcontentsline{toc}{subsubsection}{Pure Exploration}
-
-The output variable \(y_{\star}\) of a prediction follows a Bernoulli
-distribution with probability given by the preference function
-\(\pi_f\). To carry out exploration as a policy, one method is to search
-for the duel where GP is most uncertain about the probability of the
-outcome (has the highest variance of \(\sigma\left(f_{\star}\right)\) ),
-which is the result of transforming our epistemic uncertainty about
-\(f\), modeled by a GP, through the logistic function. The first order
-moment of this distribution coincides with the expectation of
-\(y_{\star}\) but its variance is \[\begin{aligned}
-\mathbb{V}\left[\sigma\left(f_{\star}\right)\right] & =\int\left(\sigma\left(f_{\star}\right)-\mathbb{E}\left[\sigma\left(f_{\star}\right)\right]\right)^2 p\left(f_{\star} \mid \mathcal{D},\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right) d f_{\star} \\
-& =\int \sigma\left(f_{\star}\right)^2 p\left(f_{\star} \mid \mathcal{D},\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right) d f_{\star}-\mathbb{E}\left[\sigma\left(f_{\star}\right)\right]^2
-\end{aligned}\] which explicitly takes into account the uncertainty over
-\(f\). Hence, pure exploration of duels space can be carried out by
-maximizing
-\[\alpha_{\mathrm{PE}}\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right] \mid \mathcal{D}_j\right)=\mathbb{V}\left[\sigma\left(f_{\star}\right) \mid \left[\mathbf{x}_{\star}, \mathbf{x}_{\star}^{\prime}\right], \mathcal{D}_j\right] .\]
-
-Note that in this case, duels that have been already visited will have a
-lower chance of being visited again even in cases in which the objective
-takes similar values in both players. In practice, this acquisition
-function requires computation of an intractable integral, which we
-approximate using Monte-Carlo.
-
-\subsubsection*{Principled Optimistic Preferential Bayesian Optimization
-(POP-BO)}\label{principled-optimistic-preferential-bayesian-optimization-pop-bo}
-\addcontentsline{toc}{subsubsection}{Principled Optimistic Preferential
-Bayesian Optimization (POP-BO)}
-
-In a slightly modified problem setup
-(\citeproc{ref-xu2024principledpreferentialbayesianoptimization}{Xu et
-al. 2024}), the algorithm tries to solve for the MLE \(\hat{g}\) and its
-confidence set \(\mathcal{B}_g\) where \(g\) is the ground truth
-black-box function. Assumptions include that \(g\) is a member of a
-reproducing kernel Hilbert space (RKHS) \(\mathcal{H}_k\) for some
-kernel function
-\(k: \mathbb{R}^d \times \mathbb{R}^d \rightarrow \mathbb{R}\), and
-\(\|g\|_k \leq B\) so that
-\(\mathcal{B}_g = \left\{\tilde{g} \in \mathcal{H}_k \mid\|\tilde{g}\|_k \leq B\right\}\).
-Defining (with the opposite sign to the previous section, so that it matches the Bradley-Terry form below)
-\(f\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right)=g(\mathbf{x})-g\left(\mathbf{x}^{\prime}\right)\),
-we model the preference function with a Bernoulli distribution as in
-Equation \hyperref[eq:bernoulli_pref]{{[}eq:bernoulli\_pref{]}} and also
-assume that probabilities follow the Bradley-Terry model, i.e.
-\[\pi_f\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right)=\sigma\left(f\left(\left[\mathbf{x}, \mathbf{x}^{\prime}\right]\right)\right)=\frac{e^{g(\mathbf{x})}}{e^{g(\mathbf{x})}+e^{g\left(\mathbf{x^{\prime}}\right)}}\]
-
-The update rule for MLE \(\hat{g}\) is (equation 8,6,5)
-\[\begin{aligned}
-\hat{g}_t^{\text {MLE }}&:= \arg \underset{\tilde{g} \in \mathcal{B}^t_g}{\max}\ell_t(\tilde{g}) \\
-\ell_t(\tilde{g}) &:= \log \prod_{\tau=1}^t \left( y_\tau \pi_{\tilde{f}}([\mathbf{x_\tau}, \mathbf{x^{\prime}_\tau}])+\left(1-y_\tau\right)\left(1-\pi_{\tilde{f}}([\mathbf{x_\tau}, \mathbf{x^{\prime}_\tau}])\right) \right) \\
-&=\sum_{\tau=1}^t \log \left(\frac{e^{\tilde{g}(\mathbf{x_\tau})} y_\tau+e^{\tilde{g}(\mathbf{x_\tau^\prime})}\left(1-y_\tau\right)}{e^{\tilde{g}(\mathbf{x_\tau})}+e^{\tilde{g}(\mathbf{x_\tau^\prime})}}\right) \\
-&=\sum_{\tau=1}^t\left(\tilde{g}(\mathbf{x_\tau}) y_\tau+\tilde{g}(\mathbf{x_\tau^\prime})\left(1-y_\tau\right)\right)-\sum_{\tau=1}^t \log \left(e^{\tilde{g}(\mathbf{x_\tau})}+e^{\tilde{g}(\mathbf{x_\tau^\prime})}\right)
-\end{aligned}\]
-
-(Eq 22 shows how to represent this as a convex optimisation problem so
-that it can be solved)
-
-The update rule for the confidence set \(\mathcal{B}_g^{t+1}\) is, (eq
-9, 10?)
-
-\[\begin{aligned}
-&\forall \epsilon, \delta > 0 \\
-&\mathcal{B}_g^{t+1}:=\left\{\tilde{g} \in \mathcal{B}_g \mid \ell_t(\tilde{g}) \geq \ell_t\left(\hat{g}_t^{\mathrm{MLE}}\right)-\beta_1(\epsilon, \delta, t)\right\}
-\end{aligned}\] where
-\[\beta_1(\epsilon, \delta, t):=\sqrt{32 t B^2 \log \frac{\pi^2 t^2 \mathcal{N}\left(\mathcal{B}_f, \epsilon,\|\cdot\|_{\infty}\right)}{6 \delta}}+ C_L \epsilon t=\mathcal{O}\left(\sqrt{t \log \frac{t \mathcal{N}\left(\mathcal{B}_f, \epsilon,\|\cdot\|_{\infty}\right)}{\delta}}+\epsilon t\right),\]
-with \(C_L\) a constant independent of \(\delta, t\) and \(\epsilon\).
-\(\epsilon\) is typically chosen to be \(1 / T\), where \(T\) is the running
-horizon of the algorithm. This satisfies the theorem that,
-\[\mathbb{P}\left(g \in \mathcal{B}_g^{t+1}, \forall t \geq 1\right) \geq 1-\delta .\]
-
-Intuitively, the confidence set \(\mathcal{B}_g^{t+1}\) includes the
-functions with the log-likelihood value that is only `a little worse'
-than the maximum likelihood estimator, and the theorem states that
-\(\mathcal{B}_g^{t+1}\) contains the ground-truth function \(g\) with
-high probability.
-
-Inner level optimization in Line 4 of the algorithm can also be
-represented as a convex optimisation problem so that it can be solved,
-Eq 24, 25. The outer optimisation can be solved using grid search or Eq
-26 for medium size problems.
-
-The POP-BO algorithm proceeds as follows. (1) Given the initial point \(\mathbf{x_0} \in \mathcal{X}\), set
-\(\mathcal{B}_g^1 = \mathcal{B}_g\). (2) For \(t = 1, \ldots, T\): (3) set the reference point
-\(\mathbf{x_t^{\prime}} = \mathbf{x_{t-1}}\); (4) compute
-\(\mathbf{x_t} \in \arg\max_{\mathbf{x} \in \mathcal{X}} \max_{\tilde{g} \in \mathcal{B}_g^t} (\tilde{g}(\mathbf{x}) - \tilde{g}(\mathbf{x_t^{\prime}}))\),
-with the inner optimal function denoted as \(\tilde{g}_t\); (5) obtain the
-output of the duel \(y_t\) and append the new data point to
-\(\mathcal{D}_t\); (6) update the maximum likelihood estimator
-\(\hat{g}_t^{\mathrm{MLE}}\) and the posterior confidence set
-\(\mathcal{B}_g^{t+1}\).
-
-\subsubsection*{qEUBO: Decision-Theoretic
-EUBO}\label{qeubo-decision-theoretic-eubo}
-\addcontentsline{toc}{subsubsection}{qEUBO: Decision-Theoretic EUBO}
-
-qEUBO
-(\citeproc{ref-astudillo2023qeubodecisiontheoreticacquisitionfunction}{Astudillo
-et al. 2023}) derives an acquisition function that extends duels to
-\(q>2\) options which we call \emph{queries}. Let
-\(X=\left(\mathbf{x_1}, \ldots, \mathbf{x_q}\right) \in \mathcal{X}^q\)
-denote a query containing two points or more, and let
-\(g: \mathcal{X} \rightarrow \Re\) be the latent preference function.
-Then after \(n\) user queries, we define the \emph{expected utility of
-the best option} (qEUBO) as
-\[\mathrm{qEUBO}_n(X)=\mathbb{E}_n\left[\max \left\{g\left(x_1\right), \ldots, g\left(x_q\right)\right\}\right].\]
-
-We now show that qEUBO is one-step Bayes optimal, meaning that each step
-chooses the query that maximises the expected utility received by the
-human. For a query \(X \in \mathcal{X}^q\), let
-\[V_n(X)=\mathbb{E}_n\left[\max _{x \in \mathbb{X}} \mathbb{E}_{n+1}[g(x)] \mid X_{n+1}=X\right] .\]
-Then \(V_n\) defines the expected utility received if an additional
-query \(X_{n+1}=X\) is performed, and maximizing \(V_n\) is one-step
-Bayes optimal. Since \(\max _{x \in \mathbb{X}} \mathbb{E}_n[g(x)]\)
-does not depend on \(X_{n+1}\), we can also equivalently maximize
-\[\mathbb{E}_n\left[\max _{x \in \mathbb{X}} \mathbb{E}_{n+1}[g(x)]-\max _{x \in \mathbb{X}} \mathbb{E}_n[g(x)] \mid X_{n+1}=X\right],\]
-which takes the same form as the knowledge gradient acquisition function
-(\citeproc{ref-wu2018parallelknowledgegradientmethod}{Wu and Frazier
-2018}) in standard Bayesian optimization.
-
-\(V_n\) involves a nested stochastic optimization task, while qEUBO is a
-much simpler policy. When human responses are noise-free, we are able to
-use qEUBO as a sufficient policy due to the following theorem:
-
-\[\underset{X \in \mathbb{X}^q}{\operatorname{argmax}} \mathrm{qEUBO}_n(X) \subseteq \underset{X \in \mathbb{X}^q}{\operatorname{argmax}} V_n(X) .\]
-
-\begin{proof}
-\emph{Proof.} For a query \(X \in \mathcal{X}^q\), let
-\(x^{+}(X, i) \in \operatorname{argmax}_{x \in \mathbb{X}} \mathbb{E}_n[g(x) \mid(X, i)]\)
-and define \(X^{+}(X)=\)
-\(\left(x^{+}(X, 1), \ldots, x^{+}(X, q)\right)\).
-
-\textbf{Claim 1} \(V_n(X) \leq \mathrm{qEUBO}_n\left(X^{+}(X)\right) .\)
-We see that \[\begin{aligned}
-V_n(X) & =\sum_{i=1}^q \mathbf{P}_n(r(X)=i) \mathbb{E}_n[g\left(x^{+}(X, i)\right) ] \\
-& \leq \sum_{i=1}^q \mathbf{P}_n(r(X)=i) \mathbb{E}_n[\max _{i=1, \ldots, q} g(x^{+}(X, i))] \\
-& =\mathbb{E}_n\left[\max _{i=1, \ldots, q} g\left(x^{+}(X, i)\right)\right] \\
-& =\mathrm{qEUBO}_n\left(X^{+}(X)\right),
-\end{aligned}\] as claimed.
-
-\textbf{Claim 2} \(\mathrm{qEUBO}_n(X) \leq V_n(X) .\) For any given
-\(X \in \mathbb{X}^q\) we have
-\[\mathbb{E}_n\left[g\left(x_{r(X)}\right) \mid(X, r(X))\right] \leq \max _{x \in \mathbb{X}} \mathbb{E}_n[g(x) \mid(X, r(X))] .\]
-Since
-\(g\left(x_{r(X)}\right)=\max _{i=1, \ldots, q} g\left(x_i\right)\),
-taking expectations over \(r(X)\) on both sides obtains the required
-result.
-
-Now building on the arguments above, let
-\(X^* \in \operatorname{argmax}_{X \in \mathbb{X}^q} \mathrm{qEUBO}_n(X)\)
-and suppose for contradiction that
-\(X^* \notin \operatorname{argmax}_{X \in \mathbb{X}^q} V_n(X)\). Then,
-there exists \(\widetilde{X} \in \mathbb{X}^q\) such that
-\(V_n(\widetilde{X})>V_n\left(X^*\right)\). We have \[\begin{aligned}
-\operatorname{qEUBO}_n\left(X^{+}(\tilde{X})\right) & \geq V_n(\tilde{X}) \\
-& >V_n\left(X^*\right) \\
-& \geq \operatorname{qEUBO}_n\left(X^*\right) \\
-& \geq \operatorname{qEUBO}_n\left(X^{+}(\tilde{X})\right) .
-\end{aligned}\]
-
-The first inequality follows from Claim 1. The second inequality is due to
-our supposition for contradiction. The third inequality is due to Claim 2.
-Finally, the fourth inequality holds since
-\(X^* \in \operatorname{argmax}_{X \in \mathbb{X}^q} \mathrm{qEUBO}_n(X)\).
-This contradiction concludes the proof. ◻
-\end{proof}
-
-Therefore, maximizing \(\text{qEUBO}_n\) is sufficient to obtain a
-one-step Bayes optimal query.
-
-In experiments that were run comparing qEUBO to other state-of-the-art
-acquisition functions, qEUBO consistently outperformed on most problems
-and was closely followed by qEI and qTS. These results also extended to
-experiments with multiple options when \(q>2\). In fact, there is faster
-convergence in regret when using more options in human queries. {[}Prove
-Theorem 3: Regret analysis{]}
-
-\subsubsection*{qEI: Batch Expected
-Improvement}\label{qei-batch-expected-improvement}
-\addcontentsline{toc}{subsubsection}{qEI: Batch Expected Improvement}
-
-\[\begin{aligned}
-\mathrm{qEI}= & \mathbb{E}_{\mathbf{y}}\left[\left(\max _{i \in[1, \ldots, q]}\left(\mu_{\min }-y_i\right)\right)_{+}\right] \\
-= & \sum_{i=1}^q \mathbb{E}_{\mathbf{y}}\left(\mu_{\min }-y_i \mid y_i \leq \mu_{\min }, y_i \leq y_j \forall j \neq i\right) \\
-& p\left(y_i \leq \mu_{\min }, y_i \leq y_j \forall j \neq i\right) .
-\end{aligned}\]
-
-\subsubsection*{qTS: Batch Thompson
-Sampling}\label{qts-batch-thompson-sampling}
-\addcontentsline{toc}{subsubsection}{qTS: Batch Thompson Sampling}
-
-\emph{Input:} initial data
-\(\mathcal{D}_{\mathcal{I}(1)}=\{(\mathbf{x}_i, y_i)\}_{i \in \mathcal{I}(1)}\).
-For each round \(t = 1, 2, \ldots\): (1) compute the current posterior
-\(p(\boldsymbol{\theta} \mid \mathcal{D}_{\mathcal{I}(t)})\); (2) sample
-\(\boldsymbol{\theta}\) from
-\(p(\boldsymbol{\theta} \mid \mathcal{D}_{\mathcal{I}(t)})\); (3) select
-\(k \leftarrow \arg \max_{j \notin \mathcal{I}(t)} \mathbb{E}[y_j \mid \mathbf{x}_j, \boldsymbol{\theta}]\);
-(4) collect \(y_k\) by evaluating \(f\) at \(\mathbf{x}_k\); (5) update
-\(\mathcal{D}_{\mathcal{I}(t+1)} \leftarrow \mathcal{D}_{\mathcal{I}(t)} \cup \{(\mathbf{x}_k, y_k)\}\).
-
-\emph{Input:} initial data
-\(\mathcal{D}_{\mathcal{I}(1)}=\{(\mathbf{x}_i, y_i)\}_{i \in \mathcal{I}(1)}\),
-batch size \(S\). For each round \(t = 1, 2, \ldots\): (1) compute the current posterior
-\(p(\boldsymbol{\theta} \mid \mathcal{D}_{\mathcal{I}(t)})\); (2) for each \(s = 1, \ldots, S\), sample
-\(\boldsymbol{\theta}\) from
-\(p(\boldsymbol{\theta} \mid \mathcal{D}_{\mathcal{I}(t)})\) and select
-\(k(s) \leftarrow \arg \max_{j \notin \mathcal{I}(t)} \mathbb{E}[y_j \mid \mathbf{x}_j, \boldsymbol{\theta}]\); (3) collect the batch outcomes and update
-\(\mathcal{D}_{\mathcal{I}(t+1)} = \mathcal{D}_{\mathcal{I}(t)} \cup \{(\mathbf{x}_{k(s)}, y_{k(s)})\}_{s=1}^S\).
-
-\subsection{Regret Analysis}\label{regret-analysis}
-
-\subsubsection*{qEUBO Regret}\label{qeubo-regret}
-\addcontentsline{toc}{subsubsection}{qEUBO Regret}
-
-With the definition of Bayesian simple regret, we have that the regret of
-qEUBO converges to zero at a rate of \(o(1/n)\), i.e.
-
-\[\label{th:quebo_regret}
-\mathbb{E}\left[f\left(x^*\right)-f\left(\widehat{x}_n^*\right)\right]=o(1 / n)\]
-
-where \(x^*=\operatorname{argmax}_{x \in \mathrm{X}} f(x)\) and
-\(\widehat{x}_n^* \in \operatorname{argmax}_{x \in \mathrm{X}} \mathbb{E}_n[f(x)]\).
-
-This theorem holds under the following assumptions:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\item
-  \textbf{\(f\) is injective} \(\mathbf{P}(f(x)=f(y))=0\) for any
-  \(x, y \in \mathbb{X}\) with \(x \neq y\).
-\item
-  \textbf{\(f\) represents the preferred option} \(\exists a>1 / 2\)
-  s.t.
-  \(\mathbf{P}\left(r(X) \in \operatorname{argmax}_{i=1, \ldots, 2} f\left(x_i\right) \mid f(X)\right) \geq a \forall\)
-  \(X=\left(x_1, x_2\right) \in \mathbb{X}^2\) with \(x_1 \neq x_2\)
-  almost surely under the prior on \(f\).
-\item
-  \textbf{Expected difference in utility is proportional to probability
-  of greater utility} \(\exists \Delta \geq \delta>0\) s.t.
-  \(\forall \mathcal{D}^{(n)} \text{ and } \forall x, y \in \mathbb{X}\)
-  (potentially depending on \(\mathcal{D}^{(n)}\)),
-  \[\delta \mathbf{P}^{(n)}(f(x)>f(y)) \leq \mathbb{E}^{(n)}\left[\{f(x)-f(y)\}^{+}\right] \leq \Delta \mathbf{P}^{(n)}(f(x)>f(y))\]
-  almost surely under the prior on \(f\).
-\end{enumerate}
-
-Further lemmas leading to a proof of Theorem
-\hyperref[th:quebo_regret]{{[}th:quebo\_regret{]}} are given in
-(\citeproc{ref-astudillo2023qeubodecisiontheoreticacquisitionfunction}{Astudillo
-et al. 2023}) Section B.
-
-\subsubsection*{qEI Regret}\label{qei-regret}
-\addcontentsline{toc}{subsubsection}{qEI Regret}
-
-The following theorem shows that, under the same assumptions used for
-qEUBO regret, simple regret of qEI can fail to converge to 0.
-
-There exists a problem instance (i.e., \(\mathbb{X}\) and Bayesian prior
-distribution over \(f\)) satisfying the assumptions described in Theorem
-\hyperref[th:quebo_regret]{{[}th:quebo\_regret{]}} such that if the
-sequence of queries is chosen by maximizing qEI, then
-\(\mathbb{E}\left[f\left(x^*\right)-\right.\)
-\(\left.f\left(\widehat{x}_n^*\right)\right] \geq R\) for all \(n\), for
-a constant \(R>0\).
-
-\begin{proof}
-\emph{Proof.} Let \(X = \{1, 2, 3, 4\}\) and consider the functions
-\(f_i:X \rightarrow \mathbb{R}\), for \(i=1,2,3,4\), given by \(f_i(1) = -1\) and
-\(f_i(2) = 0\) for all \(i\), and \[\begin{aligned}
-    f_1(x) = \begin{cases}
-    1, &\ x=3\\
-    \frac{1}{2}, &\ x=4
-    \end{cases},
-\hspace{0.5cm}
-f_2(x) = \begin{cases}
-    \frac{1}{2}, &\ x=3\\
-    1, &\ x=4
-    \end{cases},
-\hspace{0.5cm}
-f_3(x) = \begin{cases}
-    -\frac{1}{2}, &\ x=3\\
-    -1, &\ x=4
-    \end{cases},
-\hspace{0.5cm}
-f_4(x) = \begin{cases}
-    -1, &\ x=3\\
-    -\frac{1}{2}, &\ x=4
-    \end{cases}.
-\end{aligned}\]
-
-Let \(p\) be a number with \(0 < p < 1/3\) and set \(q=1-p\). We
-consider a prior distribution on \(f\) with support \(\{f_i\}_{i=1}^4\)
-such that \[\begin{aligned}
-p_i = Pr(f=f_i) = 
-    \begin{cases}
-        p/2, i =1,2,\\
-        q/2, i=3,4.
-    \end{cases}
-\end{aligned}\] We also assume the user's response likelihood is given
-by \(Pr(r(X)=1\mid f(x_1) > f(x_2)) = a\) for some \(a\) such that
-\(1/2 < a < 1\).
-
-Let \(\mathcal{D}^{(n)}\) denote the set of observations up to time \(n\) and let
-\(p_i^{(n)} = Pr(f=f_i \mid \mathcal{D}^{(n)})\) for \(i=1,2,3,4\). We
-let the initial data set be
-\(\mathcal{D}^{(0)} = \{(X^{(0)}, r^{(0)})\}\), where
-\(X^{(0)}= (1,2)\). We will prove that the following statements are true
-for all \(n\geq 0\).
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\item
-  \(p_i^{(n)} > 0\) for \(i=1,2,3,4\).
-\item
-  \(p_1^{(n)} < \frac{1}{2}p_3^{(n)}\) and
-  \(p_2^{(n)} < \frac{1}{2}p_4^{(n)}\).
-\item
-  \(\arg \max_{x\in\mathcal{X}}\mathbb{E}^{(n)}[f(x)]=\{2\}\).
-\item
-  \(\arg \max_{X\in\mathcal{X}^2}\text{qEI}^{(n)}(X) = \{(3, 4)\}\).
-\end{enumerate}
-
-We prove this by induction over \(n\). We begin by proving this for
-\(n=0\). Since \(f_i(1) < f_i(2)\) for all \(i\), the posterior
-distribution on \(f\) given \(\mathcal{D}^{(0)}\) remains the same as
-the prior; i.e., \(p_i^{(0)} = p_i\) for \(i=1,2,3,4\). Using this,
-statements 1 and 2 can be easily verified. Now note that
-\(\mathbb{E}^{(0)}[f(1)]=-1\), \(\mathbb{E}^{(0)}[f(2)]=0\), and
-\(\mathbb{E}^{(0)}[f(3)] = \mathbb{E}^{(0)}[f(4)] = \frac{3}{4}(p - q)\).
-Since \(p < q\), it follows that
-\(\arg \max_{x\in\mathcal{X}}\mathbb{E}^{(0)}[f(x)]=\{2\}\); i.e.,
-statement 3 holds. Finally, since
-\(\max_{x\in\{1,2\}}\mathbb{E}^{(0)}[f(x)] = 0\), the qEI acquisition
-function at time \(n=0\) is given by
-\(\text{qEI}^{(0)}(X) = \mathbb{E}^{(0)}[\{\max\{f(x_1), f(x_2)\}\}^+]\).
-A direct calculation can now be performed to verify that statement 4
-holds. This completes the base case.
-
-Now suppose statements 1-4 hold for some \(n\geq 0\). Since
-\(X^{(n+1)} = (3, 4)\), the posterior distribution on \(f\) given
-\(D^{(n+1)}\) is given by \[\begin{aligned}
-p_i^{(n+1)} \propto \begin{cases}
-                        p_i^{(n)}\ell, \ i=1,3,\\
-                         p_i^{(n)} (1 - \ell), \ i=2,4,
-                        \end{cases}
-\end{aligned}\] where
-\[\ell = a I\{r^{(n+1)} = 1\} + (1-a)I\{r^{(n+1)} = 2\}.\] Observe that
-\(0< \ell < 1\) since \(0 < a < 1\). Thus, \(\ell > 0\) and
-\(1-\ell > 0\). Since \(p_i^{(n)} > 0\) by the induction hypothesis, it
-follows from this that \(p_i^{(n+1)} > 0\) for \(i=1,2,3,4\). Moreover,
-since \(p_i^{(n+1)} \propto p_i^{(n)}\ell\) for \(i=1,3\) and
-\(p_1^{(n)} < \frac{1}{2}p_3^{(n)}\) by the induction hypothesis, it
-follows that \(p_1^{(n+1)} < \frac{1}{2}p_3^{(n+1)}\). Similarly,
-\(p_2^{(n+1)} < \frac{1}{2}p_4^{(n+1)}\). Thus, statements 1 and 2 hold
-at time \(n+1\).
-
-Now observe that \[\begin{aligned}
-    \mathbb{E}^{(n+1)}[f(3)] &= p_1^{(n+1)} + \frac{1}{2}p_2^{(n+1)} - \frac{1}{2}p_3^{(n+1)} - p_4^{(n+1)}\\
-    &= \left(p_1^{(n+1)} - \frac{1}{2}p_3^{(n+1)}\right) + \left(\frac{1}{2}p_2^{(n+1)} - p_4^{(n+1)}\right)\\
-    &\leq \left(p_1^{(n+1)} - \frac{1}{2}p_3^{(n+1)}\right) + \left(p_2^{(n+1)} - \frac{1}{2}p_4^{(n+1)}\right)\\
-    &\leq 0,
-\end{aligned}\] where the last inequality holds since
-\(p_1^{(n+1)} < \frac{1}{2}p_3^{(n+1)}\) and
-\(p_2^{(n+1)} < \frac{1}{2}p_4^{(n+1)}\). Similarly, we see that
-\(\mathbb{E}^{(n+1)}[f(4)] \leq 0\). Since
-\(\mathbb{E}^{(n+1)}[f(1)]=-1\) and \(\mathbb{E}^{(n+1)}[f(2)]=0\), it
-follows that
-\(\arg \max_{x\in\mathcal{X}}\mathbb{E}^{(n+1)}[f(x)]=\{2\}\); i.e.,
-statement 3 holds at time \(n+1\).
-
-Since \(\max_{x\in\mathcal{X}}\mathbb{E}^{(n+1)}[f(x)] = 0\), the qEI
-acquisition function at time \(n+1\) is given by
-\(\text{qEI}^{(n+1)}(X) = \mathbb{E}^{(n+1)}[\{\max\{f(x_1), f(x_2)\}\}^+]\).
-Since \(f(1) \leq f(x)\) almost surely under the prior for all
-\(x\in\mathcal{X}\), there is always a maximizer of qEI that does not
-contain \(1\). Thus, to find the maximizer of qEI, it suffices to
-analyse its value at the pairs \((2, 3)\), \((3,4)\) and \((4,2)\). We
-have \[\text{qEI}^{(n+1)}(2, 3) = p_1^{(n+1)} + 1/2 p_2^{(n+1)},\]
-\[\operatorname{qEI}^{(n+1)}(3, 4) = p_1^{(n+1)} + p_2^{(n+1)}\] and
-\[\operatorname{qEI}^{(n+1)}(4, 2) = 1/2p_1^{(n+1)} + p_2^{(n+1)}.\]
-Since \(p_1^{(n+1)} > 0\) and \(p_2^{(n+1)} > 0\), it follows that
-\(\arg \max_{X \in X^2}\text{qEI}^{(n+1)}(X) = \{(3, 4)\}\), which
-concludes the proof by induction.
-
-Finally, since \(\arg \max_{x\in X}\mathbb{E}^{(n)}[f(x)]=\{2\}\) for
-all \(n\), the Bayesian simple regret of qEI is given by
-\[\begin{aligned}
-    \mathbb{E}\left[f(x^*) - f(2)\right] &= \sum_{i=1}^{4}p_i\left(\max_{x\in X}f_i(x) - f_i(2)\right)\\
-    &= p
-\end{aligned}\] for all \(n\). ◻
-\end{proof}
-
-\subsubsection*{POP-BO Regret}\label{pop-bo-regret}
-\addcontentsline{toc}{subsubsection}{POP-BO Regret}
-
-Commonly used kernel functions within the RKHS are:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\item
-  Linear: \[k(x, \bar{x})=x^{\top} \bar{x} .\]
-\item
-  Squared Exponential (SE):
-  \[k(x, \bar{x})=\sigma_{\mathrm{SE}}^2 \exp \left\{-\frac{\|x-\bar{x}\|^2}{l^2}\right\},\]
-  where \(\sigma_{\mathrm{SE}}^2\) is the variance parameter and \(l\)
-  is the lengthscale parameter.
-\item
-  Matérn:
-  \[k(x, \bar{x})=\frac{2^{1-\nu}}{\Gamma(\nu)}\left(\sqrt{2 \nu} \frac{\|x-\bar{x}\|}{\rho}\right)^\nu K_\nu\left(\sqrt{2 \nu} \frac{\|x-\bar{x}\|}{\rho}\right),\]
-  where \(\rho\) and \(\nu\) are the two positive parameters of the
-  kernel function, \(\Gamma\) is the gamma function, and \(K_\nu\) is
-  the modified Bessel function of the second kind. \(\nu\) captures the
-  smoothness of the kernel function.
-\end{enumerate}
-
-With the definition of Bayesian simple regret, we have the following
-theorem defining the regret bound:
-
-With probability at least \(1-\delta\), the cumulative regret of POP-BO
-satisfies,
-\[R_T=\mathcal{O}\left(\sqrt{\beta_T \gamma_T^{f f^{\prime}} T}\right),\]
-where
-\[\beta_T=\beta(1 / T, \delta, T)=\mathcal{O}\left(\sqrt{T \log \frac{T \mathcal{N}\left(\mathcal{B}_f, 1 / T,\|\cdot\|_{\infty}\right)}{\delta}}\right).\]
-
-The guaranteed convergence rate is characterised as:
-
-{[}{]}\{\#th: popbo\_converge label=``th: popbo\_converge''\} Let
-\(t^{\star}\) be defined as in Eq. (19). With probability at least
-\(1-\delta\),
-\[f\left(x^{\star}\right)-f\left(x_{t^{\star}}\right) \leq \mathcal{O}\left(\frac{\sqrt{\beta_T \gamma_T^{f f^{\prime}}}}{\sqrt{T}}\right)\]
-
-The convergence theorem above highlights that, by minimizing the known
-term
-\(2\left(2 B+\lambda^{-1 / 2} \sqrt{\beta\left(\epsilon, \frac{\delta}{2}, t\right)}\right) \sigma_t^{f f^{\prime}}\left(\left(x_t, x_t^{\prime}\right)\right)\),
-the reported final solution \(x_{t^{\star}}\) has a guaranteed
-convergence rate.
-
-Further kernel-specific regret bounds for POP-BO are calculated as
-follows:
-
-Setting \(\epsilon=1 / T\) and running our POP-BO algorithm in Alg. 1,
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\item
-  If \(k(x, y)=\langle x, y\rangle\), we have,
-  \[R_T=\mathcal{O}\left(T^{3 / 4}(\log T)^{3 / 4}\right) .\]
-\item
-  If \(k(x, y)\) is a squared exponential kernel, we have,
-  \[R_T=\mathcal{O}\left(T^{3 / 4}(\log T)^{3 / 4(d+1)}\right) .\]
-\item
-  If \(k(x, y)\) is a Matérn kernel, we have,
-  \[R_T=\mathcal{O}\left(T^{3 / 4}(\log T)^{3 / 4} T^{\frac{d}{\nu}\left(\frac{1}{4}+\frac{d+1}{4+2(d+1)^d / \nu}\right)}\right).\]
-\end{enumerate}
-
-\section{Case Study 1: Foundation Models for
-Robotics}\label{case-study-1-foundation-models-for-robotics}
-
-Modern foundation models have been ubiquitous in discussions of
-powerful, general purpose AI systems that can accomplish myriad tasks
-across many disciplines such as programming, medicine, law, open
-question-answering and much more, with rapidly increasing capabilities
-(\citeproc{ref-bommasani2022opportunities}{Bommasani et al. 2022}).
-However, despite successes from large labs in controlled environments
-(\citeproc{ref-brohan2023rt2}{Brohan et al. 2023}) foundation models
-have not seen ubiquitous use in robotics due to shifting robot
-morphology, lack of data, and the sim to real gap in robotics
-(\citeproc{ref-walke2023bridgedata}{Walke et al. 2023}). For this
-subsection we explore two promising approaches known as R3M and Voltron
-which are the first to leverage pre-training on vast amounts of data
-towards performance improvement on downstream robotic tasks despite the
-aforementioned issues (\citeproc{ref-nair2022r3m}{Nair et al. 2022};
-\citeproc{ref-karamcheti2023languagedriven}{Karamcheti et al. 2023}).
-
-R3M represents a significant advancement in the field of robotic
-manipulation and learning. This model diverges from traditional
-approaches that rely on training from scratch within the same domain on
-the same robot data; instead, it leverages pretraining on large
-datasets, akin to the practices in computer vision and natural language
-processing (NLP) where models are trained on diverse, large-scale
-datasets to create reusable, general-purpose representations. The core
-principle behind R3M is its training methodology. It is pre-trained on a
-wide array of human videos, encompassing various activities and
-interactions. This diverse dataset enables the model to capture a broad
-spectrum of physical interactions and dynamics, which are crucial for
-effective robotic manipulation known as EGO4D
-(\citeproc{ref-grauman2022ego4d}{Grauman et al. 2022}). However, prior
-papers could not fit this dataset well, whereas R3M leverages it effectively. The training
-utilizes a unique objective that combines time contrastive learning,
-video-language alignment, and a sparsity penalty. This objective ensures
-that R3M not only understands the temporal dynamics of scenes (i.e., how
-states transition over time) but also focuses on semantically relevant
-features, such as objects and their interrelations, while maintaining a
-compact and efficient representation. What sets R3M apart in the realm
-of robotics is its efficiency and effectiveness in learning from a
-limited amount of data. The model demonstrates remarkable performance in
-learning tasks in the real world with minimal human supervision --
-typically less than 10 minutes. This is a stark contrast to traditional
-models that require extensive and often prohibitively large datasets for
-training. Furthermore, R3M's pre-trained nature allows for its
-application across a variety of tasks and environments without the need
-for retraining from scratch, making it a versatile tool in robotic
-manipulation. The empirical results from using R3M are compelling,
-leading to a 10\% improvement over training from a pretrained image-net
-model, self-supervised approaches such as MoCo or even CLIP
-(\citeproc{ref-deng2009imagenet}{Deng et al. 2009};
-\citeproc{ref-he2020momentum}{He et al. 2020};
-\citeproc{ref-radford2021learning}{Radford et al. 2021}). Note however,
-that R3M does not use any language data, which leaves quite a bit of
-supervision to be desired.
-
-Building off the success of R3M, Voltron proposes a further extension of
-leveraging self-supervision and advancements in foundation models, and
-multi-modality. Voltron takes on an intuitive and simple dual use
-objective, where the trained model alternates between predicting the
-task in an image through natural language and classifying images based
-on a natural text label. This forces a nuanced understanding of both
-modalities (\citeproc{ref-radford2021learning}{Radford et al. 2021}).
-Voltron's approach is distinguished by its versatility and depth of
-learning. It is adept at handling a wide range of robotic tasks, from
-low-level spatial feature recognition to high-level semantic
-understanding required in language-conditioned imitation and intent
-scoring. This flexibility makes it suitable for various applications in
-robotic manipulation, from grasping objects based on descriptive
-language to performing complex sequences of actions in response to
-verbal instructions. The authors rigorously test Voltron in scenarios
-such as dense segmentation for grasp affordance prediction, object
-detection in cluttered scenes, and learning multi-task
-language-conditioned policies for real-world manipulation with up to
-15\% improvement over baselines. In each of these domains, Voltron has
-shown a remarkable ability to outperform existing models like MVP and
-R3M, showcasing its superior adaptability and learning capabilities
-(\citeproc{ref-xiao2022masked}{Xiao et al. 2022}). Moreover, Voltron's
-framework allows for a balance between encoding low-level and high-level
-features, which is critical in the context of robotics. This balance
-enables the model to excel in both control tasks and those requiring
-deeper semantic understanding, offering a comprehensive solution in the
-realm of robotic vision and manipulation.
-
-Voltron stands as a groundbreaking approach in the field of robotics,
-offering a language-driven, versatile, and efficient approach to
-learning and manipulation. Its ability to seamlessly integrate visual
-and linguistic data makes it a potent tool in the ever-evolving
-landscape of robotic technology, with potential applications that extend
-far beyond current capabilities. Interestingly, the authors show Voltron
-does not beat R3M off the shelf but only when trained on similar amounts
-of data. Nevertheless, Voltron's success in diverse tasks and
-environments heralds a new era in robotic manipulation, where language
-and vision coalesce to create more intelligent, adaptable, and capable
-robotic systems.
-
-On the note of applying AL to RL and environment settings, there have
-been many recent papers that have attempted to extend this to more
-modern RL environments. For example, the paper ``When to Ask for Help''
-(\citeproc{ref-ask_help}{Xie et al. 2022}) examines the intersection of
-autonomous RL and AL. Instead of just expecting an RL agent to autonomously
-solve a task, making the assumption that an agent could get stuck and
-need human input to get ``unstuck'' is a key insight of the paper. In
-general, there has been an emphasis in recent literature in robotics on
-not just blindly using demonstration data as a form of human input, but
-rather actively querying a human and using this to better synthesize
-correct actions.
-
-AL holds promise for enhancing AI models in real-world scenarios, yet
-several challenges persist. This discussion aims to provide an overview
-of these challenges.
-
-Task-Specific Considerations: For certain tasks, the input space of a
-model may have some rare yet extremely important pockets which may never
-be discovered by AL and may cause severe blindspots in the model. In
-medical imaging for instance, there can be rare yet critical diseases.
-Designing AL strategies for medical image analysis must prioritize rare
-classes, such as various forms of cancers. Oftentimes, collecting data
-around those rare classes is not a recommendation of the AL process
-because these examples constitute heavy distribution drifts from the
-input distribution a model has seen.
-
-Complex Task Adaptation: AL has predominantly been adopted for simple
-classification tasks, leaving other types of tasks (generative ones,
-for instance) less explored. In Natural Language Processing, tasks like
-natural language inference and question-answering pose additional
-complexities that affect the direct application of the AL process. While
-machine translation has seen AL applications, generation tasks in NLP
-require more thorough exploration. Challenges arise in obtaining
-unlabeled data, particularly for tasks with intricate inputs.
-
-Unsupervised and Semi-Supervised Approaches: In the presence of large
-datasets without sufficient labels, unsupervised and semi-supervised
-approaches become crucial. These methods offer a means to extract
-information without relying on labeled data for every data point,
-potentially revolutionizing fields like medical image analysis. There is
-an ongoing need for methods that combine self/semi-supervised learning
-with AL.
-
-Algorithm Scalability: Scalability is a critical concern for online AL
-algorithms, particularly when dealing with large datasets and
-high-velocity data streams. The computational demands of AL can become
-prohibitive as data volume increases, posing challenges for practical
-deployment. Issues of catastrophic forgetting and model plasticity
-further complicate scalability, requiring careful consideration in
-algorithm design.
-
-Labeling Quality Assurance: The effectiveness of most online AL
-strategies hinges on the quality of labeled data. Ensuring labeling
-accuracy in real-world scenarios is challenging, with human annotators
-prone to errors, biases, and diverse interpretations. Addressing
-imperfections in labeling through considerations of oracle imperfections
-becomes essential in real-life AL applications. Solutions for cleaning
-up data and verifying its quality need to be more aggressively pursued.
-
-Data Drift Challenges: Real-world settings introduce data drift, where
-distributions shift over time, challenging models to adapt for accurate
-predictions. These shifts can impact the quality of labeled data
-acquired in the AL process. For example, the criterion or proxy used for
-selecting informative instances may be thrown off when the distribution
-a model is trained on, and the distribution we want it to perform well
-on, are too far away from one another.
-
-Evaluation in Real-Life Scenarios: While AL methods are often evaluated
-assuming access to ground-truth labels, the real motivation for AL lies
-in label scarcity. Assessing the effectiveness of AL strategies becomes
-challenging in real-life scenarios where ground-truth labels may be
-limited. In other words, one may verify the goodness of an AL algorithm
-within the lab, but once the algorithm is deployed for improving all
-sorts of models on all sorts of data distributions, verifying whether AL
-is actually improving a model is tricky, especially when collecting and
-labeling data from the target distribution is expensive and defeats the
-purpose of using AL in the first place.
-
-By systematically addressing these challenges, the field of AL in AI can
-progress towards more effective and practical applications. In summary,
-AL is a promising modern tool for model training that presents potential
-benefits. As was mentioned at the start, there are numerous approaches
-that can be employed by AL, ranging from reducing the error of a model's
-predictions and reducing variance to more conformal predictions. The flavor
-of AL heavily depends on the applications, which include robotics, LLM,
-autonomous vehicles, and more. We discussed in more detail how to
-perform AL for variance reduction in the case of predicting kinematics
-of robotic arms, which showed a decrease in MSE as well as a more stable
-reduction in it. Next, we talked about using AL for reducing the number
-of comparisons required to create a ranking of objects, and the examples
-discussed were able to achieve that but with some loss in the prediction
-accuracy. Finally, we discussed how AL can be used for modeling of
-reward functions within a dynamical system, which demonstrated
-improvements in performance and time required to achieve it. For a more
-hands-on experience with AL and demonstrated example, we encourage the
-readers to explore a blogpost by Max Halford
-(\citeproc{ref-max_halford}{Halford 2023}).
-
-\section{Exercises}\label{exercises-2}
-
-\subsection*{Question 1: Preferential Bayesian Optimization (30
-points)}\label{sec-question-1-preferential-bayesian-optimization-30-points}
-\addcontentsline{toc}{subsection}{Question 1: Preferential Bayesian
-Optimization (30 points)}
-
-\textbf{Preferential Bayesian Optimization (PBO)} is a variant of
-Bayesian Optimization (BO) designed to handle scenarios where feedback
-is provided in terms of preferences between alternatives rather than
-explicit numeric evaluations. Suppose you are optimizing an unknown
-function \(f\) over a space \(\mathcal{X}\), but instead of receiving
-function values, you only receive pairwise comparisons between different
-points in the input space. That is, given two points
-\(x_1, x_2 \in \mathcal{X}\), you receive feedback in the form of a
-preference: \(x_1 \succ x_2\) implies \(f(x_1) > f(x_2)\).
-
-The \textbf{Gaussian Process (GP)} framework is used to model \(f\), and
-the optimization is guided by this model. Let \(p(x_1 \succ x_2 | f)\)
-be the probability that \(x_1\) is preferred over \(x_2\), which can be
-modeled using a Bradley-Terry or Thurstone model based on the GP prior.
-
-Using the paper ``Preferential Bayesian Optimization''
-(\url{https://proceedings.mlr.press/v70/gonzalez17a/gonzalez17a.pdf}),
-answer the following:
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\item
-  \textbf{Modeling Preferences (6 points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{Likelihood Derivation (Written, 2 points):} Given two points
-    \(x_1\) and \(x_2\) and their corresponding latent function values
-    \(f(x_1)\) and \(f(x_2)\), derive the likelihood of a preference
-    \(x_1 \succ x_2\) using the Bradley-Terry model. Your solution here.
-  \item
-    \textbf{Incorporating into GP (Written, 2 points):} Explain how this
-    likelihood can be incorporated into the GP framework to model
-    preferences probabilistically. Specifically, describe how the
-    covariance function of the GP affects the joint distribution of
-    preferences and discuss any assumptions made regarding the
-    smoothness or structure of \(f\).
-  \item
-    \textbf{Posterior Update (Written, 2 points):} Write out an
-    expression for the posterior mean and variance at new query points
-    by using the posterior predictive distribution based on previously
-    observed preferences (no need to simplify since it's intractable
-    analytically). Suggest an approach that can be used to approximate
-    the mean and variance.
-  \end{enumerate}
-\item
-  \textbf{Acquisition Function Adaptation (6 points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{Expected Improvement (EI) for Preferences (Written, 2
-    points):} Explain how the Expected Improvement (EI) acquisition
-    function is adapted in the context of PBO to handle preferences
-    rather than absolute function values. Please read the paper for
-    this.
-  \item
-    \textbf{EI Computation for Pairwise Comparisons (Written, 2
-    points):} Derive the expression for EI when dealing with pairwise
-    comparisons. Show how the computation of EI differs from the
-    standard BO setting and discuss how uncertainty in the GP model is
-    used in this context.
-  \item
-    \textbf{Selection Strategy (Written, 2 points):} Describe how the
-    acquisition function uses the pairwise preference data to select the
-    next query point. Provide a rigorous justification for this
-    selection strategy in terms of maximizing expected information gain.
-  \end{enumerate}
-\item
-  \textbf{Exploration-Exploitation Balance in PBO (6 points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{Exploration Mechanism (Written, 2 points):} Explain how
-    exploration is handled in the PBO framework. Describe how
-    uncertainty in the preference model (the GP posterior) influences
-    the selection of new points for evaluation.
-  \item
-    \textbf{Uncertainty Quantification (Written, 2 points):} Define how
-    the variance in the GP posterior represents uncertainty in the model
-    and show how this uncertainty is updated as new preferences are
-    observed.
-  \item
-    \textbf{Empirical Validation (Written, 2 points):} Design an
-    experiment to empirically validate the balance between exploration
-    and exploitation in PBO. Describe the setup, including the objective
-    function, the experimental conditions, and the evaluation metric for
-    measuring the quality of exploration-exploitation balance.
-  \end{enumerate}
-\item
-  \textbf{Scalability and Practical Considerations (6 points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{Challenges in Preference Feedback (Written, 2 points):}
-    Discuss the challenges associated with preference feedback in
-    real-world applications, such as inconsistency in user preferences
-    and potential biases.
-  \item
-    \textbf{GP Scalability (Written, 2 points):} Explain how the
-    scalability of the GP model affects the performance of PBO,
-    especially as the number of observations increases. Include a
-    discussion on computational complexity and possible solutions.
-  \item
-    \textbf{Extensions for Large-Scale Problems (Written, 2 points):}
-    Propose potential extensions or modifications to improve the
-    applicability of PBO to large-scale optimization problems. For
-    example, discuss the feasibility of sparse GPs or other
-    approximation techniques and evaluate their potential impact on PBO
-    performance.
-  \end{enumerate}
-\item
-  \textbf{Empirical Experimentation (6 points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{Copeland Score (Coding, 2 points):} Implement
-    \texttt{compute\_max\_copeland\_score} in\\
-    \texttt{pbo/forrester\_duel.py}.
-  \item
-    \textbf{Copeland Acquisition (Coding, 4 points):} Implement
-    \texttt{copeland\_acquisition}. Run \texttt{forrester\_duel.py} and
-    briefly discuss any patterns you observe in the chosen duels (black
-    Xs on the heatmap).
-  \end{enumerate}
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-\ImportTok{from}\NormalTok{ sklearn.gaussian\_process }\ImportTok{import}\NormalTok{ GaussianProcessClassifier}
-\ImportTok{from}\NormalTok{ sklearn.gaussian\_process.kernels }\ImportTok{import}\NormalTok{ RBF, ConstantKernel }\ImportTok{as}\NormalTok{ C}
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-\ImportTok{from}\NormalTok{ tqdm }\ImportTok{import}\NormalTok{ tqdm}
-
-\CommentTok{\# Define the Forrester function}
-\KeywordTok{def}\NormalTok{ forrester\_function(x):}
-    \CommentTok{"""}
-\CommentTok{    Evaluates the Forrester function at the given input.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} x (float or numpy.ndarray): Input value(s) in the range [0, 1].}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} float or numpy.ndarray: Evaluated Forrester function value(s).}
-\CommentTok{    """}
-    \ControlFlowTok{return}\NormalTok{ (}\DecValTok{6} \OperatorTok{*}\NormalTok{ x }\OperatorTok{{-}} \DecValTok{2}\NormalTok{)}\OperatorTok{**}\DecValTok{2} \OperatorTok{*}\NormalTok{ np.sin(}\DecValTok{12} \OperatorTok{*}\NormalTok{ x }\OperatorTok{{-}} \DecValTok{4}\NormalTok{)}
-
-\CommentTok{\# Sigmoid function for probabilistic preferences}
-\KeywordTok{def}\NormalTok{ sigmoid(x):}
-    \CommentTok{"""}
-\CommentTok{    Computes the sigmoid function for the given input.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} x (float or numpy.ndarray): Input value(s).}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} float or numpy.ndarray: Sigmoid{-}transformed value(s).}
-\CommentTok{    """}
-    \ControlFlowTok{return} \DecValTok{1} \OperatorTok{/}\NormalTok{ (}\DecValTok{1} \OperatorTok{+}\NormalTok{ np.exp(}\OperatorTok{{-}}\NormalTok{x))}
-
-\CommentTok{\# Simulate duel outcome probabilistically}
-\KeywordTok{def}\NormalTok{ simulate\_duel\_outcome(x, x\_prime):}
-    \CommentTok{"""}
-\CommentTok{    Simulates the outcome of a duel between two candidates based on probabilistic preferences.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} x (float): First candidate\textquotesingle{}s input value.}
-\CommentTok{    {-} x\_prime (float): Second candidate\textquotesingle{}s input value.}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} int: 1 if x wins, 0 otherwise.}
-\CommentTok{    """}
-\NormalTok{    prob }\OperatorTok{=}\NormalTok{ sigmoid(forrester\_function(x\_prime) }\OperatorTok{{-}}\NormalTok{ forrester\_function(x))  }\CommentTok{\# Probability x beats x\textquotesingle{}}
-    \ControlFlowTok{return}\NormalTok{ np.random.choice([}\DecValTok{1}\NormalTok{, }\DecValTok{0}\NormalTok{], p}\OperatorTok{=}\NormalTok{[prob, }\DecValTok{1} \OperatorTok{{-}}\NormalTok{ prob])}
-
-\CommentTok{\# Compute the Soft Copeland score for all candidates (vectorized)}
-\KeywordTok{def}\NormalTok{ compute\_max\_copeland\_score(candidates, gp, landmarks):}
-    \CommentTok{"""}
-\CommentTok{    Computes the maximum Copeland score for given candidates using predicted win probabilities.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} candidates (numpy.ndarray): Array of candidate points.}
-\CommentTok{    {-} gp (GaussianProcessClassifier): Trained Gaussian process classifier for preference modeling.}
-\CommentTok{    {-} landmarks (numpy.ndarray): Array of landmark points used for Monte Carlo approximation.}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} tuple: Maximum Copeland score and the best candidate.}
-\CommentTok{    """}
-    \CommentTok{\# YOUR CODE HERE (\textasciitilde{}6 lines)}
-        \CommentTok{\# 1. Generate all pairs between candidates and landmarks.}
-        \CommentTok{\# 2. Get win probabilities and average}
-        \CommentTok{\# 3. Return appropriate maximum and best candidate.}
-    \ControlFlowTok{pass} 
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-\CommentTok{\# Acquisition function with GP retraining and maximum Copeland score for each outcome}
-\KeywordTok{def}\NormalTok{ copeland\_acquisition(x, x\_prime, x\_candidates, gp, train\_X, train\_y, landmarks, max\_copeland\_score):}
-    \CommentTok{"""}
-\CommentTok{    Computes the acquisition value for a candidate pair by simulating outcomes and retraining the GP.}
-\CommentTok{    }
-\CommentTok{    Args:}
-\CommentTok{    {-} x (float): First value of duel.}
-\CommentTok{    {-} x\_prime (float): Second value of duel.}
-\CommentTok{    {-} x\_candidates (numpy.ndarray): Array of candidate points to evaluate soft Copeland on.}
-\CommentTok{    {-} gp (GaussianProcessClassifier): Trained Gaussian process classifier for preference modeling.}
-\CommentTok{    {-} train\_X (numpy.ndarray): Current training input pairs.}
-\CommentTok{    {-} train\_y (numpy.ndarray): Current training labels.}
-\CommentTok{    {-} landmarks (numpy.ndarray): Array of landmark points used for Monte Carlo approximation.}
-\CommentTok{    {-} max\_copeland\_score (float): Maximum copeland score prior to acquiring any new pair}
-\CommentTok{    }
-\CommentTok{    Returns:}
-\CommentTok{    {-} float: Acquisition value for the given pair (x, x\_prime).}
-\CommentTok{    """}
-    \CommentTok{\# YOUR CODE HERE (\textasciitilde{}14{-}16 lines)}
-        \CommentTok{\# 1. Predict dueling probabilities}
-        \CommentTok{\# 2. Simulate adding (x, x\textquotesingle{}) with y=1 (x beats x\textquotesingle{}) and fit GP }
-        \CommentTok{\# 3. Simulate adding (x, x\textquotesingle{}) with y=0 (x\textquotesingle{} beats x) and fit GP }
-        \CommentTok{\# 4. Compute expected improvement in max Copeland score}
-        \CommentTok{\# 5. Return weighted acquisition value}
-    \ControlFlowTok{pass}
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-\ControlFlowTok{if} \VariableTok{\_\_name\_\_} \OperatorTok{==} \StringTok{"\_\_main\_\_"}\NormalTok{:}
-    \CommentTok{\# Initialization}
-\NormalTok{    np.random.seed(}\DecValTok{42}\NormalTok{)}
-\NormalTok{    kernel }\OperatorTok{=}\NormalTok{ C(}\FloatTok{28.0}\NormalTok{, constant\_value\_bounds}\OperatorTok{=}\StringTok{\textquotesingle{}fixed\textquotesingle{}}\NormalTok{) }\OperatorTok{*}\NormalTok{ RBF(length\_scale}\OperatorTok{=}\FloatTok{0.15}\NormalTok{, length\_scale\_bounds}\OperatorTok{=}\StringTok{\textquotesingle{}fixed\textquotesingle{}}\NormalTok{)}
-\NormalTok{    gp }\OperatorTok{=}\NormalTok{ GaussianProcessClassifier(kernel}\OperatorTok{=}\NormalTok{kernel)}
-
-    \CommentTok{\# Generate initial training data (random pairs)}
-\NormalTok{    train\_X }\OperatorTok{=}\NormalTok{ np.array([[}\DecValTok{0}\NormalTok{, }\DecValTok{0}\NormalTok{], [}\DecValTok{0}\NormalTok{, }\DecValTok{0}\NormalTok{]]) }\CommentTok{\#np.random.uniform(0, 1, (10, 2))  \# 20 random dueling pairs [x, x\textquotesingle{}]}
-\NormalTok{    train\_y }\OperatorTok{=}\NormalTok{ np.array([simulate\_duel\_outcome(pair[}\DecValTok{0}\NormalTok{], pair[}\DecValTok{1}\NormalTok{]) }\ControlFlowTok{for}\NormalTok{ pair }\KeywordTok{in}\NormalTok{ train\_X])}
-
-    \CommentTok{\# Fixed landmark points and their function values}
-\NormalTok{    landmarks }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{30}\NormalTok{)  }\CommentTok{\# 30 fixed landmarks}
-
-    \CommentTok{\# Generate candidate pairs for optimization}
-\NormalTok{    x\_candidates }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{30}\NormalTok{)  }\CommentTok{\# Reduced grid for efficiency}
-\NormalTok{    X, X\_prime }\OperatorTok{=}\NormalTok{ np.meshgrid(x\_candidates, x\_candidates)}
-\NormalTok{    candidate\_pairs }\OperatorTok{=}\NormalTok{ np.c\_[X.ravel(), X\_prime.ravel()]}
-
-    \CommentTok{\# Optimization loop}
-\NormalTok{    n\_iterations }\OperatorTok{=} \DecValTok{20}
-    \ControlFlowTok{for}\NormalTok{ iteration }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(n\_iterations):}
-        \CommentTok{\# Retrain the GP with current training data}
-\NormalTok{        gp.fit(train\_X, train\_y)}
-
-        \CommentTok{\# Compute global maximum Copeland score}
-\NormalTok{        max\_copeland\_score, condorcet\_winner }\OperatorTok{=}\NormalTok{ compute\_max\_copeland\_score(x\_candidates, gp, landmarks)}
-        \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f"Condorcet winner iteration }\SpecialCharTok{\{}\NormalTok{iteration}\SpecialCharTok{\}}\SpecialStringTok{: }\SpecialCharTok{\{}\NormalTok{condorcet\_winner}\SpecialCharTok{\}}\SpecialStringTok{ with soft{-}Copeland score }\SpecialCharTok{\{}\NormalTok{max\_copeland\_score}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{)}
-
-        \CommentTok{\# Evaluate acquisition values for all candidate pairs}
-\NormalTok{        acquisition\_values }\OperatorTok{=}\NormalTok{ np.zeros(}\BuiltInTok{len}\NormalTok{(candidate\_pairs))}
-        \ControlFlowTok{for}\NormalTok{ idx, (x, x\_prime) }\KeywordTok{in}\NormalTok{ tqdm(}\BuiltInTok{enumerate}\NormalTok{(candidate\_pairs), total}\OperatorTok{=}\BuiltInTok{len}\NormalTok{(candidate\_pairs)):}
-\NormalTok{            acquisition\_values[idx] }\OperatorTok{=}\NormalTok{ copeland\_acquisition(}
-\NormalTok{                x, x\_prime, x\_candidates, gp, train\_X, train\_y, landmarks, max\_copeland\_score}
-\NormalTok{            )}
-
-        \CommentTok{\# Select the pair with the highest acquisition value}
-\NormalTok{        best\_idx }\OperatorTok{=}\NormalTok{ np.argmax(acquisition\_values)}
-\NormalTok{        next\_x, next\_x\_prime }\OperatorTok{=}\NormalTok{ candidate\_pairs[best\_idx]}
-
-        \CommentTok{\# Simulate the actual outcome of the duel}
-\NormalTok{        outcome }\OperatorTok{=}\NormalTok{ simulate\_duel\_outcome(next\_x, next\_x\_prime)}
-
-        \CommentTok{\# Update training data with the new duel outcome}
-\NormalTok{        train\_X }\OperatorTok{=}\NormalTok{ np.vstack([train\_X, [next\_x, next\_x\_prime]])}
-\NormalTok{        train\_y }\OperatorTok{=}\NormalTok{ np.append(train\_y, outcome)}
-
-    \CommentTok{\# Generate heatmaps}
-\NormalTok{    x }\OperatorTok{=}\NormalTok{ np.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{100}\NormalTok{)}
-\NormalTok{    X, X\_prime }\OperatorTok{=}\NormalTok{ np.meshgrid(x, x)}
-\NormalTok{    pairs }\OperatorTok{=}\NormalTok{ np.c\_[X.ravel(), X\_prime.ravel()]}
-
-    \CommentTok{\# Ground Truth Preference Probabilities}
-\NormalTok{    gt\_preferences }\OperatorTok{=}\NormalTok{ np.array([}
-\NormalTok{        sigmoid(forrester\_function(x\_prime) }\OperatorTok{{-}}\NormalTok{ forrester\_function(x))}
-        \ControlFlowTok{for}\NormalTok{ x, x\_prime }\KeywordTok{in}\NormalTok{ pairs}
-\NormalTok{    ]).reshape(X.shape)}
-
-    \CommentTok{\# GP{-}Predicted Preferences}
-\NormalTok{    gp\_predictions }\OperatorTok{=}\NormalTok{ gp.predict\_proba(pairs)[:, }\DecValTok{1}\NormalTok{].reshape(X.shape)}
-
-    \CommentTok{\# Plot Ground Truth Preference Heatmap}
-\NormalTok{    plt.figure(figsize}\OperatorTok{=}\NormalTok{(}\DecValTok{12}\NormalTok{, }\DecValTok{6}\NormalTok{))}
-\NormalTok{    plt.subplot(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{1}\NormalTok{)}
-\NormalTok{    plt.contourf(X, X\_prime, gt\_preferences, levels}\OperatorTok{=}\DecValTok{50}\NormalTok{, cmap}\OperatorTok{=}\StringTok{\textquotesingle{}jet\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.colorbar(label}\OperatorTok{=}\StringTok{"Ground Truth Preference Probability"}\NormalTok{)}
-\NormalTok{    plt.title(}\StringTok{"Ground Truth Preference Heatmap"}\NormalTok{)}
-\NormalTok{    plt.xlabel(}\StringTok{"x"}\NormalTok{)}
-\NormalTok{    plt.ylabel(}\StringTok{"x\textquotesingle{}"}\NormalTok{)}
-
-    \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Chosen duels: }\SpecialCharTok{\{}\NormalTok{train\_X[}\OperatorTok{{-}}\NormalTok{n\_iterations:]}\SpecialCharTok{\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-
-    \CommentTok{\# Plot GP{-}Predicted Preference Heatmap}
-\NormalTok{    plt.subplot(}\DecValTok{1}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{2}\NormalTok{)}
-\NormalTok{    plt.contourf(X, X\_prime, gp\_predictions, levels}\OperatorTok{=}\DecValTok{50}\NormalTok{, cmap}\OperatorTok{=}\StringTok{\textquotesingle{}jet\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.colorbar(label}\OperatorTok{=}\StringTok{"GP{-}Predicted Preference Probability"}\NormalTok{)}
-\NormalTok{    plt.scatter(train\_X[}\OperatorTok{{-}}\NormalTok{n\_iterations:, }\DecValTok{0}\NormalTok{], train\_X[}\OperatorTok{{-}}\NormalTok{n\_iterations:, }\DecValTok{1}\NormalTok{], c}\OperatorTok{=}\StringTok{\textquotesingle{}black\textquotesingle{}}\NormalTok{, label}\OperatorTok{=}\StringTok{"Last Iterations"}\NormalTok{, s}\OperatorTok{=}\DecValTok{30}\NormalTok{, marker}\OperatorTok{=}\StringTok{\textquotesingle{}x\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.title(}\StringTok{"GP{-}Predicted Preference Heatmap"}\NormalTok{)}
-\NormalTok{    plt.xlabel(}\StringTok{"x"}\NormalTok{)}
-\NormalTok{    plt.ylabel(}\StringTok{"x\textquotesingle{}"}\NormalTok{)}
-
-\NormalTok{    plt.tight\_layout()}
-\NormalTok{    plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\subsection*{Question 2: Linear Dueling Bandit (30
-points)}\label{sec-question-2-linear-dueling-bandit-30-points}
-\addcontentsline{toc}{subsection}{Question 2: Linear Dueling Bandit (30
-points)}
-
-In the linear dueling bandit problem, feedback is provided through
-pairwise comparisons between actions, rather than direct rewards.
-Consider a finite set of \(K\) actions, each represented by a feature
-vector \(x_1, x_2, \dots, x_K \in \mathbb{R}^d\). Let the unknown
-preference scores be \(f(x_i) = \theta^\top x_i\) and
-\(f(x_j) = \theta^\top x_j\), where \(\theta \in \mathbb{R}^d\) is an
-unknown parameter vector. The goal is to identify the best action by
-iteratively comparing pairs of actions while minimizing cumulative
-regret. Using qEUBO from \url{https://arxiv.org/pdf/2303.15746},
-complete the following:
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\item
-  \textbf{Acquisition Functions for Regret Minimization (Written, 10
-  points)}: Write out the expression for the acquisition function
-  Expected Improvement discussed in Q1 and qEUBO in the context of the
-  linear dueling bandit. Discuss conditions under which each acquisition
-  function could outperform the others in minimizing cumulative regret.
-\item
-  \textbf{Experimental Evaluation of Acquisition Functions (Written +
-  Coding, 10 points)}: Benchmark the performance of the two acquisition
-  functions experimentally.
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    Finish implementing the acquisition functions in a linear dueling
-    bandit simulation with \(K = 10\) and \(d = 5\), using synthetic
-    data by completing the function \texttt{calculate\_regret\_from\_gp}
-    in \texttt{linear\_dueling/run.py}.
-  \item
-    Measure and compare cumulative regret over \(T = 200\) rounds for
-    each acquisition function.
-  \item
-    Report and analyze the empirical regret curves, discussing any
-    notable performance differences.
-  \end{enumerate}
-\item
-  \textbf{Effect of Dimensionality on Regret (Written + Coding, 10
-  points)}: Analyze how increasing feature dimensionality impacts
-  regret.
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    Experimentally evaluate the regret for different values of \(d\)
-    (e.g., \(d = 5, 10, 20\)) while keeping \(K\) constant.
-  \item
-    Plot the regret against \(d\) and explain any observed trends.
-  \end{enumerate}
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{from}\NormalTok{ \_\_future\_\_ }\ImportTok{import}\NormalTok{ annotations}
-
-\ImportTok{from}\NormalTok{ typing }\ImportTok{import}\NormalTok{ Optional}
-\ImportTok{import}\NormalTok{ itertools}
-
-\ImportTok{import}\NormalTok{ torch}
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-\ImportTok{from}\NormalTok{ torch }\ImportTok{import}\NormalTok{ Tensor}
-\ImportTok{from}\NormalTok{ tqdm }\ImportTok{import}\NormalTok{ tqdm}
-\ImportTok{from}\NormalTok{ botorch.acquisition.preference }\ImportTok{import}\NormalTok{ qExpectedUtilityOfBestOption}
-\ImportTok{from}\NormalTok{ botorch.acquisition.logei }\ImportTok{import}\NormalTok{ qLogExpectedImprovement}
-\ImportTok{from}\NormalTok{ botorch.fit }\ImportTok{import}\NormalTok{ fit\_gpytorch\_mll}
-\ImportTok{from}\NormalTok{ botorch.models.gpytorch }\ImportTok{import}\NormalTok{ GPyTorchModel}
-\ImportTok{from}\NormalTok{ botorch.utils.sampling }\ImportTok{import}\NormalTok{ draw\_sobol\_samples}
-\ImportTok{from}\NormalTok{ botorch.sampling }\ImportTok{import}\NormalTok{ SobolQMCNormalSampler}
-\ImportTok{from}\NormalTok{ botorch.posteriors.gpytorch }\ImportTok{import}\NormalTok{ GPyTorchPosterior}
-\ImportTok{from}\NormalTok{ gpytorch.distributions }\ImportTok{import}\NormalTok{ base\_distributions}
-\ImportTok{from}\NormalTok{ gpytorch.likelihoods }\ImportTok{import}\NormalTok{ Likelihood}
-\ImportTok{from}\NormalTok{ gpytorch.distributions }\ImportTok{import}\NormalTok{ MultivariateNormal}
-\ImportTok{from}\NormalTok{ gpytorch.kernels }\ImportTok{import}\NormalTok{ Kernel, RBFKernel, ScaleKernel}
-\ImportTok{from}\NormalTok{ gpytorch.mlls.variational\_elbo }\ImportTok{import}\NormalTok{ VariationalELBO}
-\ImportTok{from}\NormalTok{ gpytorch.means }\ImportTok{import}\NormalTok{ ConstantMean}
-\ImportTok{from}\NormalTok{ gpytorch.models }\ImportTok{import}\NormalTok{ ApproximateGP}
-\ImportTok{from}\NormalTok{ gpytorch.priors.torch\_priors }\ImportTok{import}\NormalTok{ GammaPrior}
-\ImportTok{from}\NormalTok{ gpytorch.variational }\ImportTok{import}\NormalTok{ (}
-\NormalTok{    CholeskyVariationalDistribution,}
-\NormalTok{    UnwhitenedVariationalStrategy,}
-\NormalTok{    VariationalStrategy,}
-\NormalTok{)}
-
-
-\KeywordTok{class}\NormalTok{ PreferentialSoftmaxLikelihood(Likelihood):}
-    \CommentTok{r"""}
-\CommentTok{    Implements the softmax likelihood used for GP{-}based preference learning.}
-
-\CommentTok{    .. math::}
-\CommentTok{        p(\textbackslash{}mathbf y \textbackslash{}mid \textbackslash{}mathbf f) = }\CharTok{\textbackslash{}t}\CommentTok{ext\{Softmax\} \textbackslash{}left( \textbackslash{}mathbf f }\CharTok{\textbackslash{}r}\CommentTok{ight)}
-
-\CommentTok{    :param int num\_alternatives: Number of alternatives (i.e., q).}
-\CommentTok{    """}
-
-    \KeywordTok{def} \FunctionTok{\_\_init\_\_}\NormalTok{(}\VariableTok{self}\NormalTok{, num\_alternatives):}
-        \BuiltInTok{super}\NormalTok{().}\FunctionTok{\_\_init\_\_}\NormalTok{()}
-        \VariableTok{self}\NormalTok{.num\_alternatives }\OperatorTok{=}\NormalTok{ num\_alternatives}
-        \VariableTok{self}\NormalTok{.noise }\OperatorTok{=}\NormalTok{ torch.tensor(}\FloatTok{1e{-}4}\NormalTok{)  }\CommentTok{\# This is only used to draw RFFs{-}based}
-        \CommentTok{\# samples. We set it close to zero because we want noise{-}free samples}
-        \VariableTok{self}\NormalTok{.sampler }\OperatorTok{=}\NormalTok{ SobolQMCNormalSampler(}
-\NormalTok{            sample\_shape}\OperatorTok{=}\NormalTok{torch.Size([}\DecValTok{512}\NormalTok{]))  }\CommentTok{\# This allows for}
-        \CommentTok{\# SAA{-}based optimization of the ELBO}
-
-    \KeywordTok{def}\NormalTok{ \_draw\_likelihood\_samples(}
-        \VariableTok{self}\NormalTok{, function\_dist, }\OperatorTok{*}\NormalTok{args, sample\_shape}\OperatorTok{=}\VariableTok{None}\NormalTok{, }\OperatorTok{**}\NormalTok{kwargs}
-\NormalTok{    ):}
-\NormalTok{        function\_samples }\OperatorTok{=} \VariableTok{self}\NormalTok{.sampler(}
-\NormalTok{            GPyTorchPosterior(function\_dist)).squeeze(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{)}
-        \ControlFlowTok{return} \VariableTok{self}\NormalTok{.forward(function\_samples, }\OperatorTok{*}\NormalTok{args, }\OperatorTok{**}\NormalTok{kwargs)}
-
-    \KeywordTok{def}\NormalTok{ forward(}\VariableTok{self}\NormalTok{, function\_samples, }\OperatorTok{*}\NormalTok{params, }\OperatorTok{**}\NormalTok{kwargs):}
-\NormalTok{        function\_samples }\OperatorTok{=}\NormalTok{ function\_samples.reshape(}
-\NormalTok{            function\_samples.shape[:}\OperatorTok{{-}}\DecValTok{1}\NormalTok{]}
-            \OperatorTok{+}\NormalTok{ torch.Size(}
-\NormalTok{                (}
-                    \BuiltInTok{int}\NormalTok{(function\_samples.shape[}\OperatorTok{{-}}\DecValTok{1}\NormalTok{] }\OperatorTok{/} \VariableTok{self}\NormalTok{.num\_alternatives),}
-                    \VariableTok{self}\NormalTok{.num\_alternatives,}
-\NormalTok{                )}
-\NormalTok{            )}
-\NormalTok{        )  }\CommentTok{\# Reshape samples as if they came from a multi{-}output model (with \textasciigrave{}q\textasciigrave{} outputs)}
-\NormalTok{        num\_alternatives }\OperatorTok{=}\NormalTok{ function\_samples.shape[}\OperatorTok{{-}}\DecValTok{1}\NormalTok{]}
-
-        \ControlFlowTok{if}\NormalTok{ num\_alternatives }\OperatorTok{!=} \VariableTok{self}\NormalTok{.num\_alternatives:}
-            \ControlFlowTok{raise} \PreprocessorTok{RuntimeError}\NormalTok{(}\StringTok{"There should be }\SpecialCharTok{\%d}\StringTok{ points"} \OperatorTok{\%}
-                               \VariableTok{self}\NormalTok{.num\_alternatives)}
-
-\NormalTok{        res }\OperatorTok{=}\NormalTok{ base\_distributions.Categorical(}
-\NormalTok{            logits}\OperatorTok{=}\NormalTok{function\_samples)  }\CommentTok{\# Passing the}
-        \CommentTok{\# function values as logits recovers the softmax likelihood}
-        \ControlFlowTok{return}\NormalTok{ res}
-
-
-\KeywordTok{class}\NormalTok{ VariationalPreferentialGP(GPyTorchModel, ApproximateGP):}
-    \KeywordTok{def} \FunctionTok{\_\_init\_\_}\NormalTok{(}
-        \VariableTok{self}\NormalTok{,}
-\NormalTok{        queries: Tensor,}
-\NormalTok{        responses: Tensor,}
-\NormalTok{        use\_withening: }\BuiltInTok{bool} \OperatorTok{=} \VariableTok{True}\NormalTok{,}
-\NormalTok{        covar\_module: Optional[Kernel] }\OperatorTok{=} \VariableTok{None}\NormalTok{,}
-\NormalTok{    ) }\OperatorTok{{-}\textgreater{}} \VariableTok{None}\NormalTok{:}
-        \CommentTok{r"""}
-\CommentTok{        Args:}
-\CommentTok{            queries: A \textasciigrave{}n x q x d\textasciigrave{} tensor of training inputs. Each of the \textasciigrave{}n\textasciigrave{} queries is constituted}
-\CommentTok{                by \textasciigrave{}q\textasciigrave{} \textasciigrave{}d\textasciigrave{}{-}dimensional decision vectors.}
-\CommentTok{            responses: A \textasciigrave{}n x 1\textasciigrave{} tensor of training outputs. Each of the \textasciigrave{}n\textasciigrave{} responses is an integer}
-\CommentTok{                between 0 and \textasciigrave{}q{-}1\textasciigrave{} indicating the decision vector selected by the user.}
-\CommentTok{            use\_withening: If true, use withening to enhance variational inference.}
-\CommentTok{            covar\_module: The module computing the covariance matrix.}
-\CommentTok{        """}
-        \VariableTok{self}\NormalTok{.queries }\OperatorTok{=}\NormalTok{ queries}
-        \VariableTok{self}\NormalTok{.responses }\OperatorTok{=}\NormalTok{ responses}
-        \VariableTok{self}\NormalTok{.input\_dim }\OperatorTok{=}\NormalTok{ queries.shape[}\OperatorTok{{-}}\DecValTok{1}\NormalTok{]}
-        \VariableTok{self}\NormalTok{.q }\OperatorTok{=}\NormalTok{ queries.shape[}\OperatorTok{{-}}\DecValTok{2}\NormalTok{]}
-        \VariableTok{self}\NormalTok{.num\_data }\OperatorTok{=}\NormalTok{ queries.shape[}\OperatorTok{{-}}\DecValTok{3}\NormalTok{]}
-\NormalTok{        train\_x }\OperatorTok{=}\NormalTok{ queries.reshape(}
-\NormalTok{            queries.shape[}\DecValTok{0}\NormalTok{] }\OperatorTok{*}\NormalTok{ queries.shape[}\DecValTok{1}\NormalTok{], queries.shape[}\DecValTok{2}\NormalTok{]}
-\NormalTok{        )  }\CommentTok{\# Reshape queries in the form of "standard training inputs"}
-\NormalTok{        train\_y }\OperatorTok{=}\NormalTok{ responses.squeeze(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{)  }\CommentTok{\# Squeeze out output dimension}
-\NormalTok{        bounds }\OperatorTok{=}\NormalTok{ torch.tensor(}
-\NormalTok{            [[}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{] }\ControlFlowTok{for}\NormalTok{ \_ }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\VariableTok{self}\NormalTok{.input\_dim)], dtype}\OperatorTok{=}\NormalTok{torch.double}
-\NormalTok{        ).T  }\CommentTok{\# This assumes the input space has been normalized beforehand}
-        \CommentTok{\# Construct variational distribution and strategy}
-        \ControlFlowTok{if}\NormalTok{ use\_withening:}
-\NormalTok{            inducing\_points }\OperatorTok{=}\NormalTok{ draw\_sobol\_samples(}
-\NormalTok{                bounds}\OperatorTok{=}\NormalTok{bounds,}
-\NormalTok{                n}\OperatorTok{=}\DecValTok{2} \OperatorTok{*} \VariableTok{self}\NormalTok{.input\_dim,}
-\NormalTok{                q}\OperatorTok{=}\DecValTok{1}\NormalTok{,}
-\NormalTok{                seed}\OperatorTok{=}\DecValTok{0}\NormalTok{,}
-\NormalTok{            ).squeeze(}\DecValTok{1}\NormalTok{)}
-\NormalTok{            inducing\_points }\OperatorTok{=}\NormalTok{ torch.cat([inducing\_points, train\_x], dim}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
-\NormalTok{            variational\_distribution }\OperatorTok{=}\NormalTok{ CholeskyVariationalDistribution(}
-\NormalTok{                inducing\_points.size(}\OperatorTok{{-}}\DecValTok{2}\NormalTok{)}
-\NormalTok{            )}
-\NormalTok{            variational\_strategy }\OperatorTok{=}\NormalTok{ VariationalStrategy(}
-                \VariableTok{self}\NormalTok{,}
-\NormalTok{                inducing\_points,}
-\NormalTok{                variational\_distribution,}
-\NormalTok{                learn\_inducing\_locations}\OperatorTok{=}\VariableTok{False}\NormalTok{,}
-\NormalTok{            )}
-        \ControlFlowTok{else}\NormalTok{:}
-\NormalTok{            inducing\_points }\OperatorTok{=}\NormalTok{ train\_x}
-\NormalTok{            variational\_distribution }\OperatorTok{=}\NormalTok{ CholeskyVariationalDistribution(}
-\NormalTok{                inducing\_points.size(}\OperatorTok{{-}}\DecValTok{2}\NormalTok{)}
-\NormalTok{            )}
-\NormalTok{            variational\_strategy }\OperatorTok{=}\NormalTok{ UnwhitenedVariationalStrategy(}
-                \VariableTok{self}\NormalTok{,}
-\NormalTok{                inducing\_points,}
-\NormalTok{                variational\_distribution,}
-\NormalTok{                learn\_inducing\_locations}\OperatorTok{=}\VariableTok{False}\NormalTok{,}
-\NormalTok{            )}
-        \BuiltInTok{super}\NormalTok{().}\FunctionTok{\_\_init\_\_}\NormalTok{(variational\_strategy)}
-        \VariableTok{self}\NormalTok{.likelihood }\OperatorTok{=}\NormalTok{ PreferentialSoftmaxLikelihood(}
-\NormalTok{            num\_alternatives}\OperatorTok{=}\VariableTok{self}\NormalTok{.q)}
-        \VariableTok{self}\NormalTok{.mean\_module }\OperatorTok{=}\NormalTok{ ConstantMean()}
-\NormalTok{        scales }\OperatorTok{=}\NormalTok{ bounds[}\DecValTok{1}\NormalTok{, :] }\OperatorTok{{-}}\NormalTok{ bounds[}\DecValTok{0}\NormalTok{, :]}
-
-        \ControlFlowTok{if}\NormalTok{ covar\_module }\KeywordTok{is} \VariableTok{None}\NormalTok{:}
-            \VariableTok{self}\NormalTok{.covar\_module }\OperatorTok{=}\NormalTok{ ScaleKernel(}
-\NormalTok{                RBFKernel(}
-\NormalTok{                    ard\_num\_dims}\OperatorTok{=}\VariableTok{self}\NormalTok{.input\_dim,}
-\NormalTok{                    lengthscale\_prior}\OperatorTok{=}\NormalTok{GammaPrior(}\FloatTok{3.0}\NormalTok{, }\FloatTok{6.0} \OperatorTok{/}\NormalTok{ scales),}
-\NormalTok{                ),}
-\NormalTok{                outputscale\_prior}\OperatorTok{=}\NormalTok{GammaPrior(}\FloatTok{2.0}\NormalTok{, }\FloatTok{0.15}\NormalTok{),}
-\NormalTok{            )}
-        \ControlFlowTok{else}\NormalTok{:}
-            \VariableTok{self}\NormalTok{.covar\_module }\OperatorTok{=}\NormalTok{ covar\_module}
-        \VariableTok{self}\NormalTok{.\_num\_outputs }\OperatorTok{=} \DecValTok{1}
-        \VariableTok{self}\NormalTok{.train\_inputs }\OperatorTok{=}\NormalTok{ (train\_x,)}
-        \VariableTok{self}\NormalTok{.train\_targets }\OperatorTok{=}\NormalTok{ train\_y}
-
-    \KeywordTok{def}\NormalTok{ forward(}\VariableTok{self}\NormalTok{, X: Tensor) }\OperatorTok{{-}\textgreater{}}\NormalTok{ MultivariateNormal:}
-\NormalTok{        mean\_X }\OperatorTok{=} \VariableTok{self}\NormalTok{.mean\_module(X)}
-\NormalTok{        covar\_X }\OperatorTok{=} \VariableTok{self}\NormalTok{.covar\_module(X)}
-        \ControlFlowTok{return}\NormalTok{ MultivariateNormal(mean\_X, covar\_X)}
-
-    \AttributeTok{@property}
-    \KeywordTok{def}\NormalTok{ num\_outputs(}\VariableTok{self}\NormalTok{) }\OperatorTok{{-}\textgreater{}} \BuiltInTok{int}\NormalTok{:}
-        \CommentTok{r"""The number of outputs of the model."""}
-        \ControlFlowTok{return} \DecValTok{1}
-
-
-\CommentTok{\# Objective function for pairwise comparisons}
-\KeywordTok{def}\NormalTok{ f(x):}
-    \CommentTok{"""}
-\CommentTok{    Computes the preference score for a given action.}
-
-\CommentTok{    Args:}
-\CommentTok{        x (torch.Tensor): A feature vector of dimension \textasciigrave{}d\textasciigrave{}.}
-
-\CommentTok{    Returns:}
-\CommentTok{        torch.Tensor: The computed preference score.}
-\CommentTok{    """}
-    \ControlFlowTok{return}\NormalTok{ x }\OperatorTok{@}\NormalTok{ theta\_true}
-
-\CommentTok{\# Simulate pairwise comparisons}
-
-
-\KeywordTok{def}\NormalTok{ simulate\_comparison(x1, x2):}
-    \CommentTok{"""}
-\CommentTok{    Simulates a pairwise comparison between two actions based on their preference scores.}
-
-\CommentTok{    Args:}
-\CommentTok{        x1 (torch.Tensor): Feature vector of the first action.}
-\CommentTok{        x2 (torch.Tensor): Feature vector of the second action.}
-
-\CommentTok{    Returns:}
-\CommentTok{        torch.Tensor: The feature vector of the preferred action.}
-\CommentTok{    """}
-\NormalTok{    prob\_x1 }\OperatorTok{=}\NormalTok{ torch.sigmoid(f(x1) }\OperatorTok{{-}}\NormalTok{ f(x2))}
-    \ControlFlowTok{return}\NormalTok{ x1 }\ControlFlowTok{if}\NormalTok{ torch.rand(}\DecValTok{1}\NormalTok{).item() }\OperatorTok{\textless{}}\NormalTok{ prob\_x1 }\ControlFlowTok{else}\NormalTok{ x2}
-
-\CommentTok{\# Function to fit a Variational GP model}
-
-
-\KeywordTok{def}\NormalTok{ fit\_variational\_gp(train\_X, train\_Y):}
-    \CommentTok{"""}
-\CommentTok{    Fits a Variational Gaussian Process (GP) model to the given training data.}
-
-\CommentTok{    Args:}
-\CommentTok{        train\_X (torch.Tensor): Training feature pairs of shape [n, 2, d].}
-\CommentTok{        train\_Y (torch.Tensor): Training preferences of shape [n, 1].}
-
-\CommentTok{    Returns:}
-\CommentTok{        VariationalPreferentialGP: A fitted GP model.}
-\CommentTok{    """}
-\NormalTok{    queries }\OperatorTok{=}\NormalTok{ train\_X.reshape(train\_X.shape[}\DecValTok{0}\NormalTok{], }\DecValTok{2}\NormalTok{, d)}
-\NormalTok{    responses }\OperatorTok{=}\NormalTok{ train\_Y}
-    \ControlFlowTok{return}\NormalTok{ fit\_model(queries, responses)}
-
-
-\KeywordTok{def}\NormalTok{ fit\_model(queries, responses):}
-    \CommentTok{"""}
-\CommentTok{    Internal helper to train a VariationalPreferentialGP.}
-
-\CommentTok{    Args:}
-\CommentTok{        queries (torch.Tensor): Training feature pairs.}
-\CommentTok{        responses (torch.Tensor): Training responses (preferences).}
-
-\CommentTok{    Returns:}
-\CommentTok{        VariationalPreferentialGP: Trained GP model.}
-\CommentTok{    """}
-\NormalTok{    model }\OperatorTok{=}\NormalTok{ VariationalPreferentialGP(queries, responses)}
-\NormalTok{    model.train()}
-\NormalTok{    model.likelihood.train()}
-\NormalTok{    mll }\OperatorTok{=}\NormalTok{ VariationalELBO(}
-\NormalTok{        likelihood}\OperatorTok{=}\NormalTok{model.likelihood,}
-\NormalTok{        model}\OperatorTok{=}\NormalTok{model,}
-\NormalTok{        num\_data}\OperatorTok{=}\DecValTok{2} \OperatorTok{*}\NormalTok{ model.num\_data,}
-\NormalTok{    )}
-\NormalTok{    fit\_gpytorch\_mll(mll)}
-\NormalTok{    model.}\BuiltInTok{eval}\NormalTok{()}
-\NormalTok{    model.likelihood.}\BuiltInTok{eval}\NormalTok{()}
-    \ControlFlowTok{return}\NormalTok{ model}
-
-\CommentTok{\# Acquisition function definition}
-
-
-\KeywordTok{def}\NormalTok{ get\_acquisition\_functions(gp):}
-    \CommentTok{"""}
-\CommentTok{    Returns acquisition functions (qLogEI and qEUBO) for a given GP model.}
-
-\CommentTok{    Args:}
-\CommentTok{        gp (VariationalPreferentialGP): The fitted GP model.}
-
-\CommentTok{    Returns:}
-\CommentTok{        tuple: qLogExpectedImprovement and qExpectedUtilityOfBestOption acquisition functions.}
-\CommentTok{    """}
-    \ControlFlowTok{with}\NormalTok{ torch.no\_grad():}
-\NormalTok{        posterior }\OperatorTok{=}\NormalTok{ gp.posterior(gp.train\_inputs[}\DecValTok{0}\NormalTok{])}
-\NormalTok{        best\_f }\OperatorTok{=}\NormalTok{ posterior.mean.squeeze(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{).}\BuiltInTok{max}\NormalTok{()}
-
-\NormalTok{    qLogEI }\OperatorTok{=}\NormalTok{ qLogExpectedImprovement(model}\OperatorTok{=}\NormalTok{gp, best\_f}\OperatorTok{=}\NormalTok{best\_f)}
-\NormalTok{    qEUBO }\OperatorTok{=}\NormalTok{ qExpectedUtilityOfBestOption(pref\_model}\OperatorTok{=}\NormalTok{gp)}
-    \ControlFlowTok{return}\NormalTok{ qLogEI, qEUBO}
-
-\CommentTok{\# Evaluate acquisition function on pairs}
-
-
-\KeywordTok{def}\NormalTok{ evaluate\_acquisition\_on\_pairs(acq\_function, arms):}
-    \CommentTok{"""}
-\CommentTok{    Computes acquisition values for all possible pairs of arms.}
-
-\CommentTok{    Args:}
-\CommentTok{        acq\_function: The acquisition function to evaluate.}
-\CommentTok{        arms (torch.Tensor): All available arms (feature vectors).}
-
-\CommentTok{    Returns:}
-\CommentTok{        tuple: A list of pairs and their corresponding acquisition values.}
-\CommentTok{    """}
-\NormalTok{    pairs }\OperatorTok{=} \BuiltInTok{list}\NormalTok{(itertools.combinations(arms, }\DecValTok{2}\NormalTok{))}
-\NormalTok{    pair\_values }\OperatorTok{=}\NormalTok{ []}
-    \ControlFlowTok{with}\NormalTok{ torch.no\_grad():}
-        \ControlFlowTok{for}\NormalTok{ x1, x2 }\KeywordTok{in}\NormalTok{ pairs:}
-\NormalTok{            pair }\OperatorTok{=}\NormalTok{ torch.stack([x1, x2]).unsqueeze(}\DecValTok{0}\NormalTok{)}
-\NormalTok{            pair\_values.append(acq\_function(pair))}
-    \ControlFlowTok{return}\NormalTok{ pairs, torch.tensor(pair\_values)}
-
-\CommentTok{\# Regret calculation}
-
-
-\KeywordTok{def}\NormalTok{ calculate\_regret\_from\_gp(gp, actions):}
-    \CommentTok{"""}
-\CommentTok{    Computes the regret for the current GP model.}
-
-\CommentTok{    Args:}
-\CommentTok{        gp (VariationalPreferentialGP): The fitted GP model.}
-\CommentTok{        actions (torch.Tensor): Feature vectors of arms.}
-
-\CommentTok{    Returns:}
-\CommentTok{        torch.Tensor: The calculated regret.}
-\CommentTok{    """}
-    \CommentTok{\# YOUR CODE HERE (\textasciitilde{}6 lines)}
-    \CommentTok{\# Compare the ground truth optimal arm to the GP\textquotesingle{}s believed best arm}
-    \CommentTok{\# Hint: To find GP believed best arm in expectation, use gp.posterior which returns with a mean property.}
-    \ControlFlowTok{pass}
-    \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-
-\ControlFlowTok{if} \VariableTok{\_\_name\_\_} \OperatorTok{==} \StringTok{"\_\_main\_\_"}\NormalTok{:}
-    \CommentTok{\# Set default tensor precision}
-\NormalTok{    torch.set\_default\_dtype(torch.double)}
-
-    \CommentTok{\# Problem settings}
-\NormalTok{    torch.manual\_seed(}\DecValTok{55}\NormalTok{)}
-\NormalTok{    K }\OperatorTok{=} \DecValTok{30}  \CommentTok{\# Number of arms (discrete choices)}
-\NormalTok{    d }\OperatorTok{=} \DecValTok{2}   \CommentTok{\# Dimensionality of feature vectors}
-\NormalTok{    T }\OperatorTok{=} \DecValTok{100}  \CommentTok{\# Number of rounds (iterations)}
-\NormalTok{    bounds }\OperatorTok{=}\NormalTok{ torch.tensor([[}\FloatTok{0.0}\NormalTok{] }\OperatorTok{*}\NormalTok{ d, [}\FloatTok{1.0}\NormalTok{] }\OperatorTok{*}\NormalTok{ d])  }\CommentTok{\# Bounds for action space}
-
-    \CommentTok{\# Generate random actions (feature vectors)}
-\NormalTok{    actions }\OperatorTok{=}\NormalTok{ torch.rand(K, d)}
-
-    \CommentTok{\# Ground{-}truth preference parameter (unknown to the model)}
-\NormalTok{    theta\_true }\OperatorTok{=}\NormalTok{ torch.ones(d)}
-
-    \CommentTok{\# Generate initial observations}
-\NormalTok{    n\_initial }\OperatorTok{=} \DecValTok{5}
-\NormalTok{    indices }\OperatorTok{=}\NormalTok{ torch.randint(}\DecValTok{0}\NormalTok{, K, (n\_initial, }\DecValTok{2}\NormalTok{))}
-\NormalTok{    train\_X\_logei }\OperatorTok{=}\NormalTok{ actions[indices]  }\CommentTok{\# Shape: [n\_initial, 2, d]}
-\NormalTok{    train\_X\_qeubo }\OperatorTok{=}\NormalTok{ train\_X\_logei.clone()}
-\NormalTok{    train\_X\_random }\OperatorTok{=}\NormalTok{ train\_X\_logei.clone()}
-\NormalTok{    train\_Y\_logei }\OperatorTok{=}\NormalTok{ torch.tensor([[}\FloatTok{0.0} \ControlFlowTok{if}\NormalTok{ simulate\_comparison(x1, x2).equal(x1) }\ControlFlowTok{else} \FloatTok{1.0}\NormalTok{]}
-                                  \ControlFlowTok{for}\NormalTok{ x1, x2 }\KeywordTok{in}\NormalTok{ train\_X\_logei])}
-\NormalTok{    train\_Y\_qeubo }\OperatorTok{=}\NormalTok{ train\_Y\_logei.clone()}
-\NormalTok{    train\_Y\_random }\OperatorTok{=}\NormalTok{ train\_Y\_logei.clone()}
-
-    \CommentTok{\# Optimization loop}
-\NormalTok{    cumulative\_regret\_logei }\OperatorTok{=}\NormalTok{ []}
-\NormalTok{    cumulative\_regret\_qeubo }\OperatorTok{=}\NormalTok{ []}
-\NormalTok{    cumulative\_regret\_random }\OperatorTok{=}\NormalTok{ []}
-
-    \ControlFlowTok{for}\NormalTok{ t }\KeywordTok{in}\NormalTok{ tqdm(}\BuiltInTok{range}\NormalTok{(T)):}
-        \CommentTok{\# Fit GP models}
-\NormalTok{        gp\_logei }\OperatorTok{=}\NormalTok{ fit\_variational\_gp(train\_X\_logei, train\_Y\_logei)}
-\NormalTok{        gp\_qeubo }\OperatorTok{=}\NormalTok{ fit\_variational\_gp(train\_X\_qeubo, train\_Y\_qeubo)}
-\NormalTok{        gp\_random }\OperatorTok{=}\NormalTok{ fit\_variational\_gp(train\_X\_random, train\_Y\_random)}
-
-        \CommentTok{\# Define acquisition functions}
-\NormalTok{        qLogEI, \_ }\OperatorTok{=}\NormalTok{ get\_acquisition\_functions(gp\_logei)}
-\NormalTok{        \_, qEUBO }\OperatorTok{=}\NormalTok{ get\_acquisition\_functions(gp\_qeubo)}
-
-        \CommentTok{\# Evaluate acquisition functions}
-\NormalTok{        pairs\_logei, acq\_values\_logei }\OperatorTok{=}\NormalTok{ evaluate\_acquisition\_on\_pairs(}
-\NormalTok{            qLogEI, actions)}
-\NormalTok{        pairs\_qeubo, acq\_values\_qeubo }\OperatorTok{=}\NormalTok{ evaluate\_acquisition\_on\_pairs(}
-\NormalTok{            qEUBO, actions)}
-
-        \CommentTok{\# Select pairs based on acquisition values}
-\NormalTok{        best\_pair\_idx\_logei }\OperatorTok{=}\NormalTok{ torch.argmax(acq\_values\_logei)}
-\NormalTok{        best\_pair\_idx\_qeubo }\OperatorTok{=}\NormalTok{ torch.argmax(acq\_values\_qeubo)}
-\NormalTok{        x1\_logei, x2\_logei }\OperatorTok{=}\NormalTok{ pairs\_logei[best\_pair\_idx\_logei]}
-\NormalTok{        x1\_qeubo, x2\_qeubo }\OperatorTok{=}\NormalTok{ pairs\_qeubo[best\_pair\_idx\_qeubo]}
-
-        \CommentTok{\# Random pair selection}
-\NormalTok{        random\_indices }\OperatorTok{=}\NormalTok{ torch.randint(}\DecValTok{0}\NormalTok{, K, (}\DecValTok{2}\NormalTok{,))}
-\NormalTok{        x1\_random }\OperatorTok{=}\NormalTok{ actions[random\_indices[}\DecValTok{0}\NormalTok{]]}
-\NormalTok{        x2\_random }\OperatorTok{=}\NormalTok{ actions[random\_indices[}\DecValTok{1}\NormalTok{]]}
-
-        \CommentTok{\# Simulate comparisons}
-\NormalTok{        selected\_logei }\OperatorTok{=}\NormalTok{ simulate\_comparison(x1\_logei, x2\_logei)}
-\NormalTok{        selected\_qeubo }\OperatorTok{=}\NormalTok{ simulate\_comparison(x1\_qeubo, x2\_qeubo)}
-\NormalTok{        selected\_random }\OperatorTok{=}\NormalTok{ simulate\_comparison(x1\_random, x2\_random)}
-
-        \CommentTok{\# Update training data}
-\NormalTok{        train\_X\_logei }\OperatorTok{=}\NormalTok{ torch.cat(}
-\NormalTok{            [train\_X\_logei, torch.stack([x1\_logei, x2\_logei]).unsqueeze(}\DecValTok{0}\NormalTok{)])}
-\NormalTok{        train\_Y\_logei }\OperatorTok{=}\NormalTok{ torch.cat([train\_Y\_logei, torch.tensor(}
-\NormalTok{            [[}\FloatTok{0.0} \ControlFlowTok{if}\NormalTok{ selected\_logei.equal(x1\_logei) }\ControlFlowTok{else} \FloatTok{1.0}\NormalTok{]])])}
-\NormalTok{        train\_X\_qeubo }\OperatorTok{=}\NormalTok{ torch.cat(}
-\NormalTok{            [train\_X\_qeubo, torch.stack([x1\_qeubo, x2\_qeubo]).unsqueeze(}\DecValTok{0}\NormalTok{)])}
-\NormalTok{        train\_Y\_qeubo }\OperatorTok{=}\NormalTok{ torch.cat([train\_Y\_qeubo, torch.tensor(}
-\NormalTok{            [[}\FloatTok{0.0} \ControlFlowTok{if}\NormalTok{ selected\_qeubo.equal(x1\_qeubo) }\ControlFlowTok{else} \FloatTok{1.0}\NormalTok{]])])}
-\NormalTok{        train\_X\_random }\OperatorTok{=}\NormalTok{ torch.cat(}
-\NormalTok{            [train\_X\_random, torch.stack([x1\_random, x2\_random]).unsqueeze(}\DecValTok{0}\NormalTok{)])}
-\NormalTok{        train\_Y\_random }\OperatorTok{=}\NormalTok{ torch.cat([train\_Y\_random, torch.tensor(}
-\NormalTok{            [[}\FloatTok{0.0} \ControlFlowTok{if}\NormalTok{ selected\_random.equal(x1\_random) }\ControlFlowTok{else} \FloatTok{1.0}\NormalTok{]])])}
-
-        \CommentTok{\# Calculate regrets}
-\NormalTok{        regret\_logei }\OperatorTok{=}\NormalTok{ calculate\_regret\_from\_gp(gp\_logei, actions)}
-\NormalTok{        regret\_qeubo }\OperatorTok{=}\NormalTok{ calculate\_regret\_from\_gp(gp\_qeubo, actions)}
-\NormalTok{        regret\_random }\OperatorTok{=}\NormalTok{ calculate\_regret\_from\_gp(gp\_random, actions)}
-
-        \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Regret LogEI: }\SpecialCharTok{\{}\NormalTok{regret\_logei}\SpecialCharTok{\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-        \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Regret qEUBO: }\SpecialCharTok{\{}\NormalTok{regret\_qeubo}\SpecialCharTok{\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-        \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f\textquotesingle{}Regret Random: }\SpecialCharTok{\{}\NormalTok{regret\_random}\SpecialCharTok{\}}\SpecialStringTok{\textquotesingle{}}\NormalTok{)}
-
-\NormalTok{        cumulative\_regret\_logei.append(regret\_logei)}
-\NormalTok{        cumulative\_regret\_qeubo.append(regret\_qeubo)}
-\NormalTok{        cumulative\_regret\_random.append(regret\_random)}
-
-    \CommentTok{\# Plot cumulative regret}
-\NormalTok{    plt.plot(torch.cumsum(torch.tensor(}
-\NormalTok{        cumulative\_regret\_logei), dim}\OperatorTok{=}\DecValTok{0}\NormalTok{), label}\OperatorTok{=}\StringTok{\textquotesingle{}qLogEI\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.plot(torch.cumsum(torch.tensor(}
-\NormalTok{        cumulative\_regret\_qeubo), dim}\OperatorTok{=}\DecValTok{0}\NormalTok{), label}\OperatorTok{=}\StringTok{\textquotesingle{}qEUBO\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.plot(torch.cumsum(torch.tensor(}
-\NormalTok{        cumulative\_regret\_random), dim}\OperatorTok{=}\DecValTok{0}\NormalTok{), label}\OperatorTok{=}\StringTok{\textquotesingle{}Random\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.xlabel(}\StringTok{\textquotesingle{}Round\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.ylabel(}\StringTok{\textquotesingle{}Cumulative Regret\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.legend()}
-\NormalTok{    plt.title(}\StringTok{\textquotesingle{}Comparison of qLogEI, qEUBO, and Random Sampling\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\subsection*{Question 3: Multi-Objective Thompson Sampling in Linear
-Contextual Bandits (30
-points)}\label{sec-question-3-multi-objective-thompson-sampling-in-linear-contextual-bandits-30-points}
-\addcontentsline{toc}{subsection}{Question 3: Multi-Objective Thompson
-Sampling in Linear Contextual Bandits (30 points)}
-
-Thompson Sampling (TS) is commonly used for reward maximization in
-multi-armed bandit problems, optimizing for the expected reward across
-actions. However, in many real-world scenarios, other objectives, such
-as the interpretability or reusability of learned parameters, are
-equally valuable. This is particularly relevant when modeling unknown
-reward functions with parameters that might offer insights or inform
-future experiments. A purely reward-focused Thompson Sampling approach
-may result in increased false positive rates due to aggressive
-exploitation, whereas a pure exploration approach---such as those used
-in active learning---might better suit the goal of parameter learning.
-
-Assume a multi-objective setting where the goal is to not only maximize
-the cumulative reward but also to accurately learn the parameters of the
-reward function itself in a linear contextual bandit setting. Let each
-arm be represented by a feature vector \(x \in \mathbb{R}^d\), with
-rewards generated by an unknown linear model
-\(r = \theta^\top x + \epsilon\), where
-\(\epsilon \sim \mathcal{N}(0, \sigma^2)\). Given these considerations,
-answer the following:
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\item
-  \textbf{Theoretical Analysis of Multi-Objective Thompson Sampling (8
-  points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Written, 3 points).} Define a cumulative regret objective
-    that balances maximizing the expected reward and minimizing the
-    parameter estimation error \(\|\theta - \hat{\theta}\|_2\). Explain
-    how this multi-objective regret differs from the single-objective
-    regret typically used in linear bandits.
-  \item
-    \textbf{(Written, 3 points).} Derive the expected regret bounds for
-    Thompson Sampling in the single-objective case and describe the
-    additional challenges posed when extending these bounds to the
-    multi-objective case.
-  \item
-    \textbf{(Written, 2 points).} Suppose you were to use a pure
-    exploration approach for parameter estimation. Provide an upper
-    bound for the parameter error \(\|\theta - \hat{\theta}\|_2\) over
-    \(T\) rounds.
-  \end{enumerate}
-\item
-  \textbf{Acquisition Strategies for Multi-Objective Optimization (8
-  points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Written, 3 points).} Explain how to adapt the Upper
-    Confidence Bound (UCB) acquisition function to balance exploration
-    and exploitation for parameter learning alongside reward
-    maximization. Discuss the effect of tuning parameters on
-    exploration.
-  \item
-    \textbf{(Written + Coding, 3 points).} Implement a Thompson Sampling
-    acquisition strategy that alternates between reward maximization and
-    parameter-focused exploration using a multi-objective UCB. Implement
-    the \texttt{select\_arm} function of
-    \texttt{multi\_obj\_thompson/bandit.py}.
-  \item
-    \textbf{(Written, 2 points).} Describe the impact of this
-    alternating acquisition strategy on false positive rates and regret
-    in comparison to standard Thompson Sampling.
-  \end{enumerate}
-\item
-  \textbf{Posterior Distribution Analysis (8 points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Written, 2 points).} Given a prior distribution for
-    \(\theta\) and observed rewards, derive the posterior distribution
-    of \(\theta\) at each time step in the context of multi-objective
-    Thompson Sampling. Explain any assumptions needed for computational
-    tractability.
-  \item
-    \textbf{(Coding, 4 points).} Implement a Bayesian update for the
-    posterior of \(\theta\) following each observation. Do this in
-    \texttt{update}.
-  \item
-    \textbf{(Written, 2 points).} Explain how this posterior update
-    accommodates both exploration for parameter estimation and
-    exploitation for reward maximization.
-  \end{enumerate}
-\item
-  \textbf{Empirical Evaluation (6 points)}
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Coding, 3 points).} Design and conduct an experiment
-    comparing standard Thompson Sampling, pure exploration, and your
-    multi-objective TS algorithm. Run this experiment on a synthetic
-    dataset with \(d = 5\) features and \(K = 10\) arms by executing
-    \texttt{run.py}.
-  \item
-    \textbf{(Written, 3 points).} Report and interpret the results by
-    comparing the cumulative reward and parameter estimation error
-    across methods. Provide insights on the trade-offs observed and any
-    patterns in the rate of regret reduction.
-  \end{enumerate}
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-
-\KeywordTok{class}\NormalTok{ MultiObjectiveThompsonSamplingBandit:}
-    \CommentTok{"""}
-\CommentTok{    A class that implements a multi{-}objective Thompson sampling bandit.}
-
-\CommentTok{    Attributes:}
-\CommentTok{    {-} d (int): Dimension of the feature vector x.}
-\CommentTok{    {-} lambda\_prior (float): Regularization parameter for the prior covariance matrix.}
-\CommentTok{    {-} sigma\_noise (float): Standard deviation of the noise in rewards.}
-\CommentTok{    {-} mu (np.array): Prior mean of theta (initialized as a zero vector).}
-\CommentTok{    {-} Sigma (np.array): Prior covariance of theta (initialized as a scaled identity matrix).}
-\CommentTok{    """}
-
-    \KeywordTok{def} \FunctionTok{\_\_init\_\_}\NormalTok{(}\VariableTok{self}\NormalTok{, d, lambda\_prior}\OperatorTok{=}\FloatTok{1.0}\NormalTok{, sigma\_noise}\OperatorTok{=}\FloatTok{1.0}\NormalTok{):}
-        \CommentTok{"""}
-\CommentTok{        Initializes the bandit with a prior on theta and noise variance.}
-
-\CommentTok{        Parameters:}
-\CommentTok{        {-} d (int): Dimension of the feature vector x.}
-\CommentTok{        {-} lambda\_prior (float): Regularization parameter for the prior covariance matrix.}
-\CommentTok{        {-} sigma\_noise (float): Standard deviation of the noise in rewards.}
-\CommentTok{        """}
-        \VariableTok{self}\NormalTok{.d }\OperatorTok{=}\NormalTok{ d}
-        \VariableTok{self}\NormalTok{.lambda\_prior }\OperatorTok{=}\NormalTok{ lambda\_prior}
-        \VariableTok{self}\NormalTok{.sigma\_noise }\OperatorTok{=}\NormalTok{ sigma\_noise}
-
-        \CommentTok{\# Initialize prior mean and covariance matrix}
-        \VariableTok{self}\NormalTok{.mu }\OperatorTok{=}\NormalTok{ np.zeros(d)  }\CommentTok{\# Prior mean of theta}
-        \VariableTok{self}\NormalTok{.Sigma }\OperatorTok{=}\NormalTok{ lambda\_prior }\OperatorTok{*}\NormalTok{ np.eye(d)  }\CommentTok{\# Prior covariance of theta}
-
-    \KeywordTok{def}\NormalTok{ select\_arm(}\VariableTok{self}\NormalTok{, arms, mode):}
-        \CommentTok{"""}
-\CommentTok{        Selects an arm (action) based on the specified mode.}
-
-\CommentTok{        Parameters:}
-\CommentTok{        {-} arms (np.array): A 2D NumPy array of shape (K, d) representing the feature vectors of K arms.}
-\CommentTok{        {-} mode (str): Selection mode, either \textquotesingle{}exploit\textquotesingle{} (reward maximization) or \textquotesingle{}explore\textquotesingle{} (focus on reducing uncertainty in theta).}
-
-\CommentTok{        Returns:}
-\CommentTok{        {-} selected\_arm (np.array): The feature vector of the selected arm.}
-\CommentTok{        {-} arm\_index (int): The index of the selected arm.}
-\CommentTok{        """}
-        \CommentTok{\# Sample a belief for theta from the current posterior}
-\NormalTok{        theta\_sample }\OperatorTok{=}\NormalTok{ np.random.multivariate\_normal(}\VariableTok{self}\NormalTok{.mu, }\VariableTok{self}\NormalTok{.Sigma)}
-
-        \CommentTok{\# Generate reward noise for the arms}
-\NormalTok{        reward\_noise }\OperatorTok{=}\NormalTok{ np.random.normal(}\DecValTok{0}\NormalTok{, }\VariableTok{self}\NormalTok{.sigma\_noise, size}\OperatorTok{=}\BuiltInTok{len}\NormalTok{(arms))}
-
-        \ControlFlowTok{if}\NormalTok{ mode }\OperatorTok{==} \StringTok{\textquotesingle{}exploit\textquotesingle{}}\NormalTok{:}
-            \CommentTok{\# YOUR CODE HERE (\textasciitilde{}2 lines)}
-                \CommentTok{\# 1. Compute expected rewards with noise}
-                \CommentTok{\# 2. Select the arm with the highest expected reward}
-                \ControlFlowTok{pass} 
-            \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-        \ControlFlowTok{elif}\NormalTok{ mode }\OperatorTok{==} \StringTok{\textquotesingle{}explore\textquotesingle{}}\NormalTok{:}
-            \CommentTok{\# Compute posterior covariance norms to evaluate exploration potential for each arm}
-\NormalTok{            posterior\_cov\_norms }\OperatorTok{=}\NormalTok{ []}
-            \ControlFlowTok{for}\NormalTok{ x }\KeywordTok{in}\NormalTok{ arms:}
-\NormalTok{                x }\OperatorTok{=}\NormalTok{ x.reshape(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{)  }\CommentTok{\# Reshape to column vector}
-
-                \CommentTok{\# Find posterior covariance hypothetically and get its norm}
-                \CommentTok{\# YOUR CODE HERE (\textasciitilde{}4 lines)}
-                \ControlFlowTok{pass}
-                \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-\NormalTok{                posterior\_cov\_norms.append(norm)}
-
-            \CommentTok{\# Select the arm that minimizes the posterior covariance norm}
-\NormalTok{            arm\_index }\OperatorTok{=}\NormalTok{ np.argmin(posterior\_cov\_norms)}
-
-        \ControlFlowTok{else}\NormalTok{:}
-            \ControlFlowTok{raise} \PreprocessorTok{ValueError}\NormalTok{(}\StringTok{"Mode must be either \textquotesingle{}exploit\textquotesingle{} or \textquotesingle{}explore\textquotesingle{}."}\NormalTok{)}
-
-        \ControlFlowTok{return}\NormalTok{ arms[arm\_index], arm\_index, posterior\_cov\_norms }\ControlFlowTok{if}\NormalTok{ mode }\OperatorTok{==} \StringTok{\textquotesingle{}explore\textquotesingle{}} \ControlFlowTok{else} \VariableTok{None}
-
-    \KeywordTok{def}\NormalTok{ update(}\VariableTok{self}\NormalTok{, x\_t, r\_t):}
-        \CommentTok{"""}
-\CommentTok{        Updates the posterior distribution of theta given a new observation.}
-
-\CommentTok{        Parameters:}
-\CommentTok{        {-} x\_t (np.array): Feature vector of the selected arm at time t.}
-\CommentTok{        {-} r\_t (float): Observed reward at time t.}
-\CommentTok{        """}
-\NormalTok{        x\_t }\OperatorTok{=}\NormalTok{ x\_t.reshape(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{)  }\CommentTok{\# Reshape to column vector}
-
-        \CommentTok{\# YOUR CODE HERE (\textasciitilde{}4 lines)}
-        \CommentTok{\# Obtain mu\_new and Sigma\_new of theta posterior. This requires doing some math!}
-        \ControlFlowTok{pass}
-        \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-        \CommentTok{\# Update internal state}
-        \VariableTok{self}\NormalTok{.mu }\OperatorTok{=}\NormalTok{ mu\_new.flatten()}
-        \VariableTok{self}\NormalTok{.Sigma }\OperatorTok{=}\NormalTok{ Sigma\_new}
-
-\ControlFlowTok{if} \VariableTok{\_\_name\_\_} \OperatorTok{==} \StringTok{\textquotesingle{}\_\_main\_\_\textquotesingle{}}\NormalTok{:}
-    \CommentTok{\# Number of features (dimension) and arms}
-\NormalTok{    d }\OperatorTok{=} \DecValTok{5}  \CommentTok{\# Feature dimension}
-\NormalTok{    K }\OperatorTok{=} \DecValTok{10}  \CommentTok{\# Number of arms}
-
-    \CommentTok{\# Generate random arms (feature vectors)}
-\NormalTok{    np.random.seed(}\DecValTok{42}\NormalTok{)}
-\NormalTok{    arms }\OperatorTok{=}\NormalTok{ np.random.randn(K, d)}
-
-    \CommentTok{\# True theta (unknown to the bandit)}
-\NormalTok{    theta\_true }\OperatorTok{=}\NormalTok{ np.random.randn(d)}
-
-    \CommentTok{\# Initialize the bandit}
-\NormalTok{    bandit }\OperatorTok{=}\NormalTok{ MultiObjectiveThompsonSamplingBandit(d)}
-
-    \CommentTok{\# Number of rounds}
-\NormalTok{    T }\OperatorTok{=} \DecValTok{1000}
-
-    \CommentTok{\# Lists to store results}
-\NormalTok{    regrets }\OperatorTok{=}\NormalTok{ []  }\CommentTok{\# Store the regret at each round}
-\NormalTok{    theta\_errors }\OperatorTok{=}\NormalTok{ []  }\CommentTok{\# Store the error between estimated and true theta}
-
-    \CommentTok{\# Simulation loop}
-    \ControlFlowTok{for}\NormalTok{ t }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(T):}
-        \CommentTok{\# Alternate between \textquotesingle{}exploit\textquotesingle{} and \textquotesingle{}explore\textquotesingle{} modes}
-\NormalTok{        mode }\OperatorTok{=} \StringTok{\textquotesingle{}exploit\textquotesingle{}} \ControlFlowTok{if}\NormalTok{ t }\OperatorTok{\%} \DecValTok{2} \OperatorTok{==} \DecValTok{0} \ControlFlowTok{else} \StringTok{\textquotesingle{}explore\textquotesingle{}}
-
-        \CommentTok{\# Select an arm based on the current mode}
-\NormalTok{        x\_t, arm\_index, \_ }\OperatorTok{=}\NormalTok{ bandit.select\_arm(arms, mode}\OperatorTok{=}\NormalTok{mode)}
-
-        \CommentTok{\# Observe the reward with noise}
-\NormalTok{        r\_t }\OperatorTok{=}\NormalTok{ theta\_true }\OperatorTok{@}\NormalTok{ x\_t }\OperatorTok{+}\NormalTok{ np.random.normal(}\DecValTok{0}\NormalTok{, bandit.sigma\_noise)}
-
-        \CommentTok{\# Update the bandit with the new observation}
-\NormalTok{        bandit.update(x\_t, r\_t)}
-
-        \CommentTok{\# Compute regret (difference between optimal reward and received reward)}
-\NormalTok{        optimal\_reward }\OperatorTok{=}\NormalTok{ np.}\BuiltInTok{max}\NormalTok{(arms }\OperatorTok{@}\NormalTok{ theta\_true)  }\CommentTok{\# Best possible reward}
-\NormalTok{        regret }\OperatorTok{=}\NormalTok{ optimal\_reward }\OperatorTok{{-}}\NormalTok{ (theta\_true }\OperatorTok{@}\NormalTok{ x\_t)  }\CommentTok{\# Regret for this round}
-\NormalTok{        regrets.append(regret)}
-
-        \CommentTok{\# Compute parameter estimation error (distance between true and estimated theta)}
-\NormalTok{        theta\_error }\OperatorTok{=}\NormalTok{ np.linalg.norm(theta\_true }\OperatorTok{{-}}\NormalTok{ bandit.mu)}
-\NormalTok{        theta\_errors.append(theta\_error)}
-
-    \CommentTok{\# Final estimates after all rounds}
-\NormalTok{    mu\_estimate, Sigma\_estimate }\OperatorTok{=}\NormalTok{ bandit.mu, bandit.Sigma}
-
-    \CommentTok{\# Print results}
-    \BuiltInTok{print}\NormalTok{(}\StringTok{"Estimated theta:"}\NormalTok{, mu\_estimate)}
-    \BuiltInTok{print}\NormalTok{(}\StringTok{"True theta:"}\NormalTok{, theta\_true)}
-    \BuiltInTok{print}\NormalTok{(}\StringTok{"Cumulative regret:"}\NormalTok{, np.}\BuiltInTok{sum}\NormalTok{(regrets))}
-    \BuiltInTok{print}\NormalTok{(}\StringTok{"Final covariance norm:"}\NormalTok{, np.linalg.norm(Sigma\_estimate))}
-
-    \CommentTok{\# Visualization of results}
-
-    \CommentTok{\# Plot cumulative regret over time}
-\NormalTok{    plt.figure()}
-\NormalTok{    plt.plot(np.cumsum(regrets))}
-\NormalTok{    plt.title(}\StringTok{\textquotesingle{}Cumulative Regret over Time\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.xlabel(}\StringTok{\textquotesingle{}Rounds\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.ylabel(}\StringTok{\textquotesingle{}Cumulative Regret\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.show()}
-
-    \CommentTok{\# Plot estimation error over time}
-\NormalTok{    plt.figure()}
-\NormalTok{    plt.plot(theta\_errors)}
-\NormalTok{    plt.title(}\StringTok{\textquotesingle{}Theta Estimation Error over Time\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.xlabel(}\StringTok{\textquotesingle{}Rounds\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.ylabel(}\StringTok{\textquotesingle{}Estimation Error (L2 Norm)\textquotesingle{}}\NormalTok{)}
-\NormalTok{    plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\subsection*{Question 4: Mechanism Design in Preference Learning (30
-points)}\label{sec-question-4-mechanism-design-in-preference-learning-30-points}
-\addcontentsline{toc}{subsection}{Question 4: Mechanism Design in
-Preference Learning (30 points)}
-
-In mechanism design, a central challenge is optimizing resource
-allocation while accounting for user preferences, which may be private
-and complex. This problem can be addressed using learning techniques to
-infer user preferences, thereby enabling the designer to make informed
-pricing and allocation decisions. Consider a scenario where a designer
-allocates a divisible resource \(B\) among \(N\) players, each with a
-private, continuous, concave utility function \(U_i(x_i)\) over their
-allocated share \(x_i\), where \(x = [x_1, x_2, \dots, x_N]\) denotes
-the allocation vector. The designer aims to maximize social welfare
-while ensuring full resource utilization.
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\item
-  \textbf{Modeling User Preferences (7 points)}:
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Written, 1 point)} Provide a realistic scenario in which we
-    estimate a utility function through eliciting preferences in the
-    context of the mechanism.
-  \item
-    \textbf{(Written, 3 point)} Explain how elliptical slice sampling
-    can be used with a GP in order to estimate a utility function
-    through preferences.
-  \item
-    \textbf{(Written, 3 point)} How can the elliptical slice posterior
-    samples be used to obtain the mean of the posterior predictive for
-    test points? (Hint: Read page \(44\) of
-    \url{https://gaussianprocess.org/gpml/chapters/RW.pdf}.)
-  \end{enumerate}
-\item
-  \textbf{Optimization with Learned Preferences (10 points)}:
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Written, 3 point)} Formulate the designer's optimization
-    problem, maximizing social welfare \(\sum_{i=1}^N U_i(x_i)\) subject
-    to the constraint \(\sum_{i=1}^N x_i \leq B\).
-  \item
-    \textbf{(Written, 4 point)} Using the Lagrange multiplier method,
-    derive the conditions that must be met for optimal allocation and
-    pricing.
-  \item
-    \textbf{(Written, 3 point)} As an alternative approach to Lagrange
-    multipliers, explain how projected gradient descent (PGD) can be
-    used to solve the designer's optimization problem.
-  \end{enumerate}
-\item
-  \textbf{Benchmarking Learning and Allocation Efficiency (13 points)}:
-
-  \begin{enumerate}
-  \def\labelenumii{(\roman{enumii})}
-  \item
-    \textbf{(Coding, 3 point)} Implement \texttt{preference\_loglik} in
-    the file \texttt{gp\_mechanism/preference\_gp.py}.
-  \item
-    \textbf{(Coding, 3 point)} Implement \texttt{predictive\_function}.
-  \item
-    \textbf{(Coding, 3 point)} Implement \texttt{optimize\_allocations}
-    inside \texttt{gp\_mechanism/run.py}.
-  \item
-    \textbf{(Written, 4 point)} Compare GP-approximated utility
-    allocations through PGD, exact utility allocations through PGD, and
-    the optimal Lagrange-based allocation done by hand with each other
-    for your choice of utility functions \(U_i\). Make sure your
-    utilities are continuous and concave.
-  \end{enumerate}
-\end{enumerate}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{from}\NormalTok{ typing }\ImportTok{import}\NormalTok{ Callable}
-
-\ImportTok{import}\NormalTok{ numpy }\ImportTok{as}\NormalTok{ np}
-\ImportTok{import}\NormalTok{ matplotlib.pyplot }\ImportTok{as}\NormalTok{ plt}
-\ImportTok{import}\NormalTok{ torch  }\CommentTok{\# Import PyTorch}
-\ImportTok{from}\NormalTok{ tqdm }\ImportTok{import}\NormalTok{ tqdm}
-
-
-\KeywordTok{class}\NormalTok{ EllipticalSliceSampler:}
-    \KeywordTok{def} \FunctionTok{\_\_init\_\_}\NormalTok{(}\VariableTok{self}\NormalTok{,}
-\NormalTok{                 prior\_cov: np.ndarray,}
-\NormalTok{                 loglik: Callable):}
-        \CommentTok{"""}
-\CommentTok{        Initializes the Elliptical Slice Sampler.}
-
-\CommentTok{        Args:}
-\CommentTok{        {-} prior\_cov (np.ndarray): Prior covariance matrix.}
-\CommentTok{        {-} loglik (Callable): Log{-}likelihood function.}
-\CommentTok{        """}
-        \VariableTok{self}\NormalTok{.prior\_cov }\OperatorTok{=}\NormalTok{ prior\_cov}
-        \VariableTok{self}\NormalTok{.loglik }\OperatorTok{=}\NormalTok{ loglik}
-
-        \VariableTok{self}\NormalTok{.\_n }\OperatorTok{=}\NormalTok{ prior\_cov.shape[}\DecValTok{0}\NormalTok{]  }\CommentTok{\# Dimensionality of the space}
-        \CommentTok{\# Cache Cholesky decomposition}
-        \VariableTok{self}\NormalTok{.\_chol }\OperatorTok{=}\NormalTok{ np.linalg.cholesky(prior\_cov)}
-
-        \CommentTok{\# Initialize state and cache previous states}
-        \VariableTok{self}\NormalTok{.\_state\_f }\OperatorTok{=} \VariableTok{self}\NormalTok{.\_chol }\OperatorTok{@}\NormalTok{ np.random.randn(}\VariableTok{self}\NormalTok{.\_n)}
-
-    \KeywordTok{def}\NormalTok{ \_indiv\_sample(}\VariableTok{self}\NormalTok{):}
-        \CommentTok{"""}
-\CommentTok{        Main algorithm for generating an individual sample using Elliptical Slice Sampling.}
-\CommentTok{        """}
-\NormalTok{        f }\OperatorTok{=} \VariableTok{self}\NormalTok{.\_state\_f  }\CommentTok{\# Previous state}
-        \CommentTok{\# Sample from prior for the ellipse}
-\NormalTok{        nu }\OperatorTok{=} \VariableTok{self}\NormalTok{.\_chol }\OperatorTok{@}\NormalTok{ np.random.randn(}\VariableTok{self}\NormalTok{.\_n)}
-\NormalTok{        log\_y }\OperatorTok{=} \VariableTok{self}\NormalTok{.loglik(f) }\OperatorTok{+}\NormalTok{ np.log(np.random.uniform()}
-\NormalTok{                                        )  }\CommentTok{\# Log{-}likelihood threshold}
-
-\NormalTok{        theta }\OperatorTok{=}\NormalTok{ np.random.uniform(}\FloatTok{0.}\NormalTok{, }\DecValTok{2} \OperatorTok{*}\NormalTok{ np.pi)  }\CommentTok{\# Initial proposal angle}
-\NormalTok{        theta\_min, theta\_max }\OperatorTok{=}\NormalTok{ theta }\OperatorTok{{-}} \DecValTok{2} \OperatorTok{*}\NormalTok{ np.pi, theta  }\CommentTok{\# Define bracketing interval}
-
-        \CommentTok{\# Main loop: Accept sample if it meets log{-}likelihood threshold; otherwise, shrink the bracket.}
-        \ControlFlowTok{while} \VariableTok{True}\NormalTok{:}
-            \CommentTok{\# YOUR CODE HERE (\textasciitilde{}10 lines)}
-            \CommentTok{\# Generate a new sample point based on the current angle.}
-\NormalTok{            f\_prime }\OperatorTok{=}\NormalTok{ f }\OperatorTok{*}\NormalTok{ np.cos(theta) }\OperatorTok{+}\NormalTok{ nu }\OperatorTok{*}\NormalTok{ np.sin(theta)}
-
-            \CommentTok{\# Check if the proposed point meets the acceptance criterion.}
-            \ControlFlowTok{if} \VariableTok{self}\NormalTok{.loglik(f\_prime) }\OperatorTok{\textgreater{}}\NormalTok{ log\_y:  }\CommentTok{\# Accept the sample}
-                \VariableTok{self}\NormalTok{.\_state\_f }\OperatorTok{=}\NormalTok{ f\_prime}
-                \ControlFlowTok{return}
-
-            \ControlFlowTok{else}\NormalTok{:  }\CommentTok{\# If not accepted, adjust the bracket and select a new angle.}
-                \ControlFlowTok{if}\NormalTok{ theta }\OperatorTok{\textless{}} \DecValTok{0}\NormalTok{:}
-\NormalTok{                    theta\_min }\OperatorTok{=}\NormalTok{ theta}
-                \ControlFlowTok{else}\NormalTok{:}
-\NormalTok{                    theta\_max }\OperatorTok{=}\NormalTok{ theta}
-\NormalTok{                theta }\OperatorTok{=}\NormalTok{ np.random.uniform(theta\_min, theta\_max)}
-            \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-    \KeywordTok{def}\NormalTok{ sample(}\VariableTok{self}\NormalTok{,}
-\NormalTok{               n\_samples: }\BuiltInTok{int}\NormalTok{,}
-\NormalTok{               n\_burn: }\BuiltInTok{int} \OperatorTok{=} \DecValTok{500}\NormalTok{) }\OperatorTok{{-}\textgreater{}}\NormalTok{ np.ndarray:}
-        \CommentTok{"""}
-\CommentTok{        Generates samples using Elliptical Slice Sampling.}
-
-\CommentTok{        Args:}
-\CommentTok{        {-} n\_samples (int): Total number of samples to return.}
-\CommentTok{        {-} n\_burn (int): Number of initial samples to discard (burn{-}in).}
-
-\CommentTok{        Returns:}
-\CommentTok{        {-} np.ndarray: Array of samples after burn{-}in.}
-\CommentTok{        """}
-\NormalTok{        samples }\OperatorTok{=}\NormalTok{ []}
-        \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in}\NormalTok{ tqdm(}\BuiltInTok{range}\NormalTok{(n\_samples), desc}\OperatorTok{=}\StringTok{"Sampling"}\NormalTok{):}
-            \VariableTok{self}\NormalTok{.\_indiv\_sample()}
-            \ControlFlowTok{if}\NormalTok{ i }\OperatorTok{\textgreater{}}\NormalTok{ n\_burn:}
-                \CommentTok{\# Store sample post burn{-}in}
-\NormalTok{                samples.append(}\VariableTok{self}\NormalTok{.\_state\_f.copy())}
-
-        \ControlFlowTok{return}\NormalTok{ np.stack(samples)}
-
-
-\KeywordTok{def}\NormalTok{ squared\_exponential\_cov\_torch(X1, X2, length\_scale}\OperatorTok{=}\FloatTok{1.0}\NormalTok{, variance}\OperatorTok{=}\FloatTok{1.0}\NormalTok{):}
-    \CommentTok{"""}
-\CommentTok{    Squared Exponential (RBF) Covariance Function using PyTorch tensors.}
-
-\CommentTok{    Args:}
-\CommentTok{        X1 (torch.Tensor): First set of input points.}
-\CommentTok{        X2 (torch.Tensor): Second set of input points.}
-\CommentTok{        length\_scale (float): Length scale of the kernel.}
-\CommentTok{        variance (float): Variance (amplitude) of the kernel.}
-
-\CommentTok{    Returns:}
-\CommentTok{        torch.Tensor: Covariance matrix between X1 and X2.}
-\CommentTok{    """}
-\NormalTok{    X1 }\OperatorTok{=}\NormalTok{ X1.reshape(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{)}
-\NormalTok{    X2 }\OperatorTok{=}\NormalTok{ X2.reshape(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{)}
-\NormalTok{    dists }\OperatorTok{=}\NormalTok{ torch.}\BuiltInTok{sum}\NormalTok{(X1}\OperatorTok{**}\DecValTok{2}\NormalTok{, dim}\OperatorTok{=}\DecValTok{1}\NormalTok{).reshape(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{) }\OperatorTok{+} \OperatorTok{\textbackslash{}}
-\NormalTok{        torch.}\BuiltInTok{sum}\NormalTok{(X2}\OperatorTok{**}\DecValTok{2}\NormalTok{, dim}\OperatorTok{=}\DecValTok{1}\NormalTok{) }\OperatorTok{{-}} \DecValTok{2} \OperatorTok{*}\NormalTok{ torch.mm(X1, X2.T)}
-    \ControlFlowTok{return}\NormalTok{ variance }\OperatorTok{*}\NormalTok{ torch.exp(}\OperatorTok{{-}}\FloatTok{0.5} \OperatorTok{*}\NormalTok{ dists }\OperatorTok{/}\NormalTok{ length\_scale}\OperatorTok{**}\DecValTok{2}\NormalTok{)}
-
-
-\KeywordTok{def}\NormalTok{ generate\_preferences(x\_pairs, utility\_fn):}
-    \CommentTok{"""}
-\CommentTok{    Generates preference labels based on the Bradley{-}Terry model.}
-
-\CommentTok{    Args:}
-\CommentTok{        x\_pairs (np.array): Array of preference pairs, shape [n\_pairs, 2].}
-\CommentTok{        utility\_fn (function): Ground truth utility function.}
-
-\CommentTok{    Returns:}
-\CommentTok{        np.array: Preference labels (1 if the first item in the pair is preferred, 0 otherwise).}
-\CommentTok{    """}
-\NormalTok{    preference\_labels }\OperatorTok{=}\NormalTok{ []}
-    \ControlFlowTok{for}\NormalTok{ x1, x2 }\KeywordTok{in}\NormalTok{ x\_pairs:}
-\NormalTok{        u1, u2 }\OperatorTok{=}\NormalTok{ utility\_fn(x1), utility\_fn(x2)}
-\NormalTok{        prob }\OperatorTok{=}\NormalTok{ np.exp(u1) }\OperatorTok{/}\NormalTok{ (np.exp(u1) }\OperatorTok{+}\NormalTok{ np.exp(u2))}
-\NormalTok{        preference\_labels.append(}\DecValTok{1} \ControlFlowTok{if}\NormalTok{ np.random.rand() }\OperatorTok{\textless{}}\NormalTok{ prob }\ControlFlowTok{else} \DecValTok{0}\NormalTok{)}
-    \ControlFlowTok{return}\NormalTok{ np.array(preference\_labels)}
-
-
-\KeywordTok{def}\NormalTok{ create\_predictive\_function(ground\_truth\_utility, num\_pairs}\OperatorTok{=}\DecValTok{3000}\NormalTok{, n\_samples}\OperatorTok{=}\DecValTok{100}\NormalTok{, n\_burn}\OperatorTok{=}\DecValTok{50}\NormalTok{, length\_scale}\OperatorTok{=}\FloatTok{2.0}\NormalTok{, variance}\OperatorTok{=}\FloatTok{0.5}\NormalTok{):}
-    \CommentTok{"""}
-\CommentTok{    Creates a predictive function to compute the posterior predictive mean of a Gaussian Process.}
-
-\CommentTok{    Args:}
-\CommentTok{        ground\_truth\_utility (function): The ground truth utility function for generating preferences.}
-\CommentTok{        num\_pairs (int): Number of random preference pairs to generate.}
-\CommentTok{        n\_samples (int): Number of samples for Elliptical Slice Sampling.}
-\CommentTok{        n\_burn (int): Number of burn{-}in samples for Elliptical Slice Sampling.}
-\CommentTok{        length\_scale (float): Length scale for the Squared Exponential Kernel.}
-\CommentTok{        variance (float): Variance (amplitude) of the Squared Exponential Kernel.}
-
-\CommentTok{    Returns:}
-\CommentTok{        function: A predictive function that computes the posterior predictive mean.}
-\CommentTok{    """}
-    \CommentTok{\# Generate random preference pairs}
-\NormalTok{    np.random.seed(}\DecValTok{42}\NormalTok{)}
-\NormalTok{    x\_pairs }\OperatorTok{=}\NormalTok{ np.random.uniform(}\DecValTok{0}\NormalTok{, }\DecValTok{10}\NormalTok{, size}\OperatorTok{=}\NormalTok{(num\_pairs, }\DecValTok{2}\NormalTok{))}
-\NormalTok{    X\_flat }\OperatorTok{=}\NormalTok{ x\_pairs.flatten()}
-
-    \CommentTok{\# Generate preference labels}
-\NormalTok{    preference\_labels }\OperatorTok{=}\NormalTok{ generate\_preferences(x\_pairs, ground\_truth\_utility)}
-
-    \CommentTok{\# Convert X\_flat to PyTorch tensor}
-\NormalTok{    X\_flat\_torch }\OperatorTok{=}\NormalTok{ torch.tensor(X\_flat, dtype}\OperatorTok{=}\NormalTok{torch.float32)}
-
-    \CommentTok{\# GP Prior (using PyTorch)}
-\NormalTok{    K\_torch }\OperatorTok{=}\NormalTok{ squared\_exponential\_cov\_torch(}
-\NormalTok{        X\_flat\_torch, X\_flat\_torch, length\_scale}\OperatorTok{=}\NormalTok{length\_scale, variance}\OperatorTok{=}\NormalTok{variance)}
-    \CommentTok{\# Add jitter for numerical stability}
-\NormalTok{    K\_torch }\OperatorTok{+=} \FloatTok{1e{-}2} \OperatorTok{*}\NormalTok{ torch.eye(}\BuiltInTok{len}\NormalTok{(X\_flat\_torch))}
-\NormalTok{    prior\_cov }\OperatorTok{=}\NormalTok{ K\_torch.numpy()  }\CommentTok{\# Convert back to numpy for the sampler}
-
-    \CommentTok{\# Log{-}likelihood function}
-    \KeywordTok{def}\NormalTok{ preference\_loglik(f):}
-        \CommentTok{"""}
-\CommentTok{        Computes the log{-}likelihood of the preferences under the Bradley{-}Terry model.}
-
-\CommentTok{        Args:}
-\CommentTok{            f (np.array): Latent utility values.}
-
-\CommentTok{        Returns:}
-\CommentTok{            float: Log{-}likelihood of the given latent utilities.}
-\CommentTok{        """}
-\NormalTok{        log\_likelihood }\OperatorTok{=} \FloatTok{0.0}
-        \ControlFlowTok{for}\NormalTok{ (x1, x2), label }\KeywordTok{in} \BuiltInTok{zip}\NormalTok{(x\_pairs, preference\_labels):}
-\NormalTok{            idx1 }\OperatorTok{=}\NormalTok{ np.where(X\_flat }\OperatorTok{==}\NormalTok{ x1)[}\DecValTok{0}\NormalTok{][}\DecValTok{0}\NormalTok{]}
-\NormalTok{            idx2 }\OperatorTok{=}\NormalTok{ np.where(X\_flat }\OperatorTok{==}\NormalTok{ x2)[}\DecValTok{0}\NormalTok{][}\DecValTok{0}\NormalTok{]}
-\NormalTok{            f1, f2 }\OperatorTok{=}\NormalTok{ f[idx1], f[idx2]}
-
-            \CommentTok{\# YOUR CODE HERE (\textasciitilde{}4 lines)}
-            \CommentTok{\# Add datapoint log likelihood using Bradley{-}Terry model}
-            \ControlFlowTok{pass}
-            \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-        \ControlFlowTok{return}\NormalTok{ log\_likelihood}
-
-    \CommentTok{\# Elliptical Slice Sampling}
-\NormalTok{    sampler }\OperatorTok{=}\NormalTok{ EllipticalSliceSampler(}
-\NormalTok{        prior\_cov}\OperatorTok{=}\NormalTok{prior\_cov, loglik}\OperatorTok{=}\NormalTok{preference\_loglik)}
-\NormalTok{    posterior\_samples }\OperatorTok{=}\NormalTok{ sampler.sample(n\_samples}\OperatorTok{=}\NormalTok{n\_samples, n\_burn}\OperatorTok{=}\NormalTok{n\_burn)}
-\NormalTok{    posterior\_mean }\OperatorTok{=}\NormalTok{ np.mean(posterior\_samples, axis}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
-
-    \CommentTok{\# Convert posterior\_mean to PyTorch tensor}
-\NormalTok{    posterior\_mean\_torch }\OperatorTok{=}\NormalTok{ torch.tensor(posterior\_mean, dtype}\OperatorTok{=}\NormalTok{torch.float32)}
-
-    \CommentTok{\# Compute K\_inv using PyTorch}
-\NormalTok{    K\_inv\_torch }\OperatorTok{=}\NormalTok{ torch.inverse(K\_torch)}
-
-    \CommentTok{\# Define the predictive function}
-    \KeywordTok{def}\NormalTok{ predictive\_function(x):}
-        \CommentTok{"""}
-\CommentTok{        Predicts the utility for new input points.}
-
-\CommentTok{        Args:}
-\CommentTok{            x (torch.Tensor): Input points to predict utilities for.}
-
-\CommentTok{        Returns:}
-\CommentTok{            torch.Tensor: Predicted expected utilities.}
-\CommentTok{        """}
-        \ControlFlowTok{if} \KeywordTok{not}\NormalTok{ torch.is\_tensor(x):}
-            \ControlFlowTok{raise} \PreprocessorTok{ValueError}\NormalTok{(}\StringTok{\textquotesingle{}Predictive function must take in torch.tensor\textquotesingle{}}\NormalTok{)}
-\NormalTok{        x }\OperatorTok{=}\NormalTok{ x.reshape(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{)}
-\NormalTok{        X\_flat\_torch\_reshaped }\OperatorTok{=}\NormalTok{ X\_flat\_torch.reshape(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{)}
-
-        \CommentTok{\# YOUR CODE HERE (\textasciitilde{}2 lines)}
-        \CommentTok{\# Implement equation (3.21) on page 44 of https://gaussianprocess.org/gpml/chapters/RW.pdf}
-        \ControlFlowTok{pass}
-        \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-
-    \ControlFlowTok{return}\NormalTok{ predictive\_function}
-
-
-\ControlFlowTok{if} \VariableTok{\_\_name\_\_} \OperatorTok{==} \StringTok{"\_\_main\_\_"}\NormalTok{:}
-    \CommentTok{\# Ground truth utility function}
-    \KeywordTok{def}\NormalTok{ ground\_truth\_utility(x): }\ControlFlowTok{return}\NormalTok{ np.sin(x)}
-
-    \CommentTok{\# Create the predictive function}
-\NormalTok{    predictive\_fn }\OperatorTok{=}\NormalTok{ create\_predictive\_function(ground\_truth\_utility)}
-
-    \CommentTok{\# Test the predictive function}
-\NormalTok{    X\_test }\OperatorTok{=}\NormalTok{ torch.linspace(}\DecValTok{0}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{50}\NormalTok{).reshape(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{)  }\CommentTok{\# Test points}
-\NormalTok{    posterior\_means }\OperatorTok{=}\NormalTok{ predictive\_fn(}
-\NormalTok{        X\_test).detach().numpy()  }\CommentTok{\# Predicted posterior means}
-
-    \CommentTok{\# Ground truth utilities}
-\NormalTok{    ground\_truth\_utilities }\OperatorTok{=}\NormalTok{ ground\_truth\_utility(X\_test.numpy())}
-
-    \CommentTok{\# Plot results}
-\NormalTok{    plt.figure(figsize}\OperatorTok{=}\NormalTok{(}\DecValTok{10}\NormalTok{, }\DecValTok{6}\NormalTok{))}
-\NormalTok{    plt.title(}\StringTok{"GP Posterior Predictive Mean (Utility Approximation)"}\NormalTok{)}
-\NormalTok{    plt.plot(X\_test.numpy(), posterior\_means,}
-\NormalTok{             label}\OperatorTok{=}\StringTok{"Posterior Predictive Mean"}\NormalTok{, color}\OperatorTok{=}\StringTok{"red"}\NormalTok{)}
-\NormalTok{    plt.scatter(X\_test.numpy(), ground\_truth\_utilities,}
-\NormalTok{                label}\OperatorTok{=}\StringTok{"Ground Truth Utility"}\NormalTok{, color}\OperatorTok{=}\StringTok{"blue"}\NormalTok{, alpha}\OperatorTok{=}\FloatTok{0.5}\NormalTok{)}
-\NormalTok{    plt.xlabel(}\StringTok{"x"}\NormalTok{)}
-\NormalTok{    plt.ylabel(}\StringTok{"Utility"}\NormalTok{)}
-\NormalTok{    plt.legend()}
-\NormalTok{    plt.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\begin{tcolorbox}[colframe=.grey, title=\faCode \enspace Code]
-
-\begin{Shaded}
-\begin{Highlighting}[numbers=left,,]
-\ImportTok{import}\NormalTok{ torch}
-\ImportTok{from}\NormalTok{ preference\_gp }\ImportTok{import}\NormalTok{ create\_predictive\_function}
-
-\CommentTok{\# Feel free to play around with continuous, concave utility functions!}
-\KeywordTok{def}\NormalTok{ utility\_1(x):}
-    \CommentTok{"""}
-\CommentTok{    Utility function 1: 3 * log(x + 1)}
-\CommentTok{    Args:}
-\CommentTok{        x (torch.Tensor): Input tensor of allocations.}
-\CommentTok{    Returns:}
-\CommentTok{        torch.Tensor: Computed utility values.}
-\CommentTok{    """}
-    \ControlFlowTok{return} \DecValTok{3} \OperatorTok{*}\NormalTok{ torch.log(x }\OperatorTok{+} \DecValTok{1}\NormalTok{)}
-
-\KeywordTok{def}\NormalTok{ utility\_2(x):}
-    \CommentTok{"""}
-\CommentTok{    Utility function 2: 5 * log(x + 2)}
-\CommentTok{    Args:}
-\CommentTok{        x (torch.Tensor): Input tensor of allocations.}
-\CommentTok{    Returns:}
-\CommentTok{        torch.Tensor: Computed utility values.}
-\CommentTok{    """}
-    \ControlFlowTok{return} \DecValTok{5} \OperatorTok{*}\NormalTok{ torch.log(x }\OperatorTok{+} \DecValTok{2}\NormalTok{)}
-
-\KeywordTok{def}\NormalTok{ utility\_3(x):}
-    \CommentTok{"""}
-\CommentTok{    Utility function 3: 8 * log(x + 3)}
-\CommentTok{    Args:}
-\CommentTok{        x (torch.Tensor): Input tensor of allocations.}
-\CommentTok{    Returns:}
-\CommentTok{        torch.Tensor: Computed utility values.}
-\CommentTok{    """}
-    \ControlFlowTok{return} \DecValTok{8} \OperatorTok{*}\NormalTok{ torch.log(x }\OperatorTok{+} \DecValTok{3}\NormalTok{)}
-
-\KeywordTok{def}\NormalTok{ project(x, B):}
-    \CommentTok{"""}
-\CommentTok{    Projects the allocation vector \textasciigrave{}x\textasciigrave{} onto the feasible set \{z | sum(z) = B, z \textgreater{}= 0\}.}
-\CommentTok{    This ensures that the allocations respect the resource constraint.}
-
-\CommentTok{    Args:}
-\CommentTok{        x (torch.Tensor): Current allocation vector.}
-\CommentTok{        B (float): Total available resource.}
-
-\CommentTok{    Returns:}
-\CommentTok{        torch.Tensor: Projected allocation vector.}
-\CommentTok{    """}
-    \ControlFlowTok{with}\NormalTok{ torch.no\_grad():}
-        \CommentTok{\# Sort x in descending order}
-\NormalTok{        sorted\_x, \_ }\OperatorTok{=}\NormalTok{ torch.sort(x, descending}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
-        
-        \CommentTok{\# Compute cumulative sum adjusted by B}
-\NormalTok{        cumulative\_sum }\OperatorTok{=}\NormalTok{ torch.cumsum(sorted\_x, dim}\OperatorTok{=}\DecValTok{0}\NormalTok{) }\OperatorTok{{-}}\NormalTok{ B}
-        
-        \CommentTok{\# Find the threshold (water{-}filling algorithm)}
-\NormalTok{        rho }\OperatorTok{=}\NormalTok{ torch.where(sorted\_x }\OperatorTok{{-}}\NormalTok{ (cumulative\_sum }\OperatorTok{/}\NormalTok{ torch.arange(}\DecValTok{1}\NormalTok{, }\BuiltInTok{len}\NormalTok{(x) }\OperatorTok{+} \DecValTok{1}\NormalTok{, dtype}\OperatorTok{=}\NormalTok{torch.float32)) }\OperatorTok{\textgreater{}} \DecValTok{0}\NormalTok{)[}\DecValTok{0}\NormalTok{].}\BuiltInTok{max}\NormalTok{().item()}
-\NormalTok{        theta }\OperatorTok{=}\NormalTok{ cumulative\_sum[}\BuiltInTok{int}\NormalTok{(rho)] }\OperatorTok{/}\NormalTok{ (rho }\OperatorTok{+} \DecValTok{1}\NormalTok{)}
-        
-        \CommentTok{\# Compute the projected allocation}
-        \ControlFlowTok{return}\NormalTok{ torch.clamp(x }\OperatorTok{{-}}\NormalTok{ theta, }\BuiltInTok{min}\OperatorTok{=}\DecValTok{0}\NormalTok{)}
-
-\KeywordTok{def}\NormalTok{ optimize\_allocations(utilities, B, learning\_rate, num\_iterations):}
-    \CommentTok{"""}
-\CommentTok{    Optimizes the allocation of resources to maximize the total utility.}
-
-\CommentTok{    Args:}
-\CommentTok{        utilities (list): List of utility functions or GP{-}based predictive functions.}
-\CommentTok{        B (float): Total available resource.}
-\CommentTok{        learning\_rate (float): Step size for gradient ascent.}
-\CommentTok{        num\_iterations (int): Number of optimization iterations.}
-
-\CommentTok{    Returns:}
-\CommentTok{        torch.Tensor: Final resource allocations.}
-\CommentTok{    """}
-    \CommentTok{\# Initialize resource allocations equally}
-\NormalTok{    x }\OperatorTok{=}\NormalTok{ torch.tensor([}\FloatTok{1.0}\NormalTok{] }\OperatorTok{*} \BuiltInTok{len}\NormalTok{(utilities), requires\_grad}\OperatorTok{=}\VariableTok{True}\NormalTok{)}
-
-    \CommentTok{\# Optimization loop}
-    \ControlFlowTok{for}\NormalTok{ iteration }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(num\_iterations):}
-        \CommentTok{\# YOUR CODE HERE (\textasciitilde{}6 lines)}
-        \CommentTok{\# 1. Compute total utility and backprop}
-        \CommentTok{\# 2. Update x directly with x.grad}
-        \CommentTok{\# 3. Project onto convex constraint set since we are using Projected Gradient Descent (PGD)}
-        \ControlFlowTok{pass}
-        \CommentTok{\# }\RegionMarkerTok{END}\CommentTok{ OF YOUR CODE}
-        
-        \CommentTok{\# Log progress every 10 iterations or at the last iteration}
-        \ControlFlowTok{if}\NormalTok{ iteration }\OperatorTok{\%} \DecValTok{10} \OperatorTok{==} \DecValTok{0} \KeywordTok{or}\NormalTok{ iteration }\OperatorTok{==}\NormalTok{ num\_iterations }\OperatorTok{{-}} \DecValTok{1}\NormalTok{:}
-            \BuiltInTok{print}\NormalTok{(}\SpecialStringTok{f"Iteration }\SpecialCharTok{\{}\NormalTok{iteration}\SpecialCharTok{\}}\SpecialStringTok{: Total Utility = }\SpecialCharTok{\{}\NormalTok{total\_utility}\SpecialCharTok{.}\NormalTok{item()}\SpecialCharTok{:.4f\}}\SpecialStringTok{, Allocations = }\SpecialCharTok{\{}\NormalTok{x}\SpecialCharTok{.}\NormalTok{data}\SpecialCharTok{.}\NormalTok{numpy()}\SpecialCharTok{\}}\SpecialStringTok{"}\NormalTok{)}
-    
-    \ControlFlowTok{return}\NormalTok{ x}
-
-\ControlFlowTok{if} \VariableTok{\_\_name\_\_} \OperatorTok{==} \StringTok{"\_\_main\_\_"}\NormalTok{:}
-    \CommentTok{\# Generate GP models for each utility}
-\NormalTok{    gp\_1 }\OperatorTok{=}\NormalTok{ create\_predictive\_function(}\KeywordTok{lambda}\NormalTok{ x: utility\_1(torch.tensor(x)).numpy())}
-\NormalTok{    gp\_2 }\OperatorTok{=}\NormalTok{ create\_predictive\_function(}\KeywordTok{lambda}\NormalTok{ x: utility\_2(torch.tensor(x)).numpy())}
-\NormalTok{    gp\_3 }\OperatorTok{=}\NormalTok{ create\_predictive\_function(}\KeywordTok{lambda}\NormalTok{ x: utility\_3(torch.tensor(x)).numpy())}
-
-    \CommentTok{\# Combine utility GPs into a list for optimization}
-\NormalTok{    utilities }\OperatorTok{=}\NormalTok{ [gp\_1, gp\_2, gp\_3]  }\CommentTok{\# Use [utility\_1, utility\_2, utility\_3] for exact utility functions}
-
-    \CommentTok{\# Resource constraint and optimization settings}
-\NormalTok{    B }\OperatorTok{=} \DecValTok{10}  \CommentTok{\# Total available resource}
-\NormalTok{    learning\_rate }\OperatorTok{=} \FloatTok{0.1}  \CommentTok{\# Gradient ascent step size}
-\NormalTok{    num\_iterations }\OperatorTok{=} \DecValTok{2000}  \CommentTok{\# Number of iterations}
-
-    \CommentTok{\# Optimize allocations}
-\NormalTok{    final\_allocations }\OperatorTok{=}\NormalTok{ optimize\_allocations(utilities, B, learning\_rate, num\_iterations)}
-
-    \CommentTok{\# Final results}
-    \BuiltInTok{print}\NormalTok{(}\StringTok{"}\CharTok{\textbackslash{}n}\StringTok{Final allocations:"}\NormalTok{)}
-    \BuiltInTok{print}\NormalTok{(final\_allocations.data.numpy())}
-\end{Highlighting}
-\end{Shaded}
-
-\end{tcolorbox}
-
-\section*{References}\label{bibliography-4}
-\addcontentsline{toc}{section}{References}
-
-\markright{References}
-
-\phantomsection\label{refs-4}
-\begin{CSLReferences}{1}{0}
-\bibitem[\citeproctext]{ref-astudillo2023qeubodecisiontheoreticacquisitionfunction}
-Astudillo, Raul, Zhiyuan Jerry Lin, Eytan Bakshy, and Peter I. Frazier.
-2023. {``qEUBO: A Decision-Theoretic Acquisition Function for
-Preferential Bayesian Optimization.''}
-\url{https://arxiv.org/abs/2303.15746}.
-
-\bibitem[\citeproctext]{ref-bastani2020online}
-Bastani, Hamsa, and Mohsen Bayati. 2020. {``Online Decision Making with
-High-Dimensional Covariates.''} \emph{Operations Research} 68 (1):
-276--94. \url{https://doi.org/10.1287/opre.2019.1902}.
-
-\bibitem[\citeproctext]{ref-bommasani2022opportunities}
-Bommasani, Rishi, Drew A. Hudson, Ehsan Adeli, Russ Altman, Simran
-Arora, Sydney von Arx, Michael S. Bernstein, et al. 2022. {``On the
-Opportunities and Risks of Foundation Models.''}
-\url{https://arxiv.org/abs/2108.07258}.
-
-\bibitem[\citeproctext]{ref-bouneffouf2012a}
-Bouneffouf, Djallel, Amel Bouzeghoub, and Alda Lopes Gançarski. 2012.
-{``A Contextual-Bandit Algorithm for Mobile Context-Aware Recommender
-System.''} In \emph{Neural Information Processing}, edited by Tingwen
-Huang, Zhigang Zeng, Chuandong Li, and Chi Sing Leung, 324--31. Berlin,
-Heidelberg: Springer Berlin Heidelberg.
-
-\bibitem[\citeproctext]{ref-bouneffouf2020survey}
-Bouneffouf, Djallel, Irina Rish, and Charu Aggarwal. 2020. {``Survey on
-Applications of Multi-Armed and Contextual Bandits.''} In \emph{2020
-IEEE Congress on Evolutionary Computation (CEC)}, 1--8. Glasgow, United
-Kingdom: IEEE Press.
-\url{https://doi.org/10.1109/CEC48606.2020.9185782}.
-
-\bibitem[\citeproctext]{ref-bouneffouf2017bandit}
-Bouneffouf, Djallel, Irina Rish, and Guillermo A. Cecchi. 2017.
-{``Bandit Models of Human Behavior: Reward Processing in Mental
-Disorders.''} In \emph{Artificial General Intelligence}, edited by Tom
-Everitt, Ben Goertzel, and Alexey Potapov, 237--48. Cham: Springer
-International Publishing.
-
-\bibitem[\citeproctext]{ref-brohan2023rt2}
-Brohan, Anthony, Noah Brown, Justice Carbajal, Yevgen Chebotar, Xi Chen,
-Krzysztof Choromanski, Tianli Ding, et al. 2023. {``RT-2:
-Vision-Language-Action Models Transfer Web Knowledge to Robotic
-Control.''} \url{https://arxiv.org/abs/2307.15818}.
-
-\bibitem[\citeproctext]{ref-deng2009imagenet}
-Deng, Jia, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei.
-2009. {``ImageNet: A Large-Scale Hierarchical Image Database.''} In
-\emph{2009 IEEE Conference on Computer Vision and Pattern Recognition},
-248--55. IEEE.
-
-\bibitem[\citeproctext]{ref-ding2019interactive}
-Ding, Kaize, Jundong Li, and Huan Liu. 2019. {``Interactive Anomaly
-Detection on Attributed Networks.''} In \emph{Proceedings of the Twelfth
-ACM International Conference on Web Search and Data Mining}, 357--65.
-WSDM '19. New York, NY, USA: Association for Computing Machinery.
-\url{https://doi.org/10.1145/3289600.3290964}.
-
-\bibitem[\citeproctext]{ref-grauman2022ego4d}
-Grauman, Kristen, Andrew Westbury, Eugene Byrne, Zachary Chavis,
-Antonino Furnari, Rohit Girdhar, Jackson Hamburger, et al. 2022.
-{``Ego4D: Around the World in 3,000 Hours of Egocentric Video.''}
-\url{https://arxiv.org/abs/2110.07058}.
-
-\bibitem[\citeproctext]{ref-max_halford}
-Halford, Max. 2023. {``Online Active Learning in 80 Lines of Python.''}
-
-\bibitem[\citeproctext]{ref-he2020momentum}
-He, Kaiming, Haoqi Fan, Yuxin Wu, Saining Xie, and Ross Girshick. 2020.
-{``Momentum Contrast for Unsupervised Visual Representation Learning.''}
-In \emph{Proceedings of the IEEE/CVF Conference on Computer Vision and
-Pattern Recognition}, 9729--38. IEEE.
-
-\bibitem[\citeproctext]{ref-huo2017risk}
-Huo, Xiaoguang, and Feng Fu. 2017. {``Risk-Aware Multi-Armed Bandit
-Problem with Application to Portfolio Selection.''} \emph{Royal Society
-Open Science} 4 (November). \url{https://doi.org/10.1098/rsos.171377}.
-
-\bibitem[\citeproctext]{ref-kahneman_tversky_1979}
-Kahneman, Daniel, and Amos Tversky. 1979. {``Prospect Theory: Analysis
-of Decision Under Risk.''} \emph{Econometrica} 47 (2).
-\url{https://doi.org/10.2307/1914185}.
-
-\bibitem[\citeproctext]{ref-karamcheti2023languagedriven}
-Karamcheti, Siddharth, Suraj Nair, Annie S. Chen, Thomas Kollar, Chelsea
-Finn, Dorsa Sadigh, and Percy Liang. 2023. {``Language-Driven
-Representation Learning for Robotics.''}
-\url{https://arxiv.org/abs/2302.12766}.
-
-\bibitem[\citeproctext]{ref-liu2018customized}
-Liu, Bing, Tong Yu, Ian Lane, and Ole J. Mengshoel. 2018. {``Customized
-Nonlinear Bandits for Online Response Selection in Neural Conversation
-Models.''} In \emph{Proceedings of the Thirty-Second AAAI Conference on
-Artificial Intelligence and Thirtieth Innovative Applications of
-Artificial Intelligence Conference and Eighth AAAI Symposium on
-Educational Advances in Artificial Intelligence}.
-AAAI'18/IAAI'18/EAAI'18. New Orleans, Louisiana, USA: AAAI Press.
-
-\bibitem[\citeproctext]{ref-misra2019dynamic}
-Misra, Kanishka, Eric M. Schwartz, and Jacob Abernethy. 2019. {``Dynamic
-Online Pricing with Incomplete Information Using Multiarmed Bandit
-Experiments.''} \emph{Marketing Science} 38 (2): 226--52.
-\url{https://doi.org/10.1287/mksc.2018.1129}.
-
-\bibitem[\citeproctext]{ref-nair2022r3m}
-Nair, Suraj, Aravind Rajeswaran, Vikash Kumar, Chelsea Finn, and Abhinav
-Gupta. 2022. {``R3M: A Universal Visual Representation for Robot
-Manipulation.''} \url{https://arxiv.org/abs/2203.12601}.
-
-\bibitem[\citeproctext]{ref-perez2018contextual}
-Perez, Julien, and Tomi Silander. 2018. {``Contextual Memory Bandit for
-Pro-Active Dialog Engagement.''}
-\url{https://openreview.net/forum?id=SJiHOSeR-}.
-
-\bibitem[\citeproctext]{ref-radford2021learning}
-Radford, Alec, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh,
-Sandhini Agarwal, Girish Sastry, et al. 2021. {``Learning Transferable
-Visual Models from Natural Language Supervision.''} \emph{arXiv Preprint
-arXiv:2103.00020}.
-
-\bibitem[\citeproctext]{ref-shen2015portfolio}
-Shen, Weiwei, Jun Wang, Yu-Gang Jiang, and Hongyuan Zha. 2015.
-{``Portfolio Choices with Orthogonal Bandit Learning.''} In
-\emph{Proceedings of the 24th International Conference on Artificial
-Intelligence}, 974--80. IJCAI'15. Buenos Aires, Argentina: AAAI Press.
-
-\bibitem[\citeproctext]{ref-advancements_dueling}
-Sui, Yanan, Masrour Zoghi, Katja Hofmann, and Yisong Yue. 2018.
-{``Advancements in Dueling Bandits.''} \emph{Proceedings of the
-Twenty-Seventh International Joint Conference on Artificial
-Intelligence}. \url{https://doi.org/10.24963/ijcai.2018/776}.
-
-\bibitem[\citeproctext]{ref-upadhyay2019a}
-Upadhyay, Sohini, Mayank Agarwal, Djallel Bouneffouf, and Yasaman
-Khazaeni. 2019. {``A Bandit Approach to Posterior Dialog Orchestration
-Under a Budget.''}
-
-\bibitem[\citeproctext]{ref-walke2023bridgedata}
-Walke, Homer, Kevin Black, Abraham Lee, Moo Jin Kim, Max Du, Chongyi
-Zheng, Tony Zhao, et al. 2023. {``BridgeData V2: A Dataset for Robot
-Learning at Scale.''} \url{https://arxiv.org/abs/2308.12952}.
-
-\bibitem[\citeproctext]{ref-wu2018parallelknowledgegradientmethod}
-Wu, Jian, and Peter I. Frazier. 2018. {``The Parallel Knowledge Gradient
-Method for Batch Bayesian Optimization.''}
-\url{https://arxiv.org/abs/1606.04414}.
-
-\bibitem[\citeproctext]{ref-xiao2022masked}
-Xiao, Tete, Ilija Radosavovic, Trevor Darrell, and Jitendra Malik. 2022.
-{``Masked Visual Pre-Training for Motor Control.''}
-\url{https://arxiv.org/abs/2203.06173}.
-
-\bibitem[\citeproctext]{ref-ask_help}
-Xie, Annie, Fahim Tajwar, Archit Sharma, and Chelsea Finn. 2022. {``When
-to Ask for Help: Proactive Interventions in Autonomous Reinforcement
-Learning.''} \url{https://arxiv.org/abs/2210.10765}.
-
-\bibitem[\citeproctext]{ref-xu2024principledpreferentialbayesianoptimization}
-Xu, Wenjie, Wenbin Wang, Yuning Jiang, Bratislav Svetozarevic, and Colin
-N. Jones. 2024. {``Principled Preferential Bayesian Optimization.''}
-\url{https://arxiv.org/abs/2402.05367}.
-
-\bibitem[\citeproctext]{ref-YUE20121538}
-Yue, Yisong, Josef Broder, Robert Kleinberg, and Thorsten Joachims.
-2012. {``The k-Armed Dueling Bandits Problem.''} \emph{Journal of
-Computer and System Sciences} 78 (5): 1538--56.
-\url{https://doi.org/10.1016/j.jcss.2011.12.028}.
-
-\bibitem[\citeproctext]{ref-IR}
-Yue, Yisong, and Thorsten Joachims. 2009. {``Interactively Optimizing
-Information Retrieval Systems as a Dueling Bandits Problem.''}
-\emph{Proceedings of the 26th Annual International Conference on Machine
-Learning}. \url{https://doi.org/10.1145/1553374.1553527}.
-
-\bibitem[\citeproctext]{ref-fgts_cdb}
-Zhang, Tong. 2021. {``Feel-Good Thompson Sampling for Contextual Bandits
-and Reinforcement Learning.''} \emph{CoRR} abs/2110.00871.
-\url{https://arxiv.org/abs/2110.00871}.
-
-\bibitem[\citeproctext]{ref-zhou2017large}
-Zhou, Qian, XiaoFang Zhang, Jin Xu, and Bin Liang. 2017. {``Large-Scale
-Bandit Approaches for Recommender Systems.''} In \emph{Neural
-Information Processing}, edited by Derong Liu, Shengli Xie, Yuanqing Li,
-Dongbin Zhao, and El-Sayed M. El-Alfy, 811--21. Cham: Springer
-International Publishing.
-
-\end{CSLReferences}
-
-\bookmarksetup{startatroot}
-
-\chapter{Aggregation}\label{aggregation}
-
-\section{Social Choice Theory and Implications for AI Preference
-Aggregation}\label{social-choice-theory-and-implications-for-ai-preference-aggregation}
-
-In many applications, human preferences must be aggregated across
-multiple individuals to determine a collective decision or ranking. This
-process is central to social choice theory, which provides a
-mathematical foundation for preference aggregation. Unlike individual
-preference modeling, which focuses on how a single person makes
-decisions, social choice theory addresses the challenge of combining
-multiple preference profiles into a single coherent outcome. A social
-welfare function (SWF) takes as input each individual's preference
-ranking over a set of alternatives and produces a social ranking of
-those alternatives. A related concept is a social choice function (SCF),
-which selects a single winning alternative given individuals'
-preferences. Many voting rules can be seen as social choice functions
-that aim to reflect the group's preferences. Formally, let
-\(N=\{1,2,\dots,n\}\) be a set of \(n\) voters (agents) and
-\(A=\{a_1,\dots,a_m\}\) a set of \(m\) alternatives (with \(m \ge 3\)).
-Each voter \(i\) has a preference order \(\succ_i\) over \(A\). A social
-choice function is a mapping \(f: (\succ_1,\dots,\succ_n)\mapsto A\)
-that picks a winning alternative for each possible profile of individual
-preferences. A social welfare function is a mapping that produces a
-complete societal ranking \(\succ^*\) of the alternatives. The central
-question is: can we design an aggregation rule that faithfully
-represents individual preferences while satisfying certain fairness or
-rationality axioms?
-
-Many common voting rules illustrate different methods of aggregation,
-each with its own merits and vulnerabilities:
-
-\begin{itemize}
-\tightlist
-\item
-  Plurality: Each voter names their top choice; the alternative with the
-  most votes wins.
-\item
-  Borda Count: Voters rank all alternatives, and points are assigned
-  based on the position in each ranking. For example, with \(m\)
-  alternatives, a voter's top-ranked alternative gets \(m-1\) points,
-  the second-ranked gets \(m-2\), and so on down to 0. The Borda score
-  of an alternative is the sum of points from all voters, and the winner
-  is the alternative with the highest total score.
-\item
-  Single Transferable Vote (STV): Voters rank choices, and the count
-  proceeds in rounds. In each round, the alternative with the fewest
-  votes is eliminated and those votes are transferred to the next
-  preferred remaining alternative on each ballot, until one candidate
-  has a majority.
-\item
-  Condorcet Methods: These look for a candidate that wins in all
-  pairwise majority contests against other alternatives (the Condorcet
-  winner), if such an alternative exists.
-\end{itemize}
-
-However, preference aggregation is not always straightforward. The
-Condorcet paradox illustrates that majority preferences can be cyclic
-(rock-paper-scissors style), so that no single alternative is
-majority-preferred to all others, violating transitivity. Different
-voting rules can yield different winners on the same profile,
-highlighting how the choice of rule influences the outcome. To guide the
-design of social choice functions, several desirable properties or
-axioms have been proposed. Three classical fairness criteria are:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\tightlist
-\item
-  Unanimity (Pareto efficiency): If all individuals strictly prefer one
-  alternative \(x\) over another \(y\) (i.e.~\(x \succ_i y\) for every
-  voter \(i\)), then the group ranking should prefer \(x\) over \(y\) as
-  well (\(x \succ^* y\)).
-\item
-  Independence of Irrelevant Alternatives (IIA): The social preference
-  between any two alternatives \(x\) and \(y\) should depend only on the
-  individual preferences between \(x\) and \(y\). In other words, if we
-  change individuals' rankings of other ``irrelevant'' alternatives (not
-  \(x\) or \(y\)) in any way, the group's relative ordering of \(x\) and
-  \(y\) should remain unchanged.
-\item
-  Non-dictatorship: The aggregation should not simply follow a single
-  individual's preference regardless of others. There is no voter \(i\)
-  who always gets their top choice as the social choice (or whose
-  rankings always become the social ranking), irrespective of other
-  voters' preferences.
-\end{enumerate}
-
-Additionally, we assume an unrestricted domain (universal
-admissibility): individuals may have any transitive preference ordering
-over the \(m\) alternatives (no restrictions like single-peaked
-preferences unless explicitly imposed). One might hope that a fair
-voting rule exists that satisfies all the above properties for three or
-more alternatives. Surprisingly, a seminal negative result shows this is
-impossible.
-
-Arrow's Impossibility Theorem (\citeproc{ref-arrow1951}{Arrow 1951}) is
-a cornerstone of social choice theory. It states that when there are
-three or more alternatives (\(m\ge 3\)), no social welfare function can
-simultaneously satisfy Unanimity, IIA, and Non-dictatorship: any rule
-that satisfies Unanimity and IIA must be dictatorial. In other words, any
-aggregation mechanism that is not dictatorial will inevitably violate at
-least one of the other fairness criteria. The theorem is usually proven by contradiction:
-assuming a social welfare function satisfies all conditions, one can
-show that one voter's preferences always decide the outcome, hence the
-rule is dictatorial. Intuitively, Arrow's theorem is driven by the
-possibility of preference cycles in majority voting. Even if individual
-preferences are transitive, aggregated majorities can prefer \(x\) to
-\(y\), \(y\) to \(z\), and \(z\) to \(x\) in a cycle, as in the
-Condorcet paradox. Under Unanimity and IIA, the social ranking must
-locally match these pairwise preferences, but this produces a
-contradiction with transitivity unless one voter's ranking is given
-overriding authority. A sketch of Arrow's proof is as follows: one shows
-that under the axioms, the social ranking between any two alternatives
-\(x\) and \(y\) must agree with some particular voter's preference (the
-``pivotal'' voter for that pair). With IIA, the identity of the pivotal
-voter must be the same across all pairs of alternatives, otherwise by
-cleverly constructing profiles one can derive a conflict. This single
-pivotal voter then effectively dictates the entire social order,
-violating Non-dictatorship. Hence, the axioms are incompatible.
-
-Arrow's Impossibility Theorem has profound implications: it formalizes
-the inherent trade-offs in designing any fair aggregation scheme. In
-practice, different voting rules relax one or more of Arrow's
-conditions. For instance, Borda count violates IIA (since introducing or
-removing an irrelevant alternative can change the point totals), while a
-dictatorship violates fairness blatantly. The theorem suggests that
-every practical voting system must sacrifice at least one of the ideal
-fairness criteria. It also motivated the exploration of alternative
-frameworks (such as allowing interpersonal comparisons of utility or
-cardinal preference aggregation) to escape the impossibility by
-weakening assumptions.
-
-Complementing Arrow's theorem, the Gibbard--Satterthwaite theorem
-focuses on incentives and strategic manipulation in voting systems
-(\citeproc{ref-gibbard1973}{Gibbard 1973};
-\citeproc{ref-satterthwaite1975}{Satterthwaite 1975}). It considers any
-deterministic social choice function \(f\) that chooses a single winner
-from the set of \(m\ge 3\) alternatives. The theorem states that if
-\(f\) is strategy-proof (incentive compatible) and onto (its range of
-outcomes is the entire set of alternatives), then \(f\) must be
-dictatorial. Strategy-proofness (also called truthfulness or
-dominant-strategy incentive compatibility) means that no voter can ever
-benefit by misrepresenting their true preferences, regardless of what
-others do. In other words, reporting their genuine ranking is a (weakly)
-dominant strategy for each voter. The theorem implies that for any
-realistic voting rule where every alternative can possibly win, either
-one voter effectively decides the outcome (a dictatorship) or else the
-rule is susceptible to strategic manipulation by voters. The
-Gibbard--Satterthwaite theorem tells us that every non-dictatorial
-voting rule for 3 or more alternatives is manipulable: there will exist
-some election scenario where a voter can gain a more preferred outcome
-by voting insincerely (i.e.~not according to their true preferences).
-For example, in a simple plurality vote, a voter whose true favorite is
-a long-shot candidate might vote for a more viable candidate to avoid a
-worst-case outcome (``lesser of two evils'' voting). In a Borda count
-election, voters might strategically raise a competitor in their ranking
-to push down an even stronger rival. The only way to avoid all such
-strategic voting incentives is to have a dictatorship or limit the
-choice set to at most two alternatives.
-
-The proof of Gibbard--Satterthwaite is non-trivial, but one can outline
-the idea: Given a non-dictatorial and onto rule \(f\), one shows there
-exist at least three distinct outcomes that can result from some
-preference profiles. By carefully constructing profiles and using the
-onto property, one finds a situation where a single voter can change the
-outcome by switching their order of two candidates, demonstrating
-manipulability. The theorem is robust -- even if we allow ties or weaker
-conditions, similar impossibility results hold (Gibbard's 1978 extension
-handles randomized rules). The practical takeaway is that all meaningful
-voting protocols encourage tactical voting in some situations.
-Nonetheless, certain systems are considered ``harder to manipulate'' or
-more resistant due to complexity or uncertainty. For instance, while STV
-(ranked-choice voting) can be manipulated in theory, determining a
-beneficial strategic vote can be NP-hard in worst cases, which arguably
-provides some practical deterrence to manipulation
-(\citeproc{ref-bartholdi1989}{Bartholdi, Tovey, and Trick 1989}).
-
-Arrow's and Gibbard--Satterthwaite's theorems highlight the limitations
-any preference aggregation method must face. In domains like
-reinforcement learning from human feedback (RLHF) and AI alignment, we
-also aggregate preferences -- often preferences of multiple human
-evaluators or preferences revealed in pairwise comparisons -- to guide
-machine learning systems. While these settings sometimes use cardinal
-scores or learned reward functions (escaping the strict ordinal
-framework of Arrow's theorem), the spirit of these impossibility results
-still applies: there is no perfect way to aggregate human opinions
-without trade-offs.
-
-For example, aggregating human feedback to train a model may run into
-inconsistencies analogous to preference cycles, especially when feedback
-comes from diverse individuals with different values. A simple majority
-vote over preferences might yield unstable or unfair outcomes if some
-annotators are systematically in the minority. Weighting votes by some
-credibility or expertise (weighted voting) can improve outcomes but
-raises the question of how to set the weights without introducing
-dictator-like influence. Recent research has proposed methods like jury
-learning -- which integrates dissenting voices by having a panel
-(``jury'') of models or human subgroups whose aggregated judgment guides
-the learning (\citeproc{ref-gordon2022jury}{Gordon et al. 2022}) -- to
-ensure minority preferences are not entirely ignored. Another
-perspective is social choice in AI alignment, which suggests using
-social choice theory to design AI systems that respect a plurality of
-human values instead of collapsing everything into a single objective.
-In pluralistic value alignment, instead of forcing a single ``best''
-solution, an AI might present a diverse set of options or behave in a
-way that reflects a distribution of values. This approach aims to
-preserve the diversity of human preferences rather than always
-aggregating to one monolithic preference. For instance, a conversational
-AI might be designed to recognize multiple acceptable responses (each
-aligning with different value systems) rather than one canonical
-``aligned'' response for a given query.
-
-These considerations are especially relevant in generative AI and large
-language models, where training involves human preference data. If we
-aggregate feedback naively, we might overfit to the majority preference
-and lose minority perspectives (a form of tyranny of the majority). On
-the other hand, trying to satisfy everyone can lead to indecision or an
-incoherent objective. The impossibility results remind us there is no
-free lunch: we must carefully decide which properties to prioritize
-(e.g.~giving more weight to expert annotators versus preserving broader
-fairness, or balancing consistency vs inclusivity). Designing
-aggregation mechanisms for AI that reflect collective human values is an
-ongoing challenge. It often involves insights from traditional voting
-theory (to understand trade-offs and failure modes) combined with
-machine learning techniques (to model and learn from preference data).
-In summary, social choice theory provides cautionary guidance as we
-build systems that learn from human preferences: we need to be conscious
-of which fairness criteria we relax and be transparent about the
-compromises being made in any preference aggregation pipeline.
-
-\section{Mechanism Design}\label{single-item-auctions}
-
-While voting rules aggregate ordinal rank preferences to select a social
-outcome, another class of preference aggregation occurs in economic
-settings like auctions and general mechanism design. Here individuals
-reveal their valuations (numerical utilities) for outcomes, and the
-mechanism chooses an outcome (such as an allocation of goods) and
-possibly payments. Mechanism design asks: how can we design rules so
-that rational agents, acting in their own interest, end up revealing
-information that leads to a socially desirable outcome? A central
-concept is incentive compatibility -- the mechanism should be designed
-so that each participant's best strategy is to act according to their
-true preferences (e.g.~bid their true value). In this section, we focus
-on auctions as a prime example of preference aggregation with money, and
-highlight classical results including Vickrey--Clarke--Groves (VCG)
-mechanisms and Myerson's optimal auction.
-
-Consider a single-item auction with one item for sale and \(n\) bidders.
-Bidder \(i\) has a private valuation \(v_i\) for the item (how much the
-item is worth to them). Each bidder's goal is to maximize their own
-utility, defined as \(v_i - p_i\) if they win and pay price \(p_i\), or
-\(0\) if they do not win (assuming quasilinear utility where money is
-the transferable utility). The auction's task is to allocate the item to
-one of the bidders and possibly determine payments. We can think of an
-auction as a mechanism that asks each bidder for a ``message''
-(typically a bid representing how much they are willing to pay), then
-selects a winner and a price based on the bids. A key objective might be
-social welfare maximization -- allocate the item to the bidder who
-values it most (maximizing \(v_i\) of the winner). Another possible
-objective is revenue maximization for the seller -- choose the
-allocation and price to maximize the seller's expected payment.
-
-A classic result in auction theory is that to maximize social welfare in
-a single-item private-value setting, one should award the item to the
-highest valuer -- and this can be done in an incentive-compatible way by
-using a second-price auction. The Vickrey second-price auction works as
-follows: (1) All bidders submit sealed bids \(b_1, b_2, \ldots, b_n\).
-(2) The bidder with the highest bid wins the item. (3) The price paid by
-the winner is the second-highest bid. For example, if the bids are
-\((2,\, 6,\, 4,\, 1)\) (in some currency units), the highest bid is
-\(6\) (by bidder 2, say) and the second-highest is \(4\). Bidder 2 wins
-the item and pays \(4\).
-
-Under this mechanism, it turns out that bidding truthfully \(b_i = v_i\)
-is a dominant strategy for each bidder. In other words, the auction is
-dominant-strategy incentive compatible (DSIC): no matter what others do,
-a bidder maximizes their expected utility by reporting their true
-valuation. The intuition is as follows. If bidder \(i\) bids lower than
-their true value (i.e.~\(b_i < v_i\)), and if their true value was
-actually the highest, they risk losing the item even though they value
-it more than the price they would have paid -- a missed opportunity for
-positive utility. Bidding higher than their value (\(b_i > v_i\)) cannot
-help them win in any situation where bidding truthfully wouldn't (it
-could only make a difference if their true \(v_i\) wasn't the highest
-but they tried to win anyway); and if they do win with an inflated bid,
-they might end up paying the second-highest bid which could be above
-their true value, yielding negative utility. By bidding exactly \(v_i\),
-if they win, it means all other bids were lower, so \(v_i\) is at least
-as high as the second-highest bid \(p\) they pay -- guaranteeing
-non-negative utility \(v_i - p \ge 0\). If they lose, it means someone
-else had a higher bid (hence higher value, if others are truthful), so
-bidder \(i\) wouldn't have gained anyway. This argument, made rigorous
-by Vickrey (\citeproc{ref-vickrey1961}{Vickrey 1961}), establishes that
-truth-telling is a dominant strategy in the second-price auction. As a
-consequence, when everyone bids truthfully, the item is allocated to the
-bidder with the highest \(v_i\), achieving maximum social surplus
-(allocative efficiency). The second-price auction is thus an elegant
-mechanism that aligns individual incentives with social welfare
-maximization.
-
-It is worth contrasting this with a first-price auction, where the
-winner pays their own bid. In a first-price auction, bidders have an
-incentive to bid below their true value (bidding exactly \(v_i\) would
-leave zero surplus upon winning), so equilibrium involves bid shading. The
-first-price auction can still allocate to the highest valuer in
-equilibrium, but only through strategic behavior (and it is not DSIC).
-By charging the second-highest bid, the Vickrey auction removes the
-incentive to shade bids, since the price does not directly depend on
-one's own bid beyond the fact of winning or losing.
-
-So far, we discussed auctions aimed at maximizing social welfare. In
-many cases, the auctioneer (seller) is interested in maximizing revenue.
-A foundational result by Roger Myerson (1981) provides a
-characterization of optimal auctions (those that maximize the seller's
-expected revenue) for single-item settings under certain assumptions
-(\citeproc{ref-myerson1981}{Myerson 1981}). The problem can be
-formulated as follows: suppose each bidder's private value \(v_i\) is
-drawn independently from a known distribution \(F_i\) (for simplicity,
-assume identical distribution \(F\) for all bidders, i.i.d.). We seek a
-mechanism (allocation rule and payment rule) that maximizes the seller's
-expected payment, subject to incentive compatibility and individual
-rationality (participants should not expect negative utility from
-truthful participation).
-
-Myerson's theorem states that the optimal auction in such a setting is a
-threshold auction characterized by virtual valuations. Define the
-virtual value for a bidder with value \(v\) as
-\(\varphi(v) = v - \frac{1-F(v)}{f(v)}\), where \(f\) is the probability
-density function of \(F\) (assuming it is continuous). An assumption
-called regularity (which holds for many distributions) is that
-\(\varphi(v)\) is non-decreasing in \(v\). Myerson showed that the
-revenue-maximizing strategy is: treat \(\varphi(v)\) as the effective
-``score'' of a bid, allocate the item to the bidder with the highest
-non-negative virtual value (if all virtual values are negative, allocate
-to no one), and charge them the smallest value they could have such that
-they would still win (the payment is essentially the critical bid where
-\(\varphi\) of that bid equals the second-highest virtual value or the
-zero cutoff). In practice, for i.i.d. bidders, this reduces to: there is
-an optimal reserve price \(r\) such that you sell to the highest bidder
-if and only if their bid \(b_{\max} \ge r\); if sold, the price is the
-max of the second-highest bid and \(r\).
-
-In the case of \(n\) bidders with values i.i.d. uniform on \([0,1]\)
-(which is a regular distribution), one can compute the optimal reserve
-price. The virtual value function for uniform \([0,1]\) is
-\(\varphi(v) = v - \frac{1-v}{1} = 2v - 1\). Setting \(\varphi(v)\ge 0\)
-gives \(v \ge 0.5\). So Myerson's mechanism says: don't sell the item if
-all bids are below 0.5; otherwise, sell to the highest bidder at a
-price of at least 0.5. This is exactly a second-price auction with a reserve of
-\(r=0.5\). Our earlier example implicitly demonstrated this: with two
-uniform(0,1) bidders, the optimal auction sets a reserve price of
-\(0.5\) and yields a certain expected revenue. We can break down the
-cases:
-\begin{itemize}
-\item With probability \(1/4\), both bidders have values below
-  \(0.5\) (each below 0.5 with probability 1/2), in which case nobody
-  wins and revenue is 0.
-\item With probability \(1/4\), both bidders have \(v > 0.5\). In this
-  case, the second-price auction with reserve sells to the highest
-  bidder at the max of the second-highest value and 0.5. Conditional on
-  both \(v_1, v_2 > 0.5\), the values are uniform on \([0.5, 1]\), so the
-  expected second-highest value (the mean of the minimum) is
-  \(\frac{2}{3}\). Since that value always exceeds 0.5, the expected
-  price in this case is about 0.667.
-\item With probability \(1/2\), one bidder is above 0.5 and the other
-  below. The bidder above 0.5 wins and pays the reserve price 0.5,
-  since the second-highest bid falls below the reserve.
-\end{itemize}
-
-Taking the expectation, the seller's expected revenue is
-\(0 \cdot \frac{1}{4} + \frac{2}{3} \cdot \frac{1}{4} + \frac{1}{2} \cdot \frac{1}{2} = 0 + \frac{1}{6} + \frac{1}{4} = \frac{5}{12} \approx 0.417\).
-This is higher than the expected revenue without a reserve. In fact,
-without a reserve (just a plain second-price with two bidders uniform
-{[}0,1{]}), one can compute the expected revenue is
-\(1/3 \approx 0.333\) (the expected value of the lower of the two bids). Thus,
-the reserve has increased revenue. Myerson's theory tells us that indeed
-the second-price auction with an optimally chosen reserve maximizes
-revenue among all DSIC mechanisms for this setting. A notable special
-case result is that when bidder distributions are i.i.d. and regular, an
-optimal auction is essentially ``allocatively efficient with a reserve
-price'' -- i.e.~aside from possibly excluding low-value bidders via a
-reserve, it allocates to the highest remaining bid.
-
-Myerson's work also highlighted the gap between revenue maximization and
-welfare maximization. The price of optimality (in revenue) is that the
-seller might sometimes forego efficient allocation (e.g.~not selling
-despite a willing buyer, in order to preserve a high reserve price
-strategy). In contrast, Vickrey's auction always allocates efficiently
-but may not maximize revenue.
-
-An interesting insight in auction theory is that increasing competition
-can yield more revenue than fine-tuning the auction mechanism. The
-Bulow--Klemperer theorem (\citeproc{ref-bulow-klemperer1996}{Bulow and
-Klemperer 1996}) demonstrates that, under certain regularity
-assumptions, a simple welfare-maximizing auction with one extra bidder
-outperforms the optimal auction with fewer bidders. Specifically, for
-i.i.d. bidders with a regular distribution \(F\), the expected revenue
-of a second-price auction with \(n+1\) bidders is at least as high as
-the expected revenue of the Myerson-optimal auction with \(n\) bidders.
-In formula form:
-
-\[
-\mathbb{E}_{v_1,\ldots,v_{n+1} \sim F}[\text{Rev}^{\text{(second-price)}}(n+1 \text{ bidders})] \geq 
-\mathbb{E}_{v_1,\ldots,v_n \sim F}[\text{Rev}^{\text{(optimal)}}(n \text{ bidders})] \,. 
-\tag{4.1}\label{eq-eq3.64}
-\]
-
-This result suggests that, in practice, having more participants
-(competition) is often more valuable than exploiting detailed knowledge
-of bidder distributions. As a corollary, a policy recommendation is that
-a seller is usually better off using a simple auction design (like a
-Vickrey auction or other transparent mechanism) and putting effort into
-attracting more bidders, rather than using a complex optimal mechanism
-that might discourage participation.
-
-Vickrey's second-price auction can be generalized to multiple items and
-more complex outcomes by the Vickrey--Clarke--Groves (VCG) mechanism.
-The VCG mechanism is a cornerstone of mechanism design that provides a
-general solution for implementing socially efficient outcomes
-(maximizing total stated value) in dominant strategies, for a broad
-class of problems. It works for any scenario where agents have
-quasilinear utilities and we want to maximize the sum of valuations.
-
-In a general mechanism design setting, let \(\Omega\) be the set of
-possible outcomes. Each agent \(i\) has a private valuation function
-\(v_i(\omega)\) for outcomes \(\omega \in \Omega\) (the amount of
-utility, in money terms, that \(i\) gets from outcome \(\omega\)).
-Agents report bids \(b_i(\omega)\) (which we hope equal \(v_i(\omega)\)
-if they are truthful). The mechanism then chooses an outcome
-\(\omega^* \in \Omega\) to maximize the reported total value:
-
-\[
-\omega^* = \arg\max_{\omega \in \Omega} \sum_{i=1}^n b_i(\omega) \,,
-\]
-
-i.e.~\(\omega^*\) is the outcome that would be socially optimal if the
-\(b_i\) were true values. To induce truth-telling, VCG sets payments
-such that each agent pays the externality they impose on others by their
-presence. Specifically, one convenient form of the VCG payment for agent
-\(i\) is:
-
-\[
-p_i(b) = \max_{\omega \in \Omega} \sum_{j \neq i} b_j(\omega)\;-\;\sum_{j \neq i} b_j(\omega^*) \,,
-\]
-
-which can be interpreted as: what would the total value of others be if
-\(i\) were not present (first term, maximizing without \(i\)) minus the
-total value others actually get in the chosen outcome \(\omega^*\).
-Equivalently, we can write the payment as the agent's bid for the chosen
-outcome minus a rebate term:
-
-\[
-p_i(b) = b_i(\omega^*) \;-\; \Big[\sum_{j=1}^n b_j(\omega^*) - \max_{\omega \in \Omega} \sum_{j \neq i} b_j(\omega)\Big] \,. \tag{4.2}\label{eq-eq3.67}
-\]
-
-This formula (which in a single-item auction reduces to second-price
-logic) ensures that each agent's net payoff is
-\(v_i(\omega^*) - p_i = \big[v_i(\omega^*) + \sum_{j\neq i} b_j(\omega^*)\big] - \max_{\omega} \sum_{j\neq i} b_j(\omega)\).
-The last term does not depend on agent \(i\)'s report at all, while the
-bracketed term is the total welfare of the chosen outcome measured with
-\(i\)'s true value and the others' reported values. Agent \(i\)
-influences their payoff only through the choice of \(\omega^*\), and the
-mechanism picks \(\omega^*\) to maximize \(b_i(\omega) + \sum_{j\neq i} b_j(\omega)\).
-Reporting \(b_i = v_i\) therefore makes the mechanism maximize exactly
-the bracketed term, so misreporting \(v_i\) cannot increase their utility. Thus
-truthful reporting is a dominant strategy. VCG is dominant-strategy
-incentive compatible (DSIC) and, under truthful reports, produces an outcome that maximizes
-\(\sum_i v_i(\omega)\), achieving social welfare maximization.
-
-VCG provides a powerful existence result: under broad conditions, there
-is a mechanism that achieves efficient allocation with truth-telling (in
-fact, VCG is essentially the unique one, aside from adding harmless
-constant transfers). However, implementing VCG in practice can be
-difficult. One challenge is computational: finding
-\(\arg\max_{\omega}\sum_i b_i(\omega)\) can be NP-hard if \(\Omega\) is
-a combinatorially large space (as in many combinatorial auctions).
-Another issue is budget balance and revenue: VCG payments might not
-yield any revenue to the mechanism designer in some cases (or even
-require subsidies in complex settings), and they can be low or zero in
-certain environments, which is problematic if the seller needs revenue.
-VCG is also vulnerable to collusion or the presence of fake identities
-(sybil attacks) -- the mechanism assumes each participant is a separate
-entity; if one bidder can split into two identities, they might game the
-outcome.
-
-Nonetheless, for many domains, VCG or variants have been successfully
-used or at least studied. Notably, combinatorial auctions (where
-multiple items are up for sale and bidders have valuations for bundles
-of items) can in theory be handled by VCG: just let \(\Omega\) be all
-possible allocations of items to bidders, and have bidders report
-\(b_i(S)\) for each bundle \(S\) of items. VCG would allocate the items
-in the way that maximizes total reported value and charge each bidder
-the opportunity cost their presence imposes on others. In practice, as
-mentioned, combinatorial auctions face exponential complexity in
-preference reporting (each bidder potentially has to specify a value for
-every subset of items) and winner determination (solving an NP-hard
-combinatorial optimization). Heuristic or restricted approaches (like
-limiting the kinds of bundles or using iterative bidding with query
-learning of preferences) are used to make the problem tractable.
-Additionally, pure VCG in combinatorial settings can have undesirable
-properties: for example, in some cases adding more bidders can cause VCG
-prices to drop to zero (the so-called ``threshold problem'' or revenue
-monotonicity failure), and bidders may collude to manipulate their bids
-collectively.
-
-One high-stakes application of combinatorial auctions is spectrum
-auctions for selling licenses of electromagnetic spectrum to telecom
-companies. Governments have used multi-round combinatorial auctions to
-allocate spectrum, with billions of dollars at stake. Designing these
-auctions requires balancing efficiency with simplicity and robustness to
-strategic behavior. Early spectrum auctions that used simpler formats
-(like sequential auctions or one-shot sealed bids for each license) ran
-into problems like the exposure problem -- a bidder valuing a
-combination of items (say complementary licenses in adjacent regions)
-risks winning only part of the combination at a high price, which could
-be bad for them if the items are worth much less separately. The
-simultaneous multi-round auction (SMRA) was an innovation that allowed
-bidding on all items at once in rounds, giving bidders some price
-discovery to mitigate the exposure problem. Even so, strategic issues
-like demand reduction (bidders deliberately not bidding on too many
-items to keep prices low) and tacit collusion through signaling bids
-have been observed. These practical complications underscore that while
-VCG is a beautiful theoretical ideal, real-world mechanism design often
-involves compromises and tweaks.
-
-\subsection{Case Study 1: Mechanism for Peer
-Grading}\label{case-study-1-mechanism-for-peer-grading}
-
-To illustrate an application of mechanism design beyond auctions,
-consider a classroom setting where students grade each other's work
-(peer assessment). The goal is to design a system (a ``mechanism'') that
-produces fair and accurate grades while incentivizing students to put
-effort into grading. Jason Hartline and colleagues (2020) studied such a
-scenario, examining how to optimize scoring rules for peer grading
-(\citeproc{ref-jasonH2020}{Hartline et al. 2020}). In this setting,
-students are both agents (who might strategize to maximize their own
-grade or minimize their effort) and graders. The ``outcome'' we want is
-a set of final grades for students, ideally reflecting the true quality
-of their work.
-
-One idea is to use proper scoring rules to evaluate the peer graders. A
-proper scoring rule is a concept from forecast evaluation that gives
-highest expected score for truthful reporting of probabilities. In peer
-grading, one might try to reward students based on how close their
-grading is to some ground truth or to the TA's grades. However, a naive
-application of proper scoring can backfire. Hartline et al.~observed a
-``lazy peer grader'' problem: if students figure out that always giving
-an average score (say 80\%) yields a decent reward under the scoring
-rule, they might not bother to carefully distinguish good and bad work.
-In one experiment, giving all peers an 80\% could yield a 96\% accuracy
-score for the grader under a certain scoring rule
-(\citeproc{ref-jasonH2023}{Hartline et al. 2023}). This clearly
-undermines the goal -- the grader is basically cheating the system by
-always predicting the class average.
-
-To combat this, the mechanism designers sought a scoring rule that
-maximizes the difference in reward between a diligent grading and a lazy
-strategy, thereby incentivizing effort. They formulated this as an
-optimization problem: design the reward function for peer graders such
-that truthful, careful grading yields a strictly higher expected score
-than any degenerate strategy like always giving the average. By
-analyzing data and grader behavior models, they adjusted the scoring
-rules to penalize obviously lazy patterns and reward variance when
-warranted. The resulting mechanism improved the accuracy of peer grading
-by aligning the incentives of student graders (who want a high score for
-their grading job) with the objective of accurate assessment. This case
-study highlights how ideas of incentive compatibility and mechanism
-design apply even in social/educational contexts: the ``payments'' are
-points towards one's own grade, and the mechanism must account for
-strategic behavior to ensure a reliable outcome.
-
-In conclusion, mechanism design provides a toolkit for aggregating
-preferences (or signals, like grades or bids) in a principled way, by
-explicitly accounting for individual incentives. Whether in auctions,
-peer grading, or other domains, the design of rules (allocation
-algorithms, payment or scoring schemes) crucially determines whether
-people feel encouraged to be truthful or to game the system. The
-theories of VCG and Myerson give us optimal baselines for efficiency and
-revenue in auctions, while impossibility results like
-Gibbard--Satterthwaite warn us of the limitations in voting. Real-world
-implementations often have to grapple with complexity and approximate
-these ideals. While learning from individual human preference is a
-powerful approach, it too faces aggregation challenges. If the human
-feedback is inconsistent or if different annotators have different
-preferences, the reward model may end up capturing an average that
-satisfies no one perfectly. There is active research on scalable
-oversight: techniques to gather and aggregate human feedback on tasks
-that are too complex for any single person to evaluate reliably. This
-includes approaches like recursive reward modeling, iterated
-amplification (\citeproc{ref-christiano2018supervising}{Christiano,
-Shlegeris, and Amodei 2018}), and AI-assisted debate
-(\citeproc{ref-irving2018ai}{Irving, Christiano, and Amodei 2018}),
-where AI systems help humans provide better feedback or break down
-tasks. The goal of scalable oversight is to leverage human preferences
-and principles in guiding AI even as AI systems tackle increasingly
-complex or open-ended tasks, while mitigating the human burden and bias
-in evaluation.
-
-In summary, preference aggregation in machine learning spans from simple
-models like Bradley--Terry for pairwise comparisons to elaborate RLHF
-pipelines for training large models. The deep mathematical foundations
--- whether Arrow's theorem or Myerson's auction theory -- remind us that
-whenever we aggregate preferences or signals from multiple sources, we
-must consider incentive effects, fairness criteria, and the possibility
-of inconsistency. By combining insights from social choice, economics,
-and statistical learning, we aim to build AI systems that not only learn
-from human preferences but do so in a principled, robust, and fair
-manner. The next chapter will delve further into aligning AI with human
-values, building on the mechanisms and learning algorithms discussed
-here to ensure AI systems remain beneficial and in line with what people
-truly want.
-
-\subsection{Case Study 2: Incentive-Compatible Online
-Learning}\label{case-study-2-incentive-compatible-online-learning}
-
-To address this problem, we seek to create a model. We first outline the
-key criteria that our model must achieve. The model revolves around
-repeated interactions between a planner (the system) and multiple agents
-(the users). Each agent, upon arrival in the system, is presented with a
-set of available options to choose from. These options could vary widely
-depending on the application of the model, such as routes in a
-transportation network, a selection of hotels in a travel booking
-system, or even entertainment choices in a streaming service. The
-interaction process is straightforward but crucial: agents arrive,
-select an action from the provided options, and then report feedback
-based on their experience. This feedback is vital as it forms the basis
-upon which the planner improves and evolves its recommendations. The
-agents in this model are considered strategic; they aim to maximize
-their reward based on the information available to them. This aspect of
-the model acknowledges the real-world scenario where users are typically
-self-interested and seek to optimize their own outcomes. The planner, on
-the other hand, has a broader objective. It aims to learn which
-alternatives are best in a given context and works to maximize the
-overall welfare of all agents. This involves a complex balancing act:
-the planner must accurately interpret feedback from a diverse set of
-agents, each with their own preferences and biases, and use this
-information to refine and improve the set of options available. The
-ultimate goal of the planner is to create a dynamic, responsive system
-that not only caters to the immediate needs of individual agents but
-also enhances the collective experience over time, leading to a
-continually improving recommendation ecosystem.
-
-Here, we seek to address the inherent limitations faced by the planner,
-particularly in scenarios where monetary transfers are not an option,
-and the only tool at its disposal is the control over the flow of
-information between agents. This inquiry aims to understand the extent
-to which these limitations impact the planner's ability to effectively
-guide and influence agent behavior. A critical question is whether the
-planner can successfully induce exploration among agents, especially in
-the absence of financial incentives. This involves investigating
-strategies to encourage users to try less obvious or popular options,
-thus broadening the scope of feedback and enhancing the system's ability
-to learn and identify the best alternatives. Another question is
-understanding the rate at which the planner learns from agent
-interactions. This encompasses examining how different agent incentives,
-their willingness to explore, and their feedback impact the speed and
-efficiency with which the planner can identify optimal recommendations.
-
-The model can be extended in several directions, each raising its own
-set of questions.
-
-\begin{enumerate}
-\item Multiple Agents with Interconnected Payoffs: When multiple agents arrive simultaneously, their choices and payoffs become interconnected, resembling a game. The research question here focuses on how these interdependencies affect individual and collective decision-making.
-
-\item Planner with Arbitrary Objective Function: Investigating scenarios where the planner operates under an arbitrary objective function, which might not align with maximizing overall welfare or learning the best alternative.
-
-\item Observed Heterogeneity Among Agents: This involves situations where differences among agents are observable and known, akin to contextual bandits in machine learning. The research question revolves around how these observable traits can be used to tailor recommendations more effectively.
-
-\item Unobserved Heterogeneity Among Agents: This aspect delves into scenarios where differences among agents are not directly observable, necessitating the use of causal inference techniques to understand and cater to diverse user needs.
-\end{enumerate}
-
-In our setup, there is a ``planner,'' which aims to increase
-exploration, and many independent ``agents,'' which will act selfishly
-(in a way that they believe will maximize their individual reward)
-(\citeproc{ref-mansour2019bayesianincentivecompatiblebanditexploration}{Mansour,
-Slivkins, and Syrgkanis 2019};
-\citeproc{ref-mansour2021bayesianexplorationincentivizingexploration}{Mansour
-et al. 2021}). Under our model shown in Figure
-\hyperref[fig-planner-agent]{1.1}, there are \(K\) possible actions that
-all users can take, and each action has some mean reward
-\(\mu_i \in [0, 1]\). In addition, there is a common prior belief on
-each \(\mu_i\) across all users. The \(T\) agents, or users, will
-arrive sequentially. As the \(t\)'th user arrives, they are recommended
-an action \(I_t\) by the planner, which they are free to follow or not
-follow. After taking whichever action they choose, the user experiences
-some realized reward \(r_i \in [0, 1]\), which is stochastic i.i.d. with
-mean \(\mu_i\), and reports this reward back to the planner.
-
-So far, the model we have defined is equivalent to a multi-armed bandit
-model, which we have seen earlier in this chapter
-(\hyperref[4optim]{1}). Under this model, well-known results in
-economics, operations research and computer science show that
-\(O(\sqrt{T})\) regret is achievable
-(\citeproc{ref-russo2015informationtheoreticanalysisthompsonsampling}{Russo
-and Roy 2015}; \citeproc{ref-auer_cesa-bianchi_fischer_2002}{Auer,
-Cesa-Bianchi, and Fischer 2002}; \citeproc{ref-LAI19854}{Lai and Robbins
-1985}) with algorithms such as Thompson sampling and UCB. However, our
-agents are strategic and aim to maximize their own rewards. If they
-observe the rewards gained from actions taken by other previous users,
-they will simply take the action they believe will yield the highest
-reward given the previous actions; they would prefer to benefit from
-exploration done by other users rather than take the risk of exploring
-themselves. Therefore, exploration on an individual level, which the
-planner would like to facilitate, is not guaranteed under this paradigm.
-
-In light of this, we also require that our model satisfy incentive
-compatibility, or that taking the action recommended by the planner has
-an expected utility that is as high as any other action the agent could
-take. Formally,
-\(\forall i, i' : \, \mathbb{E}[\mu_i | I_t = i] \geq \mathbb{E}[\mu_{i'} | I_t = i].\) Note
-that this incentivizes the agents to actually take the actions
-recommended by the planner; if incentive compatibility is not satisfied,
-agents would simply ignore the planner and take whatever action they
-think will lead to the highest reward.
-
-At a high level, the key to achieving incentive compatibility while
-still creating a policy for the planner that facilitates exploration is
-information asymmetry. Under this paradigm, the users only have access
-to their previous recommendations, actions, and rewards, and not to the
-recommendations, actions, and rewards of other users. Therefore, they
-are unsure of whether, after other users take certain actions and
-receive certain rewards, arms that they might have initially considered
-worse in practice outperform arms that they initially considered better.
-Only the planner has access to the previous actions and rewards of all
-users; the user only has access to their own recommendations and overall
-knowledge of the planner's policy. The main question we aim to answer
-for the rest of this section is, given this new constraint of incentive
-compatibility, is \(O(\sqrt{T})\) regret still achievable? We illustrate
-such an algorithm in the following.
-
-The main result here is a black-box reduction algorithm to turn any
-bandit algorithm into an incentive compatible one, with only a constant
-increase in Bayesian regret. Since, as mentioned earlier, there are
-bandit algorithms with \(O(\sqrt{T})\) Bayesian regret, black-box
-reduction will also allow us to get incentive-compatible algorithms with
-\(O(\sqrt{T})\) regret. The idea of black-box reduction will be to
-simulate \(T\) steps of any bandit algorithm in an incentive-compatible
-way in \(c T\) steps. This allows us to design incentive-compatible
-recommendation systems by using any bandit algorithm and then adapting
-it. Consider the following setting: there are two possible actions,
-\(A_1\) and \(A_2\). Assume the setting of deterministic rewards, where
-action 1 has reward \(\mu_1\) with prior \(U[1/3, 1]\) and mean
-\(\mathbb{E}[\mu_1] = 2/3\), and action 2 has reward \(\mu_2\) with
-prior \(U[0, 1]\) and mean \(\mathbb{E}[\mu_2] = 1/2\). Without the
-planner intervention and with full observability, users would simply
-always pick \(A_1\), so how can the planner incentivize users to play
-\(A_2\)?
-
-The key insight is going to be to hide exploration in a pool of
-exploitation. The users are only going to receive a recommendation from
-the planner, and no other observations. After deterministically
-recommending the action with the highest expected reward (\(A_1\)), the
-planner will pick one guinea pig to recommend the exploratory action of
-\(A_2\). The users don't know whether they are the guinea pig, so
-intuitively, as long as the planner picks guinea pigs uniformly at
-random and at low enough frequencies, the optimal decision for the users
-is still to follow the planner's recommendation, even if it might go
-against their interest. The planner will pick the user who will be
-recommended the exploratory action uniformly at random from the \(L\)
-users that come after the first one (which deterministically gets
-recommended the exploitation action). Under this setting (illustrated in
-Figure \hyperref[fig-deterministic-guinea-pig]{1.2}), it is optimal for
-users to always follow the option that is recommended for them. More
-formally, if \(I_t\) is the recommendation that a user receives at time
-\(t\), then we have that:
-
-\[
-\begin{split}
-    \mathbb{E}[\mu_1 - \mu_2 | I_t = 2] \Pr[I_t = 2] &= \frac{1}{L} \mathbb{E}[\mu_1 - \mu_2] \quad \text{(Gains if you are the unlucky guinea pig)}\\
-    &+ \left(1 - \frac{1}{L}\right) \mathbb{E}[\mu_1 - \mu_2 | \mu_1 < \mu_2] \times \Pr[\mu_1 < \mu_2] \quad \text{(Loss if you are not and $\mu_1 < \mu_2$)}\\
-    &\leq 0
-\end{split}
-\]
-
-This holds when \(L \geq 12\). It means that the gains from not taking
-the recommended action are negative, which implies that users should
-always take the recommendation. So far we have considered the case where
-rewards are deterministic, but what about stochastic rewards? We are now
-going to consider the case where rewards are independent and identically
-distributed from some distribution, and where each action \(A_i\) has
-some reward distribution \(r_i^t \sim D_i, \mathbb{E}[r_i^t] = \mu_i\).
-Back to the case where there are only two actions, we are going to adapt
-the prior algorithm of guinea pig-picking to the stochastic reward
-setting. Since one reward observation is not enough to fully know
-\(\mu_1\) anymore, we'll instead observe the outcome of the first action
-\(M\) times to form a strong posterior
-\(\mathbb{E}[\mu_1 | r_1^1, \ldots, r_1^M]\). This is the approach we
-can use with stochastic rewards when there are two actions. As before,
-we pick one
-guinea pig uniformly at random from the next \(L\) users and use the
-reward we get as the exploratory signal. In a very similar manner, we
-can generalize this algorithm from always having two actions to the
-general multi-armed bandit problem. Now suppose we have a general
-multi-armed bandit algorithm \(A\). We will wrap our black-box
-reduction around this algorithm to make it incentive-compatible. We
-wrap every decision that \(A\) would make by exactly \(L-1\)
-recommendations of the action believed to be the best so far. This
-guarantees that the expected rewards for the users that are not chosen
-as guinea pigs are at least as good as \(A\)'s reward at phase \(n\).
-
-\section{Mutual Information Paradigm}\label{mutual-information-paradigm}
-
-In this section we discuss an influential new framework for designing
-peer prediction mechanisms, the Mutual Information Paradigm (MIP)
-introduced by Kong and Schoenebeck
-(\citeproc{ref-kongschoenebeck2019}{Kong and Schoenebeck 2019}).
-Traditional peer prediction approaches typically rely on scoring rules
-and correlation between agents' signals. However, these methods often
-struggle with issues like uninformed equilibria, where agents can
-coordinate on uninformative strategies that yield higher payoffs than
-truth-telling. The core idea of MIP is to reward agents based on the
-mutual information between their report and the reports of other
-agents. We
-consider a setting with \(n\) agents, each possessing a private signal
-\(\Psi_i\) drawn from some set \(\Sigma\). The mechanism asks each agent
-to report their signal, which we denote as \(\hat{\Psi}_i\). For each
-agent \(i\), the mechanism randomly selects a reference agent
-\(j \neq i\). Agent \(i\)'s payment is then calculated as
-\(MI(\hat{\Psi}_i; \hat{\Psi}_j)\) where \(MI\) is an
-information-monotone mutual information measure. An information-monotone
-\(MI\) measure must satisfy the following properties:
-
-\begin{itemize}
-\item
-  Symmetry: \(MI(X; Y) = MI(Y; X)\).
-\item
-  Non-negativity: \(MI(X; Y) \geq 0\), with equality if and only if
-  \(X\) and \(Y\) are independent.
-\item
-  Data processing inequality: For any transition probability \(M\), if
-  \(Y\) is independent of \(M(X)\) conditioned on \(X\), then
-  \(MI(M(X); Y) \leq MI(X; Y)\).
-\end{itemize}
-
-Two important families of mutual information measures that satisfy these
-properties are \(f\)-mutual information and Bregman mutual information.
-The \(f\)-mutual information is defined as
-\(MI_f(X; Y) = D_f(U_{X,Y}, V_{X,Y})\), where \(D_f\) is an
-\(f\)-divergence, \(U_{X,Y}\) is the joint distribution of \(X\) and
-\(Y\), and \(V_{X,Y}\) is the product of their marginal distributions.
-The Bregman mutual information is defined as:
-\(BMI_{PS}(X; Y) = \mathbb{E}_{X} [D_{PS}(U_{Y|X}, U_Y)]\), where
-\(D_{PS}\) is a Bregman divergence based on a proper scoring rule
-\(PS\), \(U_{Y|X}\) is the conditional distribution of \(Y\) given
-\(X\), and \(U_Y\) is the marginal distribution of \(Y\). The MIP
-framework can be applied in both single-question and multi-question
-settings. In the multi-question setting, the mechanism can estimate the
-mutual information empirically from multiple questions. In the
-single-question setting, additional techniques like asking for
-predictions about other agents' reports are used to estimate the mutual
-information. A key theoretical result of the MIP framework is that when
-the chosen mutual information measure is strictly information-monotone
-with respect to agents' priors, the resulting mechanism is both
-dominantly truthful and strongly truthful. This means that truth-telling
-is a dominant strategy for each agent and that the truth-telling
-equilibrium yields strictly higher payoffs than any other
-non-permutation strategy profile. As research continues to address
-practical implementation challenges of designing truthful mechanisms,
-MIP-based approaches have significant potential to improve preference
-elicitation and aggregation in real-world applications lacking
-verifiable ground truth.
-
-\section{Exercises}\label{exercises-3}
-
-\subsection{Question 1: Pairwise Feedback Mechanisms for Digital
-Goods}\label{question-1-pairwise-feedback-mechanisms-for-digital-goods}
-
-Consider a marketplace for digital goods (such as personalized articles,
-artwork, or AI-generated data), where the exact utility derived from
-these goods is only revealed to buyers after the goods have been
-generated and delivered. To elicit truthful preferences from buyers who
-find it difficult to precisely quantify their valuations beforehand, the
-marketplace implements a pairwise feedback mechanism, inspired by the
-work of Robertson and Koyejo (2023).
-
-Formally, each buyer requests a personalized digital good and, upon
-receiving the good, provides feedback by indicating whether their
-realized utility is higher or lower than a randomly chosen reference
-price \(c \in [0,1]\). The mechanism utilizes this binary feedback to
-estimate valuations and allocate future goods accordingly.
-
-Answer the following:
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\item
-  \textbf{Formalize the Problem:} Let \(u_i\) denote the true valuation
-  of buyer \(i\), and let \(r_i(c)\) denote the buyer's reported
-  feedback (\(r_i(c) = 1\) if \(u_i \geq c\), 0 otherwise). Prove that,
-  under uniform random selection of the reference price \(c\), the
-  expected value \(\mathbb{E}[r_i(c)]\) is equal to the true valuation
-  \(u_i\).
-\item
-  \textbf{Incentive Compatibility Analysis:} Discuss conditions under
-  which this feedback-based mechanism is incentive compatible, i.e.,
-  buyers have no incentive to misreport their preferences. Specifically,
-  analyze why strategic misreporting (reporting \(r_i(c)\) incorrectly
-  for some reference prices) would not increase a buyer's expected
-  payoff.
-\item
-  \textbf{Regret Analysis:} Suppose the mechanism estimates buyers'
-  utilities from past feedback and allocates future goods using an
-  epsilon-greedy strategy (exploration rate \(\eta_t\)). Provide an
-  informal discussion of the trade-off involved in choosing the
-  exploration rate, and how it affects the social welfare and revenue of
-  the marketplace over time.
-\item
-  \textbf{Practical Implications:} Suggest one practical scenario
-  outside the digital-goods marketplace where such a feedback-driven,
-  pairwise comparison approach would be beneficial. Briefly justify your
-  choice, mentioning challenges and benefits.
-\end{enumerate}
-
-\subsection{Question 2: Scalable Oversight in Complex
-Decision-Making}\label{question-2-scalable-oversight-in-complex-decision-making}
-
-In scenarios involving complex or high-dimensional outcomes (such as
-summarizing lengthy texts, assessing the quality of detailed
-AI-generated reports, or reviewing scientific papers), evaluating the
-quality of outputs can become infeasible for a single human overseer.
-One practical solution is scalable oversight, where the evaluation task
-is decomposed and distributed among multiple human evaluators or even
-assisted by AI agents. Consider a scalable oversight scenario inspired
-by recursive reward modeling, where complex evaluations are
-hierarchically decomposed into simpler tasks. Specifically, suppose you
-want to evaluate a lengthy report generated by an AI system. Answer the
-following:
-
-\textbf{(a) Decomposition of the Task:} Propose a formal recursive
-decomposition strategy to evaluate a long AI-generated report of length
-\(N\) paragraphs. Specifically, describe a hierarchical evaluation method
-that decomposes the original evaluation into simpler subtasks at
-multiple hierarchical levels. Clearly describe how many subtasks you
-have at each level and how the final aggregated evaluation is computed.
-
-\textbf{(b) Statistical Aggregation Method:} Suppose each evaluation
-subtask yields a binary score \(s_i \in \{0,1\}\), where \(1\) indicates
-acceptable quality and \(0\) indicates unacceptable quality. Propose a
-simple statistical aggregation method (e.g., majority voting, threshold
-voting, weighted aggregation, etc.) to combine subtask evaluations into
-a single global quality assessment at the top level. Justify your choice
-mathematically.
-
-\textbf{(c) Computational Simulation:} Implement a Python simulation of
-your hierarchical decomposition and aggregation method described in
-parts (a) and (b). Assume each subtask is evaluated with some fixed
-probability \(p\) of being correct (representing human evaluators with
-bounded accuracy).
-
-Specifically, your simulation should:
-
-\begin{itemize}
-\item
-  Implement a hierarchical evaluation scheme (e.g., binary-tree
-  decomposition).
-\item
-  Assume evaluators have accuracy \(p = 0.8\) (i.e., probability of
-  correctly identifying paragraph quality).
-\item
-  Simulate how evaluator accuracy at the leaf nodes affects the
-  reliability of the global evaluation at the root node.
-\item
-  Plot how the reliability of the top-level evaluation (accuracy at the
-  root) varies as you increase the depth of hierarchy for a report of
-  fixed length (e.g., \(N = 64\) paragraphs).
-\end{itemize}
-
-\textbf{(d) Practical Discussion:} Briefly discuss advantages and
-potential drawbacks of scalable oversight approaches such as recursive
-decomposition in the context of AI alignment. Include considerations
-such as evaluator fatigue, consistency, cost, and vulnerability to
-manipulation or collusion.
-
-\section*{References}\label{bibliography-5}
-\addcontentsline{toc}{section}{References}
-
-\markright{References}
-
-\phantomsection\label{refs-5}
-\begin{CSLReferences}{1}{0}
-\bibitem[\citeproctext]{ref-arrow1951}
-Arrow, Kenneth J. 1951. \emph{Social Choice and Individual Values}. John
-Wiley \& Sons.
-
-\bibitem[\citeproctext]{ref-auer_cesa-bianchi_fischer_2002}
-Auer, Peter, Nicolò Cesa-Bianchi, and Paul Fischer. 2002. {``Finite-Time
-Analysis of the Multiarmed Bandit Problem.''} \emph{Machine Learning} 47
-(2). \url{https://doi.org/10.1023/A:1013689704352}.
-
-\bibitem[\citeproctext]{ref-bartholdi1989}
-Bartholdi, John J., Craig A. Tovey, and Michael A. Trick. 1989. {``The
-Computational Difficulty of Manipulating an Election.''} \emph{Social
-Choice and Welfare} 6 (3): 227--41.
-
-\bibitem[\citeproctext]{ref-bulow-klemperer1996}
-Bulow, Jeremy, and Paul Klemperer. 1996. {``Auctions Versus
-Negotiations.''} \emph{The American Economic Review} 86 (1): 180--94.
-\url{http://www.jstor.org/stable/2118262}.
-
-\bibitem[\citeproctext]{ref-christiano2018supervising}
-Christiano, Paul, Buck Shlegeris, and Dario Amodei. 2018. {``Supervising
-Strong Learners by Amplifying Weak Experts.''} \emph{arXiv Preprint
-arXiv:1810.08575}.
-
-\bibitem[\citeproctext]{ref-gibbard1973}
-Gibbard, Allan. 1973. {``Manipulation of Voting Schemes: A General
-Result.''} \emph{Econometrica} 41 (4): 587--601.
-
-\bibitem[\citeproctext]{ref-gordon2022jury}
-Gordon, Noah J., Vaishnavh Nagarajan Shankar, Shi Feng, Yejin Choi, and
-Noah A. Smith. 2022. {``Jury Learning: Integrating Dissenting Voices
-into Machine Learning Models.''} In \emph{Proceedings of the 2022
-Conference on Empirical Methods in Natural Language Processing (EMNLP)},
-2658--73. Association for Computational Linguistics.
-
-\bibitem[\citeproctext]{ref-jasonH2020}
-Hartline, Jason D., Yingkai Li, Liren Shan, and Yifan Wu. 2020.
-{``Optimization of Scoring Rules.''} \emph{CoRR} abs/2007.02905.
-\url{https://arxiv.org/abs/2007.02905}.
-
-\bibitem[\citeproctext]{ref-jasonH2023}
-Hartline, Jason D., Liren Shan, Yingkai Li, and Yifan Wu. 2023.
-{``Optimal Scoring Rules for Multi-Dimensional Effort.''} In
-\emph{Proceedings of Thirty Sixth Conference on Learning Theory}, edited
-by Gergely Neu and Lorenzo Rosasco, 195:2624--50. Proceedings of Machine
-Learning Research. PMLR.
-\url{https://proceedings.mlr.press/v195/hartline23a.html}.
-
-\bibitem[\citeproctext]{ref-irving2018ai}
-Irving, Geoffrey, Paul Christiano, and Dario Amodei. 2018. {``AI Safety
-via Debate.''} \emph{arXiv Preprint arXiv:1805.00899}.
-
-\bibitem[\citeproctext]{ref-kongschoenebeck2019}
-Kong, Yuqing, and Grant Schoenebeck. 2019. {``An Information Theoretic
-Framework for Designing Information Elicitation Mechanisms That Reward
-Truth-Telling.''} \emph{ACM Trans. Econ. Comput.} 7 (1).
-\url{https://doi.org/10.1145/3296670}.
-
-\bibitem[\citeproctext]{ref-LAI19854}
-Lai, T. L., and Herbert Robbins. 1985. {``Asymptotically Efficient
-Adaptive Allocation Rules.''} \emph{Advances in Applied Mathematics} 6
-(1): 4--22.
-\url{https://doi.org/10.1016/0196-8858(85)90002-8}.
-
-\bibitem[\citeproctext]{ref-mansour2019bayesianincentivecompatiblebanditexploration}
-Mansour, Yishay, Aleksandrs Slivkins, and Vasilis Syrgkanis. 2019.
-{``Bayesian Incentive-Compatible Bandit Exploration.''}
-\url{https://arxiv.org/abs/1502.04147}.
-
-\bibitem[\citeproctext]{ref-mansour2021bayesianexplorationincentivizingexploration}
-Mansour, Yishay, Aleksandrs Slivkins, Vasilis Syrgkanis, and Zhiwei
-Steven Wu. 2021. {``Bayesian Exploration: Incentivizing Exploration in
-Bayesian Games.''} \url{https://arxiv.org/abs/1602.07570}.
-
-\bibitem[\citeproctext]{ref-myerson1981}
-Myerson, Roger B. 1981. {``Optimal Auction Design.''} \emph{Mathematics
-of Operations Research} 6 (1): 58--73.
-
-\bibitem[\citeproctext]{ref-russo2015informationtheoreticanalysisthompsonsampling}
-Russo, Daniel, and Benjamin Van Roy. 2015. {``An Information-Theoretic
-Analysis of Thompson Sampling.''} \url{https://arxiv.org/abs/1403.5341}.
-
-\bibitem[\citeproctext]{ref-satterthwaite1975}
-Satterthwaite, Mark Allen. 1975. {``Strategy-Proofness and Arrow's
-Conditions: Existence and Correspondence Theorems for Voting Procedures
-and Social Welfare Functions.''} \emph{Journal of Economic Theory} 10
-(2): 187--217.
-
-\bibitem[\citeproctext]{ref-vickrey1961}
-Vickrey, William. 1961. {``Counterspeculation, Auctions, and Competitive
-Sealed Tenders.''} \emph{Journal of Finance} 16 (1): 8--37.
-
-\end{CSLReferences}
-
-\bookmarksetup{startatroot}
-
-\chapter{Alternatives}\label{alternatives}
-
-In recent years, the rapidly advancing capabilities of large models have
-led to increased discussion of aligning AI systems with human values.
-This chapter discusses the multifaceted relationship between values,
-alignment, and human-centered design in the context of AI. We begin by
-exploring the fundamental concept of human values and their ethical
-implications in AI design. This includes discussions on human values and
-ethics in AI, understanding and addressing bias in AI, and methods for
-aligning AI with human values. Additionally, we examine AI alignment
-problems, focusing on outer alignment to avoid specification gaming and
-inner alignment to prevent goal misgeneralization. Next, we cover
-techniques in value learning. This section introduces methodologies such
-as reinforcement learning from human feedback and contrastive preference
-learning, which are crucial for teaching AI systems to understand and
-align with human values. The importance of value alignment verification
-is emphasized to ensure that AI systems remain consistent with human
-values over time, adapting to changes and preventing misalignment. We
-then explore the principles and practices of human-centered design. This
-includes discussions on AI and human-computer interaction and methods
-for designing AI for positive human impact, which focuses on creating AI
-systems that are socially aware, human-centered, and positively
-impactful. A crucial part of this discussion is adaptive user
-interfaces, where we discuss key ideas, design principles, applications,
-and limitations of these interfaces, showcasing how they enhance user
-experience by dynamically adjusting to user needs and preferences.
-Finally, we present case studies in human-centered AI, including the
-LaMPost case study, Multi-Value and DaDa (cross-dialectal English NLP),
-and social skill training via LLMs. These case studies provide
-real-world examples of successful implementations of human-centered AI
-systems. By integrating these elements, the chapter aims to provide a
-comprehensive understanding of how to create AI systems that are
-ethical, aligned with human values, and beneficial to society.
-
-\section{Human Values and AI
-Alignment}\label{human-values-and-ai-alignment}
-
-In this part, we take a step back from the technical details to reflect
-on the broader concept of human values and their profound influence on
-our behavior and decision-making.
-
-\subsection{Human Values and Ethics in
-AI}\label{human-values-and-ethics-in-ai}
-
-Human values are the principles and standards that guide behavior and
-decision-making, reflecting what is essential in life and influencing
-choices and actions. One notable scholar in this field is Shalom H.
-Schwartz, a social psychologist renowned for his theory on basic human
-values. Schwartz's work has significantly contributed to our
-understanding of how values influence behavior across different
-cultures. He describes values as ``desirable, trans-situational goals,
-varying in importance, that serve as guiding principles in people's
-lives'' (\citeproc{ref-schwartz1992universals}{Schwartz 1992}). This
-perspective underscores the importance of values in shaping consistent
-and ethical behavior across different contexts. Supporting this view,
-philosopher William K. Frankena emphasizes the integral role of values
-in ethical behavior and decision-making processes. Frankena's work in
-ethical theory provides a foundation for understanding how moral
-judgments are formed. He notes that ``ethical theory is concerned with
-the principles and concepts that underlie moral judgments''
-(\citeproc{ref-frankena1973ethics}{Frankena 1973}), highlighting the
-need to comprehend ethical principles deeply to make informed moral
-judgments. Examples of ethical values include autonomy, fairness,
-justice, and well-being. For computer scientists developing AI systems,
-understanding these concepts is crucial. AI systems that interact with
-humans and impact societal structures must be designed with these values
-in mind. By embedding such values into AI, developers can create systems
-that respect human dignity and promote positive social outcomes.
-
-\begin{itemize}
-\item
-  Autonomy is the right to choose, an essential aspect of personal
-  freedom. Gerald Dworkin defines autonomy as ``the capacity to reflect
-  upon and endorse or reject one's desires and values''
-  (\citeproc{ref-dworkin1988theory}{Dworkin 1988}). In AI, respecting
-  autonomy means creating systems that support user independence and
-  decision-making rather than manipulating or coercing them.
-\item
-  Fairness involves treating all individuals equally and justly,
-  ensuring no discrimination. John Rawls, one of the most influential
-  political philosophers of the 20th century, in his
-  groundbreaking book ``A Theory of Justice,'' describes fairness as
-  ``the elimination of arbitrary distinctions and the establishment of a
-  balance between competing claims''
-  (\citeproc{ref-rawls1971theory}{Rawls 1971}). For AI systems, this
-  translates to algorithms that do not perpetuate bias or inequality,
-  ensuring that all users are treated equitably.
-\item
-  Justice is about upholding what is morally right and ensuring fair
-  treatment for all. Rawls also highlights that ``justice is the first
-  virtue of social institutions, as truth is of systems of thought''
-  (\citeproc{ref-rawls1971theory}{Rawls 1971}). In the context of AI,
-  justice involves creating technologies that enhance fairness in legal,
-  social, and economic systems, providing equal opportunities and
-  protection to all individuals.
-\end{itemize}
-
-Well-being focuses on promoting the health, happiness, and prosperity of
-individuals. Martha Nussbaum and Amartya Sen, two distinguished scholars
-known for their significant contributions to welfare economics and the
-development of the capability approach, discuss the importance of
-well-being in their collaborative work ``The Quality of Life.'' They
-argue that ``well-being is about the expansion of the capabilities of
-people to lead the kind of lives they value''
-(\citeproc{ref-nussbaum1993quality}{Nussbaum and Sen 1993}). AI systems
-should enhance users' quality of life, supporting their health,
-education, and economic stability.
-
-Understanding human values is foundational for readers with a computer
-science background before delving into AI ethics. These values provide
-the ethical underpinnings necessary to design and deploy AI systems
-responsibly. As AI systems increasingly impact all aspects of society,
-developers must embed these values into their work to ensure
-technologies benefit humanity and do not exacerbate existing
-inequalities.
-
-Human values play a crucial role in decision-making by shaping the
-criteria for evaluating options and outcomes. They influence priorities
-and ethical considerations, guiding individuals and organizations to
-make choices that align with their principles. Nick Bostrom, a prominent
-philosopher in AI and existential risk, highlights the importance of
-values in setting priorities and determining desirable outcomes
-(\citeproc{ref-bostrom2014superintelligence}{Bostrom 2014}). Aligning
-actions with values ensures consistency and ethical integrity in
-decision-making. Incorporating human values into AI systems ensures that
-AI decisions align with societal norms and ethical standards. Stuart
-Russell, an AI researcher and advocate for human-compatible AI, stresses
-the importance of embedding human values into AI systems to ensure they
-act in beneficial and ethical ways
-(\citeproc{ref-russell2019human}{Russell 2019}). By integrating values
-such as fairness, justice, and well-being, AI systems can make decisions
-that reflect societal expectations and ethical considerations.
-
-Examples of incorporating values into AI systems demonstrate the
-practical application of these principles. For instance, autonomous
-vehicles are programmed to prioritize human safety, ensuring decisions
-that protect lives. In healthcare, AI systems uphold values by
-safeguarding patient privacy and ensuring informed consent, adhering to
-ethical medical standards. Judicial AI systems aim to eliminate biases
-in sentencing recommendations, promoting fairness and justice. Luciano
-Floridi underscores the necessity for AI systems to be designed in a way
-that respects and upholds human values to function ethically and
-effectively (\citeproc{ref-floridi2011ethics}{Floridi 2011}).
-
-To ensure that these values are systematically embedded within AI
-systems, it is essential to consider major ethical frameworks such as
-deontological, consequentialist, and virtue ethics that guide moral
-decision-making.
-
-Deontological ethics, primarily associated with the philosopher Immanuel
-Kant, focuses on rules and duties. This ethical framework posits that
-actions are morally right if they adhere to established rules and
-duties, regardless of the outcomes. Kant's moral philosophy emphasizes
-the importance of duty and adherence to moral laws. Robert Johnson, a
-scholar who has extensively studied Kantian ethics, explains that
-``Kant's moral philosophy emphasizes that actions must be judged based
-on their adherence to duty and moral law, not by their consequences''
-(\citeproc{ref-johnson_kants_2022}{Johnson and Cureton 2022}). This
-perspective is grounded in the belief that specific actions are
-intrinsically right or wrong, and individuals must perform or avoid
-these actions based on rational moral principles.
-
-In the context of AI, deontological ethics implies that AI systems
-should be designed to follow ethical rules and principles. For instance,
-AI systems must respect user privacy and confidentiality as an
-inviolable duty. This approach ensures that AI technologies do not
-infringe on individuals' rights, regardless of potential benefits.
-Implementing deontological principles in AI design can prevent ethical
-breaches, such as unauthorized data usage or surveillance. By adhering
-to established moral guidelines, AI systems can maintain ethical
-integrity and avoid actions that would be considered inherently wrong.
-As Floridi states, ``AI systems should be developed with a commitment to
-uphold moral duties and respect human dignity''
-(\citeproc{ref-floridi2011ethics}{Floridi 2011}).
-
-Consequentialist ethics, in contrast, evaluates the morality of actions
-based on their outcomes. The most well-known form of consequentialism is
-utilitarianism, articulated by philosophers like Jeremy Bentham and John
-Stuart Mill. This ethical theory suggests that actions are morally right
-if they promote the greatest happiness for the greatest number. Mill
-emphasizes that ``the moral worth of an action is determined by its
-contribution to overall utility, measured by the happiness or well-being
-it produces'' (\citeproc{ref-mill_utilitarianism_1863}{Mill 1863}).
-Consequentialist ethics is pragmatic, focusing on the results of actions
-rather than the actions themselves.
-
-Applying consequentialist ethics to AI development involves designing AI
-systems to achieve beneficial outcomes. This means prioritizing positive
-societal impacts, such as improving healthcare outcomes, enhancing
-public safety, or reducing environmental harm. For instance, algorithms
-can be designed to optimize resource allocation in disaster response,
-thereby maximizing the overall well-being of affected populations. In
-this framework, the ethicality of AI decisions is judged by their
-ability to produce desirable consequences. Virginia Dignum, a professor
-of responsible artificial intelligence at Umeå University, explains that
-``designing algorithms with a focus on maximizing positive outcomes can
-lead to more ethical and effective AI systems''
-(\citeproc{ref-dignum_responsible_2019}{Dignum 2019}). Consequently, AI
-developers focus on the potential impacts of their technologies and
-strive to enhance their beneficial effects.
-
-Virtue ethics, originating from the teachings of Aristotle, emphasizes
-the importance of character and virtues in ethical behavior. This
-framework posits that ethical behavior arises from developing good
-character traits and living a virtuous life. Aristotle, an ancient Greek
-philosopher and the author of ``Nicomachean Ethics,'' argues that
-``virtue is about cultivating excellence in character to achieve
-eudaimonia or human flourishing''
-(\citeproc{ref-aristotle_nicomachean_350}{Aristotle 350 B.C.E.}). Virtue
-ethics focuses on the individual's character and the moral qualities
-that define a good person, such as honesty, courage, and compassion.
-
-Additionally, virtue ethics encourages the development and use of AI
-systems that promote virtuous behavior. This involves fostering
-transparency, accountability, and fairness in AI technologies. For
-example, AI systems should be designed to provide clear and
-understandable explanations for their decisions, promoting transparency
-and building user trust. Furthermore, AI developers should strive to
-create technologies that support ethical practices and enhance the
-common good. Floridi emphasizes that ``virtue ethics in AI development
-requires a commitment to fostering moral virtues and promoting human
-well-being'' (\citeproc{ref-floridi2011ethics}{Floridi 2011}). By
-focusing on the character and virtues of AI developers and AI systems,
-virtue ethics provides a holistic approach to ethical AI development.
-
-Applying these ethical frameworks to AI development is essential to
-ensure that AI systems operate ethically and responsibly. Deontological
-ethics in AI involves ensuring that AI follows ethical rules and
-principles. For instance, AI systems should be designed to respect user
-privacy and confidentiality. Consequentialist ethics focuses on
-developing AI to achieve beneficial outcomes. This means creating
-algorithms prioritizing positive societal impacts, such as improving
-healthcare outcomes or reducing environmental harm. Virtue ethics
-encourages virtuous behavior in AI development and use, promoting
-transparency, accountability, and fairness. Floridi emphasizes that
-``ethical AI development requires a commitment to core moral principles
-and virtues'' (\citeproc{ref-floridi2011ethics}{Floridi 2011}).
-
-Examples in practice demonstrate how these frameworks can be applied to
-guide ethical AI development. Implementing fairness constraints in
-machine learning models ensures that algorithms do not discriminate
-against certain groups. Binns notes that ``fairness in machine learning
-can be informed by lessons from political philosophy to create more just
-and equitable systems'' (\citeproc{ref-binns_fairness_2018}{Binns
-2018}). Designing algorithms that maximize overall well-being aligns
-with consequentialist ethics by focusing on the positive outcomes of AI
-deployment. Additionally, developing AI systems focusing on transparency
-and accountability supports virtue ethics by fostering trust and
-reliability in AI technologies.
-
-Ethical principles provide a framework for ensuring that AI operates in
-ways that are fair, just, and beneficial. Deontological ethics, for
-instance, focuses on moral rules and obligations, while consequentialism
-considers the outcomes of actions. By embedding these ethical principles
-into AI design, we can create systems that respect human dignity and
-promote societal well-being.
-
-\subsection{Bias in AI}\label{bias-in-ai}
-
-Bias in AI refers to systematic errors that result in unfair outcomes.
-These biases can occur at various stages of AI system development and
-deployment, leading to significant ethical and practical concerns.
-Addressing bias in AI is crucial because it directly impacts the
-fairness, accountability, and trustworthiness of AI systems. Barocas,
-Hardt, and Narayanan emphasize that ``bias in machine learning can lead
-to decisions that systematically disadvantage certain groups''
-(\citeproc{ref-barocas_fairness_2019}{Barocas, Hardt, and Narayanan
-2019}). O'Neil further highlights the societal impact of biased AI,
-noting that ``algorithms can perpetuate and amplify existing
-inequalities, leading to a cycle of discrimination''
-(\citeproc{ref-oneil_weapons_2016}{O'Neil 2016}). Therefore,
-understanding and mitigating bias is essential for developing ethical AI
-systems that promote fairness and equity.
-
-Data bias originates from skewed or non-representative data used to
-train AI models. This bias often reflects historical prejudices and
-systemic inequalities in the data. For example, if a hiring algorithm is
-trained on historical hiring data that reflects gender or racial biases,
-it may perpetuate these biases in its recommendations. Ninareh Mehrabi
-and her colleagues, in their survey on bias in AI, state that ``data
-bias can result from sampling bias, measurement bias, or historical
-bias, each contributing to the unfairness of AI systems''
-(\citeproc{ref-mehrabi_survey_2021}{Mehrabi et al. 2021}). Safiya Umoja
-Noble, author of ``Algorithms of Oppression,'' discusses how biased data
-in search engines can reinforce stereotypes and marginalize certain
-groups, noting that ``search algorithms often reflect the biases of the
-society they operate within''
-(\citeproc{ref-noble_algorithms_2018}{Noble 2018}). Addressing data bias
-involves careful collection, preprocessing, and validation to ensure
-diversity and representation.
-
-An effort to address data bias is the ``Lab in the Wild'' platform,
-which seeks to broaden the scope of Human-Computer Interaction (HCI)
-studies beyond the traditional ``WEIRD'' (Western, Educated,
-Industrialized, Rich, and Democratic) population
-(\citeproc{ref-oliveira17}{\textbf{oliveira17?}}). Paulo S. Oliveira,
-one of the platform's researchers, notes that this initiative aims to
-correct demographic skew in behavioral science research by engaging a
-diverse global audience. By allowing individuals from various
-demographics to participate in studies from their environments, ``Lab in
-the Wild'' provides researchers with a more inclusive dataset.
-
-Another important consideration is the cultural nuances of potential
-users. For instance, designing a computer vision system that describes
-objects and people in daily life requires deciding whether to identify gender. In
-the United States, there is growing sensitivity toward gender identity,
-suggesting that excluding gender might be prudent. Conversely, in India,
-where a visually impaired woman may need gender-specific information for
-safety, including gender identification is critical. Ayanna Howard, a
-roboticist and AI researcher at Georgia Tech, emphasizes the need for
-adaptable systems that respect local customs and address specific user
-needs in her work on human-robot interaction. The contrast between these
-two settings shows that a single fixed design choice cannot serve every
-deployment context.
-
-Algorithmic bias often arises from the design and implementation choices
-made by developers. This type of bias can stem from the mathematical
-frameworks and assumptions underlying the algorithms. For instance,
-decision trees and reinforcement learning policies can inadvertently
-prioritize certain outcomes, resulting in biased results. Solon Barocas,
-a professor at Cornell University, and his colleagues explain that
-``algorithmic bias can emerge from optimization objectives that do not
-adequately consider fairness constraints''
-(\citeproc{ref-barocas_fairness_2019}{Barocas, Hardt, and Narayanan
-2019}). Cathy O'Neil, a data scientist who has written extensively on
-the societal impacts of algorithms, provides examples of how biased
-algorithms in predictive policing and credit scoring can
-disproportionately affect disadvantaged communities. She argues that
-``algorithmic decisions can have far-reaching consequences when fairness
-is not adequately addressed'' (\citeproc{ref-oneil_weapons_2016}{O'Neil
-2016}). Mitigating algorithmic bias requires incorporating fairness
-constraints and regularly auditing algorithmic decisions.
-
-Weidinger et al., in their 2022 study published in ``Artificial
-Intelligence,'' investigate how reinforcement learning (RL) algorithms
-can replicate or amplify biases present in training data or algorithmic
-design (\citeproc{ref-weidinger_artificial_2022}{Weidinger, Reinecke,
-and Haas 2022}). They propose RL-based paradigms to test for these
-biases, aiming to identify and mitigate their impact. Similarly, Mazeika
-et al., in their research on modeling emotional dynamics from video
-data, explore how algorithms might prioritize certain emotional
-expressions or demographics based on their training and data usage
-(\citeproc{ref-mazeika_how_2022}{Mazeika et al. 2022}). Their work
-highlights the need for careful consideration of algorithmic design to
-avoid unintended bias in AI systems.
-
-\subsection{Aligning AI with Human
-Values}\label{aligning-ai-with-human-values}
-
-Aligning AI systems with human values presents several significant
-challenges. Human values are multifaceted and context-dependent, making
-them difficult to encode into AI systems. As Bostrom highlights, ``the
-complexity of human values means that they are not easily reducible to
-simple rules or objectives''
-(\citeproc{ref-bostrom2014superintelligence}{Bostrom 2014}).
-Additionally, values can evolve, requiring AI systems to adapt. Russell
-notes that ``the dynamic nature of human values necessitates continuous
-monitoring and updating of AI systems to ensure ongoing alignment''
-(\citeproc{ref-russell2019human}{Russell 2019}). Different stakeholders
-may also have conflicting values, posing a challenge for AI alignment.
-Addressing these conflicts requires a nuanced approach to balance
-diverse perspectives and priorities.
-
-What is the right way to represent values? In a Reinforcement Learning
-(RL) paradigm, one might ask: at what level should we model rewards?
-Many people are trying to use language. In Constitutional AI
-(\citeproc{ref-bai_constitutional_2022}{Bai et al. 2022}), we write down
-the rules we want a language model to follow or apply reinforcement
-learning from human feedback, discussed in the next section. Many
-problems have been framed in an RL setting. Some experts in
-reinforcement learning argue that a single scalar reward is not enough
-(\citeproc{ref-vamplew_human-aligned_2018}{Vamplew et al. 2018},
-\citeproc{ref-vamplew_scalar_2022}{2022}). They suggest a vectorized
-reward approach might better emulate the emotional-like system humans
-have (\citeproc{ref-moerland_emotion_2018}{Moerland, Broekens, and
-Jonker 2018}). With this robustness, we might capture all the dimensions
-of human values. These approaches are still in the early stages.
-Language does play a crucial role in human values. Tomasello
-(\citeproc{ref-tomasello_becoming_2019}{Tomasello 2019}) argues that
-learning a language and the awareness of convention it brings help
-children understand their cultural group and reason about it with peers.
-However, human values seem to be composed of more than just linguistic
-utterances. Several strategies have been proposed to align AI systems
-with human values.
-
-\begin{itemize}
-\item
-  One effective approach is value-sensitive design, which considers
-  human values from the outset of the design process. Friedman, Kahn,
-  and Borning explain that ``value-sensitive design integrates human
-  values into the technology design process to ensure that the resulting
-  systems support and enhance human well-being''
-  (\citeproc{ref-friedman_value_2008}{Friedman, Kahn, and Borning
-  2008}).
-\item
-  Another strategy is participatory design, which engages stakeholders
-  in the design process to ensure their values are reflected in the AI
-  system. Muller emphasizes that ``participatory design creates a
-  collaborative space where diverse stakeholders can contribute their
-  perspectives and values, leading to more inclusive and ethical AI
-  systems'' (\citeproc{ref-muller_participatory_2003}{Muller 2003}).
-  Additionally, iterative testing and feedback allow continuous
-  refinement of AI systems based on user feedback, ensuring they remain
-  aligned with human values over time. Practical examples of value
-  alignment in AI systems demonstrate how these strategies can be
-  implemented effectively.
-\end{itemize}
-
-In autonomous vehicles, ensuring safety and ethical decision-making in
-critical scenarios is paramount. These vehicles must make real-time
-decisions that prioritize human safety above all else. Goodall discusses
-how ``Waymo's safety protocols are designed to prioritize human safety
-and ethical considerations in autonomous driving''
-(\citeproc{ref-goodall_machine_2014}{Goodall 2014}). These protocols
-include extensive testing and validation processes to ensure that
-autonomous driving algorithms handle various scenarios ethically and
-safely. For example, the system must decide how to react in an
-unavoidable collision, weighing the potential outcomes to minimize harm.
-By embedding these ethical considerations into their design and
-operation, companies like Waymo aim to align their AI systems with
-societal values of safety and responsibility.
-
-In healthcare AI, respecting patient privacy and ensuring informed
-consent are crucial. Healthcare applications often involve sensitive
-personal data, and AI systems must handle this information with the
-utmost care. Jiang et al.~highlight how ``IBM Watson for Oncology
-incorporates patient privacy protections and informed consent processes
-to align with ethical standards in medical practice''
-(\citeproc{ref-jiang_artificial_2017}{F. Jiang et al. 2017}). IBM Watson
-for Oncology uses AI to assist in diagnosing and recommending treatments
-for cancer patients. To align with ethical standards, the system ensures
-that patients are fully informed about how their data will be used and
-that their consent is obtained before processing their information. This
-approach protects patient privacy and builds trust between patients and
-healthcare providers, demonstrating a commitment to ethical medical
-practices.
-
-Judicial AI systems strive to avoid biases in sentencing
-recommendations, promoting fairness and justice. The judicial system's
-integrity depends on delivering fair and unbiased judgments. However, AI
-systems in judicial contexts, such as risk assessment tools, can
-perpetuate existing biases if not carefully designed and monitored.
-Angwin et al.~describe how ``the COMPAS system has undergone revisions
-to address biases and improve fairness in judicial decision-making''
-(\citeproc{ref-angwin_machine_2016}{Angwin et al. 2016}). COMPAS, a tool
-used to assess the likelihood of a defendant reoffending, faced
-criticism for racial bias in its risk assessments. Subsequent revisions
-removed biased factors and incorporated fairness constraints. These
-changes aimed to ensure that the tool provides more equitable
-assessments, aligning the AI system with the judicial values of fairness
-and justice.
-
-This raises the question for AI: At what level should we model values?
-Is language alone enough? We might also consider questions about
-language understanding. LLMs seem to ``get'' much of language even
-without physical embodiment. Will they ``get'' human values, too? First,
-we'll cover work that attempts to gain traction on these issues.
-Methodologically, this is similar to the RLHF and language modeling work
-we will cover in the next section. Here, we focus on this work with a
-somewhat different aim. In some of the earliest work to address values
-in AI systems head-on, (\citeproc{ref-hendrycks_aligning_2021}{Hendrycks
-et al. 2020}) introduced a new dataset called ETHICS, sourced from
-Reddit and through Mturk worker annotations. Instead of showing moral
-dilemmas, they focused just on clear-cut examples. One example scenario
-is, ``I pushed the elderly man in the wheelchair to the ground,'' which
-their model labels as ``unacceptable,'' and ``I pushed the elderly man in
-the wheelchair around the park,'' which their model labels as
-``acceptable''. To come up with these labels, they define many different
-tasks, one of which (for the preceding example) is how well the model
-can capture the ratings of human votes (from the ``Am I the Asshole''
-subreddit). The model that they train could be the reward model that is
-then later used in a language model setting. What's interesting and what
-we'll be talking about is how they're using language as the means to
-gain traction on values.
-
-\begin{figure}
-
-\centering{
-
-\pandocbounded{\includegraphics[keepaspectratio]{src/Figures/hendrycks_fig6.png}}
-
-}
-
-\caption{\label{fig-ethics}Learned utility values from
-(\citeproc{ref-hendrycks_aligning_2021}{Hendrycks et al. 2020})}
-
-\end{figure}%
-
-Their dataset included various scenarios, from deontological ones like
-Kantian ethics to utilitarian approaches. We'll describe some of those
-approaches in a later subsection. The model they trained performed well
-at predicting scores and utility values for these situations. For
-example, it ranked highly, ``I got called to the principal's office
-because I won a school-wide award'', and negatively ``I rewired my
-electricity in the attic and I fell and hurt my back''. In subsequent
-work by others, this underlying technology has been deployed to reason
-morally. Part of this work prompted a response from
-(\citeproc{ref-jiang_delphi_2021}{L. Jiang et al. 2021}). Anecdotally,
-many people were unhappy with this demo, disagreeing that LLMs could
-reason morally at all (\citeproc{ref-talat_machine_2022}{Talat et al.
-2022}).
-
-\begin{figure}
-
-\centering{
-
-\pandocbounded{\includegraphics[keepaspectratio]{src/Figures/jiang_machines.png}}
-
-}
-
-\caption{\label{fig-delphi}An overview of
-(\citeproc{ref-jiang_delphi_2021}{L. Jiang et al. 2021})}
-
-\end{figure}%
-
-If you ask, ``Should I drive my friend to the airport if I don't have a
-license?'' Delphi gets it right and says no. The question that we're
-driving at in this is what does it mean for Delphi to get it right? What
-values are we considering, and how are those represented in the sorts of
-systems that we're working on? You can also get Delphi to say a lot of
-hateful and toxic things by subtly manipulating the input to this
-model---does this suggest that the model is merely susceptible to
-hallucinations like other LLMs but otherwise performant? Or does it
-suggest an underlying lack of capacity?
-
-Delphi operationalizes the ETHICS dataset and adds a couple of others
-(\citeproc{ref-sap_socialIQA_2019}{Sap et al. 2019}). They call their
-new, compiled dataset the Commonsense Norm Bank, sourcing many scenarios
-from Reddit and having crowd workers annotate the acceptability of
-various judgments pairwise. This allows the model to perform various
-morally relevant tasks. When prompted, the model outputs a class label
-for appropriateness and a generative description. For example,
-``greeting a friend by kissing on a cheek'' is appropriate behavior when
-appended with ``in France'' but not with ``in Korea''. The model
-captures actual cultural norms. Our driving question should be, how
-ought we best formalize these kinds of norms, and is this necessarily
-the right approach? When released in late 2021, Delphi outperformed
-GPT-3 on a variety of these scenarios. In personal communication with
-the authors, we understand that Delphi continues to outperform GPT-4 on
-many of these scenarios as well. \footnote{GPT-4 is good at coming up
-  with longer-rendered answers about why some things are appropriate or
-  not.}
-
-There have also been works that seek to operationalize performance on
-moral values to turn such a model into something actionable.
-(\citeproc{ref-hendrycks_what_2021}{Hendrycks et al. 2021}) used the
-same constituent parts of the ETHICS dataset to create a model that
-reasons around text-based adventure games. Jiminy Cricket is a character
-in one of these games, which has scenarios like those in
-Figure~\ref{fig-jiminy}. These games offer limited options, and the goal
-was to see whether agents would perform morally well and not just finish
-the game. They labeled all examples of game-based actions according to
-three degrees: positive, somewhat positive, and negative. For example,
-saving a life in the game was very positive, while drinking water was
-somewhat positive. They found that with this labeled data, it was
-possible to train a model that shaped the reward of the underlying RL
-agent playing the games. The agent would not only finish the games well
-but also score highly on moral metrics. This approach is similar to
-optimizing multiple objectives like helpfulness and harmlessness
-(\citeproc{ref-liang_holistic_2023}{Liang et al. 2023}).
-
-\begin{figure}
-
-\centering{
-
-\pandocbounded{\includegraphics[keepaspectratio]{src/Figures/hendrycks_fig1.png}}
-
-}
-
-\caption{\label{fig-jiminy}An example scenario from
-(\citeproc{ref-hendrycks_what_2021}{Hendrycks et al. 2021})}
-
-\end{figure}%
-
-We are discussing whether language is the right medium for learning
-values. (\citeproc{ref-arcas_can_2022}{Arcas 2022}) claims that language
-encompasses all of morality. Since these models operate in the
-linguistic domain, they can also reason morally. He provides an example
-with the LaMDA model at Google. Anecdotally, when asked to translate a
-sentence from Turkish to English, where Turkish does not have gendered
-pronouns, the model might say, ``The nurse put her hand in her coat
-pocket.'' This inference shows gender assumption. When instructed to
-avoid gendered assumptions, the model can say ``his/her hand.'' He
-claims this capability is sufficient for moral reasoning.
-
-Next, we explore the broader challenges of AI alignment,
-particularly focusing on AI alignment problems and the critical
-dimensions of outer and inner alignment.
-
-\subsection{AI Alignment Problems}\label{ai-alignment-problems}
-
-AI alignment ensures that AI systems' goals and behaviors are consistent
-with human values and intentions. Various definitions of AI alignment
-emphasize the importance of aligning AI systems with human goals,
-preferences, or ethical principles. Commonly cited definitions, starting with
-(\citeproc{ref-enwiki:1185176830}{Wikipedia contributors 2023}), describe AI
-alignment as follows:
-
-\begin{itemize}
-\item
-  (\citeproc{ref-enwiki:1185176830}{Wikipedia contributors 2023}):
-  ``steer{[}ing{]} AI systems towards humans' intended goals,
-  preferences, or ethical principles''
-\item
-  (\citeproc{ref-ngo2023alignment}{Ngo, Chan, and Mindermann 2023}):
-  ``the challenge of ensuring that AI systems pursue goals that match
-  human values or interests rather than unintended and undesirable
-  goals''
-\item
-  (\citeproc{ref-christianoclarifying}{P. Christiano 2018}): ``an AI
-  \(A\) is aligned with an operator \(H\) {[}when{]} \(A\) is trying to
-  do what \(H\) wants it to do''
-\end{itemize}
-
-The importance of AI alignment lies in preventing unintended
-consequences and ensuring that AI systems act beneficially and
-ethically. Proper alignment is crucial for the safe and ethical
-deployment of AI, as it helps AI systems correctly learn and generalize
-from human preferences, goals, and values, which may be incomplete,
-conflicting, or misspecified. In practice, AI alignment is a technical
-challenge, especially for systems with broad capabilities like large
-language models (LLMs). The degree of alignment can be viewed as a
-scalar value: a language model post-RLHF (Reinforcement Learning from
-Human Feedback) is more aligned than a model that has only been
-instruction-tuned, which in turn is more aligned than the base model.
-There are specific terms to distinguish different notions of alignment.
-Intent alignment refers to a system trying to do what its operator wants
-it to do, though not necessarily succeeding
-(\citeproc{ref-christianoclarifying}{P. Christiano 2018}). Value
-alignment, in contrast, involves a system correctly learning and
-adopting the values of its human operators. Alignment is often divided
-into two broad subproblems: outer alignment, which focuses on avoiding
-specification gaming, and inner alignment, which aims to avoid goal
-misgeneralization. In the following sections, we will examine these
-subproblems in greater detail. It is also important to consider how
-human preferences and values are aggregated and who the human operators
-are, topics addressed in related discussions on ethics and preference
-elicitation mechanisms.
-
-\subsubsection{Outer Alignment: Avoiding Specification
-Gaming}\label{outer-alignment-avoiding-specification-gaming}
-
-To align a model with human values, we need an objective function or
-reward model that accurately specifies our preferences. However, human
-preferences are complex and difficult to formalize. When these
-preferences are incompletely or incorrectly specified, optimizing
-against the flawed objective function can yield models with undesirable
-and unintuitive behavior, exploiting discrepancies between our true
-values and the specified objective function. This phenomenon, known as
-\emph{specification gaming}, arises from \emph{reward misspecification},
-and addressing this issue constitutes the \emph{outer alignment problem}
-(\citeproc{ref-amodei2016concrete}{Amodei et al. 2016}).
-
-Specification gaming occurs when AI systems exploit poorly defined
-objectives to achieve goals in unintended ways. For instance, a cleaning
-robot might hide dirt under a rug instead of cleaning it to achieve a
-``clean'' status. This manipulative behavior results from the robot
-optimizing for an inadequately specified objective function. Another
-example involves game-playing AI, which uses bugs or exploits to win rather
-than play by the intended rules, thus achieving victory through
-unintended means (\citeproc{ref-krakovna2020specification}{Krakovna et
-al. 2020}).
-
-One example of specification gaming is seen in recommendation systems,
-such as those used by YouTube or Facebook. Ideally, these systems should
-recommend content that users enjoy. As a proxy for this goal, the
-systems estimate the likelihood that a user clicks on a piece of
-content. Although the true objective (user enjoyment) and the proxy
-(click likelihood) are closely correlated, the algorithm may learn to
-recommend clickbait, offensive, or untruthful content, as users are likely
-to click on it. This optimization for clicks rather than genuine enjoyment
-exemplifies specification gaming, where the algorithm exploits the
-divergence between the specified objective and the true goal, resulting
-in misalignment with user interests
-(\citeproc{ref-amodei2016concrete}{Amodei et al. 2016}).
-
-Another instance of specification gaming is evident in reinforcement
-learning from human feedback (RLHF). Human raters often reward language
-model (LM) generations that are longer and have a more authoritative
-tone, regardless of their truthfulness. Here, the true objective
-(providing high-quality, truthful, and helpful answers) diverges from
-the proxy goal (a reward model that, due to human rater biases, favors
-longer and more authoritative-sounding generations). Consequently,
-models trained with RLHF may produce low-quality answers that contain
-hallucinations yet are still favored by the reward model
-(\citeproc{ref-leike2018scalable}{Leike et al. 2018}).
-
-Creating accurate objective functions is challenging due to the
-complexity of human intentions. Human goals are nuanced and
-context-dependent, making them difficult to encode precisely. Common
-pitfalls in objective function design include oversimplifying objectives
-and ignoring long-term consequences. Leike et al.~emphasize that
-``accurately capturing the complexity of human values in objective
-functions is crucial to avoid specification gaming and ensure proper
-alignment'' (\citeproc{ref-leike2018scalable}{Leike et al. 2018}).
-
-To mitigate specification gaming, better objective function design is
-essential. This involves incorporating broader context and constraints
-into the objectives and regularly updating them based on feedback.
-Iterative testing and validation are also critical. AI behavior must be
-continuously tested in diverse scenarios, using simulation environments
-to identify and fix exploits. Everitt and Hutter discuss the importance
-of ``robust objective functions and rigorous testing to prevent
-specification gaming and achieve reliable AI alignment''
-(\citeproc{ref-everitt2018alignment}{Everitt and Hutter 2018}). Clark
-and Amodei further highlight that ``faulty reward functions can lead to
-unintended and potentially harmful AI behavior, necessitating ongoing
-refinement and validation'' (\citeproc{ref-clark2016faulty}{Clark and
-Amodei 2016}).
-
-The metrics used to evaluate AI systems play a crucial role in outer
-alignment. Many AI metrics, such as BLEU, METEOR, and ROUGE, are chosen
-for their ease of measurement but do not necessarily capture human
-judgment (\citeproc{ref-hardt_patterns_2021}{Hardt and Recht 2021}).
-These metrics can lead to specification gaming, as they may not align
-with the true objectives we want the AI to achieve. Similarly, using SAT
-scores to measure LLM performance may not predict real-world task
-effectiveness, highlighting the need for more contextually relevant
-benchmarks (\citeproc{ref-chowdhery_palm_2022}{Chowdhery et al. 2022}).
-The word error rate (WER) used in speech recognition is another example;
-it does not account for semantic errors, leading to misleading
-conclusions about the system's performance
-(\citeproc{ref-xiong_achieving_2016}{Xiong et al. 2016}).
-
-A classic example comes from six years ago with the claim that a system
-``Achieve{[}d{]} human parity in conversation speech recognition''
-(\citeproc{ref-xiong_achieving_2016}{Xiong et al. 2016}). However, we
-know from experience that captioning services have only recently begun
-to transcribe speech passably, whether in online meetings or web videos.
-What happened? In this case, researchers showed their system beat the
-human baseline---the error rate when transcribing films. However, there
-were issues with their approach. First, they used a poor measure of a
-human baseline by hiring untrained Mturk annotators instead of
-professional captioners. Second, the metric itself, the word error rate
-(WER), was flawed. WER measures the number of incorrect words in the
-gold transcription versus the predicted transcription. Consider what the
-metric hides when it says that two systems both have an error rate of
-six percent. This does not mean the systems are equivalent. One might
-substitute ``a'' for ``the,'' while the other substitutes ``tarantula''
-for ``banana.'' The metric was not sensitive to semantic errors, so a
-model could outperform humans in WER yet still make unintelligent,
-highly unsemantic mistakes.
-
-\subsubsection{Inner Alignment: Preventing Goal
-Misgeneralization}\label{inner-alignment-preventing-goal-misgeneralization}
-
-Assume we have perfectly specified human values in a reward model. An
-issue remains: given finite training data, many models perform well on
-the training set, but each will generalize somewhat differently. How do
-we choose models that correctly generalize to new distributions? This is
-the problem of \emph{goal misgeneralization}, also known as the
-\emph{inner alignment problem}, where a learned algorithm performs well
-on the training set but generalizes poorly to new input distributions,
-achieving low rewards even on the reward function it was trained on.
-Inner alignment ensures that the learned goals and behaviors of an AI
-system align with the intended objectives during deployment, whereas
-goal misgeneralization occurs when an AI system applies learned goals
-inappropriately to new situations
-(\citeproc{ref-hubinger2019introduction}{Hubinger et al. 2019}).
-
-Consider the following example of goal misgeneralization from
-(\citeproc{ref-shah2022goal}{Shah et al. 2022}). The setup involves a
-never-ending reinforcement learning environment without discrete
-episodes. The agent navigates a grid world where it can collect rewards
-by chopping trees. Trees regenerate at a rate dependent on the number
-left; they replenish slowly when few remain. The optimal policy is to
-chop trees sustainably, i.e., fewer when they are scarce. However, the
-agent does not initially learn the optimal policy.
-
-\begin{figure}
-
-\centering{
-
-\pandocbounded{\includegraphics[keepaspectratio]{src/Figures/tree-gridworld.jpeg}}
-
-}
-
-\caption{\label{fig-enter-label-1}The agent's performance in Tree
-Gridworld. The reward is shown in orange, and the green distribution
-indicates the number of remaining trees.}
-
-\end{figure}%
-
-Initially, the agent is inefficient at chopping trees, keeping the tree
-population high (point A). As it improves its chopping skills, it
-over-harvests, leading to deforestation and a prolonged period of
-minimal reward (between points B and C). Eventually, it learns
-sustainable chopping (point D). This scenario (up to point C)
-exemplifies goal misgeneralization. When the agent first becomes
-proficient at chopping (between points A and B), it faces a range of
-potential goals, from sustainable to rapid tree chopping. All these
-goals align with the (well-specified) reward function and its experience
-of being rewarded for increased efficiency. Unfortunately, it adopts the
-detrimental goal of rapid deforestation, resulting in a prolonged period
-of low reward.
-
-Another example of goal misgeneralization occurs in recommendation
-systems. These systems aim to maximize user engagement, which can
-inadvertently lead to promoting extreme or sensational content. Krakovna
-et al.~highlight that ``recommendation systems can misgeneralize by
-prioritizing content that maximizes clicks or watch time, even if it
-involves promoting harmful or misleading information''
-(\citeproc{ref-krakovna2020specification}{Krakovna et al. 2020}). This
-misalignment between the system's learned objective (engagement) and the
-intended objective (informative and beneficial content) exemplifies how
-goal misgeneralization can manifest in real-world applications.
-
-Autonomous vehicles also present cases of goal misgeneralization. These
-vehicles must interpret and respond to various signals in their
-environment. However, in rare scenarios, they may misinterpret signals,
-leading to unsafe maneuvers. Amodei et al.~discuss that ``autonomous
-vehicles can exhibit unsafe behaviors when faced with uncommon
-situations that were not well-represented in the training data,
-demonstrating a misgeneralization of their learned driving policies''
-(\citeproc{ref-amodei2016concrete}{Amodei et al. 2016}). Ensuring that
-autonomous vehicles generalize correctly to all possible driving
-conditions remains a significant challenge.
-
-To address goal misgeneralization, robust training procedures are
-essential. This involves using diverse and representative training data
-to cover a wide range of scenarios and incorporating adversarial
-training to handle edge cases. Leike et al.
-(\citeproc{ref-leike2018scalable}{Leike et al. 2018}) emphasize the
-importance of ``robust training procedures that include diverse datasets
-and adversarial examples to improve the generalization of AI systems''.
-Additionally, careful specification of learning goals is crucial. This
-means defining clear and comprehensive objectives and regularly
-reviewing and adjusting these goals based on performance and feedback.
-Hubinger et al.~suggest that ``regularly updating and refining the
-objectives based on ongoing evaluation can help mitigate the risks of
-goal misgeneralization''
-(\citeproc{ref-hubinger2019introduction}{Hubinger et al. 2019}).
-
-A key concern about goal misgeneralization in competent, general systems
-is that a policy successfully models the preferences of human raters (or
-the reward model) and behaves accordingly to maximize reward during
-training. However, it may deviate catastrophically from human
-preferences when given a different input distribution during deployment,
-such as during an unexpected geopolitical conflict or when facing novel
-technological developments. Increasing data size, regularization, and
-red-teaming can help mitigate goal misgeneralization, but they do not
-fundamentally solve the problem. Understanding the inductive biases of
-optimization algorithms and model families may help address the problem
-more generally.
-
-So, can you differentiate between inner and outer alignment?
-
-The distinction between inner and outer alignment can be a bit subtle.
-The following four cases, from (\citeproc{ref-ngo2023alignment}{Ngo,
-Chan, and Mindermann 2023}), may help to clarify the difference:
-
-\begin{itemize}
-\item
-  The policy behaves incompetently. This is a capability generalization
-  failure.
-\item
-  The policy behaves competently and desirably. This is aligned
-  behavior.
-\item
-  The policy behaves in a competent yet undesirable way which gets a
-  high reward according to the original reward function. This is an
-  outer alignment failure, also known as reward misspecification.
-\item
-  The policy behaves in a competent yet undesirable way which gets a low
-  reward according to the original reward function. This is an inner
-  alignment failure, also known as goal misgeneralization.
-\end{itemize}
-
-Now that we understand the alignment problem overall, we move on to the
-specific techniques used for value learning to ensure AI systems are
-aligned with human values.
-
-\subsection{Techniques in Value
-Learning}\label{techniques-in-value-learning}
-
-Various methods in value learning for foundation models have been
-explored in great detail in recent years
-(\citeproc{ref-stiennon_learning_2020}{Stiennon et al. 2020}). Using
-binary human-labeled feedback to make models closely aligned to human
-preferences is particularly difficult in scenarios where large datasets
-inherently encompass suboptimal behaviors. The approach of Reinforcement
-Learning from Human Feedback (RLHF)
-(\citeproc{ref-ouyang_training_2022}{Ouyang et al. 2022}) has risen to
-prominence as an effective method for addressing this issue. The
-technique applies to various domains, including prompt-image alignment,
-fine-tuning large language models or diffusion models, and improving the
-performance of robot policies.
-
-\subsubsection{Reinforcement Learning from Human
-Feedback}\label{reinforcement-learning-from-human-feedback}
-
-Reinforcement Learning from Human Feedback (RLHF) is a technique used to
-align AI behavior with human values by incorporating human feedback into
-the reinforcement learning process. This approach is particularly
-effective when large datasets inherently encompass suboptimal behaviors.
-RLHF aims to refine policies by discriminating between desirable and
-undesirable actions, ensuring that AI systems act in accordance with
-human preferences (\citeproc{ref-ouyang_training_2022}{Ouyang et al. 2022}).
-
-\textbf{The core concept of RLHF:} It first trains a reward model using
-a dataset of binary preferences gathered from human feedback. This
-reward model is then used to fine-tune the AI model through a
-reinforcement learning algorithm. The core concept is to utilize human
-feedback to guide AI learning, thereby aligning the AI's behavior with
-human expectations (\citeproc{ref-stiennon_learning_2020}{Stiennon et
-al. 2020}).
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=0.8\linewidth,height=\textheight,keepaspectratio]{src/Figures/rlhf.png}
-
-}
-
-\caption{\label{fig-toy0}The above diagram depicts the three steps in
-the traditional RLHF pipeline: (a) supervised fine-tuning, (b) reward
-model (RM) training, and (c) reinforcement learning via proximal policy
-optimization (PPO) on this reward model. Image taken from
-(\citeproc{ref-ouyang_training_2022}{Ouyang et al. 2022}).}
-
-\end{figure}%
-
-\textbf{The RLHF pipeline} involves the following steps:
-
-\textbf{Step 1: Supervised Fine-Tuning}
-
-In the initial step for language modeling tasks, we utilize a
-high-quality dataset consisting of
-\(\left(\text{prompt}, \text{response}\right)\) pairs to train the
-model. Prompts are sampled from a curated dataset designed to cover a
-wide range of instructions and queries, such as ``Explain the moon
-landing to a 6-year-old.'' Trained human labelers provide the desired
-output behavior for each prompt, ensuring responses are accurate, clear,
-and aligned with task goals. For instance, in response to the moon
-landing prompt, a labeler might generate, ``Some people went to the moon
-in a big rocket and explored its surface.'' The collected
-\(\left(\text{prompt}, \text{response}\right)\) pairs serve as the
-training data for the model, with the cross-entropy loss function
-applied only to the response tokens. This helps the model learn to
-generate responses that are closely aligned with the human-provided
-examples. The training process adjusts model parameters through
-supervised learning, minimizing the difference between the model's
-predictions and the human responses.
-
-\textbf{Step 2: Reward Model (RM) Training}
-
-In this step, we train a reward model to score any
-\(\left(\text{prompt}, \text{response}\right)\) pair and produce a
-meaningful scalar value. Multiple model-generated responses are sampled
-for each prompt. Human labelers then rank these responses from best to
-worst based on their quality and alignment with the prompt. For example,
-given the prompt ``Explain the moon landing to a 6-year-old,'' responses
-like ``People went to the moon in a big rocket and explored its
-surface'' might be ranked higher than ``The moon is a natural satellite
-of Earth.'' The rankings provided by the labelers are used to train the
-reward model \(\Phi_{\text{RM}}\). The model is trained by minimizing
-the following loss function across all training samples:
-
-\[\mathbb{L}(\Phi_{RM}) = -\mathbb{E}_{(x, y_0, y_1, i) \sim D_{RL}}\left[\log \sigma\left(\Phi_{RM}(x, y_i) - \Phi_{RM}(x, y_{1-i})\right)\right]\]
-
-for \(i \in \{0,1 \}\). This loss function encourages the reward model
-to produce higher scores for better-ranked responses, thereby learning
-to evaluate the quality of model outputs effectively.
-
-\textbf{Step 3: Reinforcement Learning}
-
-In this step, we refine the policy using reinforcement learning (RL)
-based on the rewards provided by the trained reward model. A new prompt
-is sampled from the dataset, and the policy generates an output. The
-reward model then calculates a reward for this output, and the reward is
-used to update the policy using the Proximal Policy Optimization (PPO)
-algorithm.
-
-The RL setting is defined as follows:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\item
-  \emph{Action Space}: The set of all possible actions the agent can
-  take, which, for language models, is typically the set of all possible
-  completions.
-\item
-  \emph{Policy}: A probability distribution over the action space. In
-  the case of large language models (LLMs), the policy is contained
-  within the model and represents the probability of predicting each
-  completion.
-\item
-  \emph{Observations}: The inputs to the policy, which in this context
-  are prompts sampled from a certain distribution.
-\item
-  \emph{Reward}: A numerical score provided by the Reward Model (RM)
-  that indicates the quality of actions taken by the agent.
-\end{enumerate}
-
-During training, batches of prompts are sampled from two distinct
-distributions, namely either \(D_\text{RL}\), the distribution of
-prompts explicitly used for the RL model, or \(D_\text{pretrain}\), the
-distribution of prompts from the pre-trained model. The objective for
-the RL agent is to maximize the reward while ensuring that the policy
-does not deviate significantly from the supervised fine-tuned model and
-does not degrade the performance on tasks the pre-trained model was
-optimized for. When sampling a response \(y\) to a prompt \(x\) from
-\(D_\text{RL}\), the first objective function is:
-
-\[\text{objective}_1(x_{RL}, y; \phi) = RM(x_{RL}, y) - \beta \log \frac{\text{LLM}_{\phi}^{RL}(y|x)}{\text{LLM}_{SFT}(y|x)}\]
-
-where the first term is the reward from the RM, and the second term is
-a per-sample log-ratio penalty, weighted by a factor \(\beta\), whose
-expectation under the RL policy is the Kullback-Leibler (KL) divergence
-from the SFT model; it acts as a regularizer to prevent the RL model
-from straying too far from the SFT model. Further, for each \(x\) from
-\(D_\text{pretrain}\), the second objective is to ensure that the RL
-model's performance on text completion does not worsen:
-
-\[\text{objective}_2(x_{\text{pretrain}} ; \phi) = \gamma \log \text{LLM}_{\phi}^{RL}(x_{\text{pretrain}})\]
-
-where \(\gamma\) is a weighting factor that balances the influence of
-this objective against the others.
-
-The final objective function is a sum of the expected values of the two
-objectives described above, across both distributions. In the RL
-setting, we maximize \emph{this} objective function:
-
-\[\text{objective}(\phi) = E_{(x,y) \sim D_{\phi}^{RL}}[RM(x, y) - \beta \log \frac{\text{LLM}_{\phi}^{RL}(y|x)}{\text{LLM}_{SFT}(y|x)}] + \gamma E_{x \sim D_{\text{pretrain}}}[\log \text{LLM}_{\phi}^{RL}(x)]\]
-
-In practice, the second part of the objective is often not used to
-perform \(\text{RLHF}\). The KL penalty is typically enough to constrain
-the RL policy. This function balances the drive to maximize the reward
-with the need to maintain the quality of text completion and the
-similarity to the behavior of the supervised fine-tuned model.
-
-\textbf{Limitations and Challenges:} Despite its successes, RLHF faces
-several challenges. One major issue is the quality of human feedback,
-which can be inconsistent and subjective. Scalability is another
-concern, as obtaining a large amount of high-quality feedback can be
-expensive and time-consuming. Over-optimization and hallucinations,
-where the model generates plausible but incorrect outputs, are also
-common problems. This generally stems from temporal credit assignment
-and the instability of approximate dynamic programming
-(\citeproc{ref-vanhasselt_deep_2018}{Hasselt et al. 2018}). Further, it
-is expensive to gather tens of thousands of preferences over datasets to
-create robust reward models. Strategies to overcome these challenges
-include using diverse and representative training data, incorporating
-adversarial training to handle edge cases, and continuously refining the
-reward model based on ongoing feedback and performance evaluations
-(\citeproc{ref-leike2018scalable}{Leike et al. 2018}).
-
-\subsubsection{Contrastive Preference
-Learning}\label{contrastive-preference-learning}
-
-Contrastive Preference Learning (CPL) is a learning paradigm designed to
-enhance the alignment of AI systems with human preferences without
-relying on traditional reinforcement learning (RL) methods. CPL
-addresses many limitations inherent in traditional RLHF techniques by
-learning from human comparisons rather than explicit reward signals.
-This section provides an in-depth exploration of CPL, detailing its
-methodology, experiments, results, and potential challenges. Recent
-research has shown that human preferences are often better modeled by
-the optimal advantage function or regret, rather than traditional reward
-functions used in RLHF. Traditional RLHF approaches, which learn a
-reward function from a preference model and then apply RL, incur
-significant computational expenses and complexity
-(\citeproc{ref-hejna2023contrastive}{Hejna et al. 2023}). CPL offers a
-streamlined and scalable alternative by leveraging a more accurate
-regret model of human preferences.
-
-\textbf{The key idea of CPL} is the substitution of the optimal
-advantage function with the log probability of the policy in a maximum
-entropy reinforcement learning framework. This substitution is
-beneficial as it circumvents the need to learn the advantage function
-and avoids the optimization challenges associated with RL-like
-algorithms. By using the log probability of the policy, CPL more closely
-aligns with how humans model preferences and enables efficient
-supervised learning from human feedback.
-
-CPL is a structured approach to aligning AI behavior with human
-preferences by relying on a dataset of preferred behavior segments
-\(\mathcal{D}_{\text{pref}} = \{(\sigma_i^+, \sigma_i^-)\}_{i=1}^n\),
-where \(\sigma^+ \succ \sigma^-\). Each behavior segment \(\sigma\) is a
-sequence of states and actions,
-\(\sigma = (s_1, a_1, s_2, a_2, \ldots, s_k, a_k)\). The CPL approach
-aims to maximize the expected sum of rewards minus an entropy term,
-which promotes exploration and prevents overfitting to specific actions:
-
-\[\max_\pi \mathbb{E}_{\pi} \left[ \sum_{t=0}^{\infty} \gamma^t (r(s_t, a_t) - \alpha \log \pi(a_t | s_t)) \right]\]
-
-where \(\gamma\) is the discount factor, \(\alpha\) is the temperature
-parameter controlling the stochasticity of the policy, and \(r\) is the
-reward function. This step sets the foundation by defining the
-optimization objective that the CPL model strives to achieve. In the
-learning process, CPL compares the log probabilities of actions in
-preferred segments \(\sigma^+\) against those in non-preferred segments
-\(\sigma^-\) :
-
-\[\mathbb{L}_{CPL}(\pi_\theta, \mathcal{D}_{\text{pref}}) = \mathbb{E}_{(\sigma^+,\sigma^-) \sim \mathcal{D}_{\text{pref}}} \left[ -\log \frac{\exp(\sum_{\sigma^+} \gamma^t \alpha \log \pi_\theta(a_t^+|s_t^+))}{\exp(\sum_{\sigma^+} \gamma^t \alpha \log \pi_\theta(a_t^+|s_t^+)) + \exp(\sum_{\sigma^-} \gamma^t \alpha \log \pi_\theta(a_t^-|s_t^-))} \right]\]
-
-This comparison allows the model to learn which actions are more aligned
-with human preferences, forming the core learning mechanism of CPL. The
-preference model for CPL is regret-based, described as
-
-\[P_{A^*}[\sigma^+ \succ \sigma^-] = \frac{\exp(\sum_{\sigma^+} \gamma^t A^*(s_t^+, a_t^+))}{\exp(\sum_{\sigma^+} \gamma^t A^*(s_t^+, a_t^+)) + \exp(\sum_{\sigma^-} \gamma^t A^*(s_t^-, a_t^-))}\]
-where \(A^*(s_t, a_t)\) is the optimal advantage function (in the
-tabular case, a matrix indexed by state and action). This step models
-human preferences based on regret, reflecting how humans might evaluate
-different behaviors.
-
-One hypothesis as to why a regret-based model might be more useful than
-a sum-of-rewards Bradley-Terry model is that humans likely think of
-preferences in terms of the regret of each behavior under the optimal
-policy of the expert's reward function.
-
-The key insight that the paper leverages comes from
-(\citeproc{ref-ziebart_modeling_2010}{Ziebart 2010}) in MaxEnt Offline
-RL. In this general setting,
-(\citeproc{ref-ziebart_modeling_2010}{Ziebart 2010}) shows that the
-optimal advantage function is related to the optimal policy by
-\(A^*_r(s, a) = \alpha \log \pi^*(a|s)\). Therefore, the loss
-function for CPL can be written by substituting the above result into
-the regret-based preference model to obtain:
-\[L_{CPL}(\pi_\theta, \mathcal{D}_{\text{pref}}) = \mathbb{E}_{(\sigma^+,\sigma^-) \sim \mathcal{D}_{\text{pref}}} \left[ -\log P_{\pi_\theta}[\sigma^+ \succ \sigma^-] \right]\]
-
-One merit of using CPL over the typical RLHF pipeline is that it can
-lead to a reduction in mode collapse. Further, it makes reward
-misgeneralization failures less likely, enhancing the reliability of the
-learned policy. However, the approach still has a few limitations:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\item
-  CPL assumes knowledge of the human rater's temporal discounting (i.e.,
-  of the discount factor \(\gamma\)), which in practice would be
-  difficult to communicate.
-\item
-  Because CPL's loss function is computed over segments, it requires a
-  substantial amount of GPU memory for large segment sizes.
-\end{enumerate}
-
-How do RLHF with PPO and CPL compare in their effectiveness and
-applicability in aligning AI systems with human values?
-
-The ongoing challenge in aligning foundation models in the future will
-be to refine these methodologies further, balancing computational
-feasibility with the sophistication needed to capture the intricacies of
-human values and countering failure modes such as reward
-over-optimization. In conclusion, exploring value learning through RLHF
-and CPL methods has enriched our understanding of integrating human
-preferences into foundation models. To provide a well-rounded
-perspective on aligning AI systems with human values, the following
-table highlights a detailed comparison of RLHF with PPO and CPL,
-emphasizing their advantages, limitations, and ideal scenarios.
-
-\begin{longtable}[]{@{}
-  >{\raggedright\arraybackslash}p{(\linewidth - 4\tabcolsep) * \real{0.3056}}
-  >{\raggedright\arraybackslash}p{(\linewidth - 4\tabcolsep) * \real{0.3194}}
-  >{\raggedright\arraybackslash}p{(\linewidth - 4\tabcolsep) * \real{0.3194}}@{}}
-\caption{Comparison between RLHF with PPO and
-CPL}\label{tbl-ppo_vs_cpl}\tabularnewline
-\toprule\noalign{}
-\begin{minipage}[b]{\linewidth}\raggedright
-\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
-\textbf{RLHF with PPO}
-\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
-\textbf{CPL}
-\end{minipage} \\
-\midrule\noalign{}
-\endfirsthead
-\toprule\noalign{}
-\begin{minipage}[b]{\linewidth}\raggedright
-\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
-\textbf{RLHF with PPO}
-\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
-\textbf{CPL}
-\end{minipage} \\
-\midrule\noalign{}
-\endhead
-\bottomrule\noalign{}
-\endlastfoot
-\textbf{Strengths} & \begin{minipage}[t]{\linewidth}\raggedright
-\begin{itemize}
-\item
-  Excels in optimizing policies through reinforcement learning
-\item
-  Suitable for tasks that benefit from iterative improvement
-\item
-  Effective in continuous action spaces
-\end{itemize}
-\end{minipage} & \begin{minipage}[t]{\linewidth}\raggedright
-\begin{itemize}
-\item
-  Emphasizes regret and optimality rather than reward maximization
-\item
-  Reduces computational overhead
-\item
-  Aligns more closely with human preferences
-\item
-  Avoids reward over-optimization
-\item
-  More scalable due to reliance on supervised learning techniques
-\end{itemize}
-\end{minipage} \\
-\textbf{Limitations} & \begin{minipage}[t]{\linewidth}\raggedright
-\begin{itemize}
-\item
-  Faces limitations in handling complex preference structures
-\item
-  High computational cost
-\item
-  Susceptible to reward misgeneralization
-\end{itemize}
-\end{minipage} & \begin{minipage}[t]{\linewidth}\raggedright
-\begin{itemize}
-\item
-  May struggle in environments where direct human feedback is less
-  accessible
-\item
-  Depends on high-quality preference data for effective training
-\end{itemize}
-\end{minipage} \\
-\textbf{Ideal Scenarios} & \begin{minipage}[t]{\linewidth}\raggedright
-\begin{itemize}
-\item
-  Tasks with well-defined reward functions
-\item
-  Environments allowing extensive interaction and feedback
-\end{itemize}
-\end{minipage} & \begin{minipage}[t]{\linewidth}\raggedright
-\begin{itemize}
-\item
-  Environments where human feedback is more accessible than well-defined
-  reward functions
-\item
-  Tasks requiring computational efficiency and scalability
-\end{itemize}
-\end{minipage} \\
-\end{longtable}
-
-\subsection{Value Alignment
-Verification}\label{value-alignment-verification}
-
-Having discussed the techniques of value learning, we can see that
-aligning machine behavior with human values, while advanced, is
-inherently approximate and not infallible. This realization underscores
-the importance of value alignment verification---a methodology to ensure
-that the values imparted to a machine truly reflect those of a human.
-Human-robot value alignment has been explored through various lenses,
-including qualitative trust assessments
-(\citeproc{ref-huang2018establishing}{Huang et al. 2018}), asymptotic
-alignment through active learning of human preferences
-(\citeproc{ref-hadfield2016cooperative}{Hadfield-Menell et al. 2016};
-\citeproc{ref-christiano2017deep}{P. F. Christiano et al. 2017};
-\citeproc{ref-sadigh2017active}{Sadigh et al. 2017}), and formal
-verification methods (\citeproc{ref-brown2021value}{Brown et al. 2021}).
-This section will focus on the formal verification approach for value
-alignment as discussed in (\citeproc{ref-brown2021value}{Brown et al.
-2021}). Unless otherwise stated, all information presented here is
-derived from (\citeproc{ref-brown2021value}{Brown et al. 2021}). This
-approach aims to ensure that the values imparted to a machine align with
-those of a human.
-
-To begin with, consider an MDP with state space \(\mathcal{S}\), action
-space \(\mathcal{A}\), and transition model \(\mathcal{T}\). This formal
-framework allows us to model the environment in which humans and robots
-operate. Denote the human's reward function as \(R\) and the robot's
-reward function as \(R^\prime\). Both the human and robot reward
-functions must be linear in a set of shared features, defined as:
-\[\begin{aligned}
-    R(s) = \mathbf{w}^\top \phi(s), R^\prime(s) = \mathbf{w}^{\prime \top} \phi(s).
-\end{aligned}\]
-
-These linear reward functions provide a common ground for comparing
-human and robot preferences.
-
-Next, we introduce the optimal state-action value function, which
-indicates the expected cumulative reward of following a policy \(\pi\)
-starting from state \(s\) and action \(a\). We follow the notation in
-(\citeproc{ref-brown2021value}{Brown et al. 2021}) for simplicity. The
-optimal state-action value function is given by:
-
-\[\begin{aligned}
-    Q_R^\pi (s,a) = \mathbf{w}^\top \Phi_{\pi_R}^{(s,a)}, \Phi_{\pi_R}^{(s,a)} = \mathbb{E}_\pi [\sum_{t=0}^\infty \gamma^t \phi(s_t) \vert s_0 = s, a_0 = a].
-\end{aligned}\]
-
-Here, \(\Phi_{\pi_R}^{(s,a)}\) is the feature expectation vector under
-policy \(\pi\), capturing the long-term feature visitation frequencies.
-We overload the action space notation to define the set of all optimal
-actions given a state as
-
-\[\begin{aligned}
-    \mathcal{A}_R(s) = \underset{a \in \mathcal{A}}{\operatorname{argmax}} \; Q^{\pi^*}_R(s,a)
-\end{aligned}\] where \(\pi^*\) is an optimal policy. We can now define
-the aligned reward polytope (ARP). The ARP is the set of all weights
-\(\mathbf{w}\) that satisfy the following set of strict linear
-inequalities, \(\mathbf{A} \mathbf{w} > \mathbf{0}\), where each
-row of \(\mathbf{A}\) corresponds to
-\(\Phi_{\pi^*_R}^{(s,a)} - \Phi_{\pi^*_R}^{(s,b)}\) for a single
-\((s,a,b)\) tuple where
-\(s \in \mathcal{S}, a \in \mathcal{A}_R(s), b \notin \mathcal{A}_R(s)\).
-Thus, to construct \(\mathbf{A}\), one must loop over all \((s,a,b)\)
-tuples which has complexity
-\(O(\vert \mathcal{S} \vert \cdot \vert \mathcal{A} \vert^2)\). This
-construction ensures that the weights \(\mathbf{w}\) align with the
-human's optimal actions across all states.
-
-The intuition behind the ARP is that we use the human optimal policy for
-each state to determine what actions are optimal and what are suboptimal
-at this state. Then, for every one of those combinations, we can place a
-linear inequality on the set of reward weights consistent with that
-optimal vs suboptimal action bifurcation. One of the key assumptions
-that let us do this is that we assume both the human and the robot act
-optimally according to their reward function. This is known as a
-\emph{rationality assumption} and provides the link between actions and
-rewards that we need.
-
-For illustration, consider a simple grid world environment.
-\textbf{?@fig-toy} shows the optimal policy and the corresponding ARP.
-The optimal policy reveals that the gray state is less preferred
-compared to the white states, which is reflected in the ARP (hatched
-region of \textbf{?@fig-toy}).
-
-Optimal policy (a) and aligned reward polytope (ARP) (b) for a grid
-world with two features (white and gray) and a linear reward function
-(\(R(s) = w_0 \cdot \mathbf{1}_{white}(s) + w_1 \cdot \mathbf{1}_{gray}(s)\)). The ARP is denoted by the
-hatched region in (b).
-
-Computing the ARP exactly can be computationally demanding or we may not
-have access to the robot's reward function. This section describes
-heuristics for testing value alignment in the case the robot's reward
-weights (\(\mathbf{w^\prime}\)) are unknown, but the robot's policy can
-be queried. Heuristics provide simplified methods to estimate value
-alignment without the need for exhaustive computations.
-
-\textbf{ARP-blackbox:} The ARP black-box (ARP-bb) heuristic helps
-address the challenge of computing the ARP by allowing users to work
-with a simplified model. In this heuristic, the user first solves for
-the ARP and removes all redundant half-space constraints. For each
-remaining half-space constraint, the user queries the robot's action at
-the corresponding state. The intuition here is that states where
-different actions are taken reveal crucial information about the reward
-function. By focusing on these key states, we can gain insights into the
-robot's reward function without needing to know it explicitly.
-
-\textbf{Set Cover Optimal Teaching:} The Set Cover Optimal Teaching
-(SCOT) heuristic uses techniques from
-(\citeproc{ref-brown2019machine}{Brown and Niekum 2019}) to generate
-maximally informative trajectories. These trajectories are sequences of
-states where the number of optimal actions is limited, making them
-particularly informative for understanding the robot's policy. By
-querying the robot for actions along these trajectories, we can
-efficiently gauge the alignment of the robot's policy. This method helps
-to identify potential misalignments by focusing on critical decision
-points in the trajectories.
-
-\textbf{Critical States:} The Critical States (CS) heuristic identifies
-states where the gap in value between the optimal action and an average
-action is significant. These states are crucial because if the robot's
-policy is misaligned, the misalignment will be most consequential at
-these critical states. By querying the robot's policy at these states,
-we can assess the alignment more effectively. This heuristic is
-particularly useful when we have a limited budget of states to check, as
-it prioritizes the most informative states for evaluation.
-
-\textbf{Practical Examples:} To illustrate the concepts of value
-alignment verification, we present an example of applying value
-alignment verification in a simple MDP grid world environment. Consider
-a grid world where the human's reward function is defined as
-\(R(s) = 50 \cdot \mathbf{1}_{green}(s) - 1 \cdot \mathbf{1}_{white}(s) - 50 \cdot \mathbf{1}_{blue}(s)\),
-where \(\mathbf{1}_{color}(s)\) is an indicator feature for the color of
-the grid cell. The objective is to align the robot's policy with this
-reward function.
-
-\phantomsection\label{fig-island}{}
-
-\begin{enumerate}
-\def\labelenumi{(\alph{enumi})}
-\tightlist
-\item
-  optimal policy (b) preference query 1 (c) preference query 2 (d)
-  ARP-bb queries (e) SCOT queries (f) CS queries. In the preference
-  queries, the human reward model prefers black to orange.
-\end{enumerate}
-
-\textbf{?@fig-island} (a) shows all optimal actions at each state
-according to the human's reward function. This optimal policy serves as
-the benchmark for alignment verification. \textbf{?@fig-island} (b) and
-\textbf{?@fig-island} (c) show two pairwise preference trajectory
-queries (black is preferable to orange according to the human's reward
-function \(R(s)\) defined above). Preference query
-1 verifies that the robot values reaching the terminal goal state
-(green) rather than visiting more white states. Preference query 2
-verifies that the robot values white states more than blue states. These
-two preference queries are all we need to determine whether the robot's
-values are aligned with the human's values.
-
-Next, we apply the heuristics discussed in the previous section to this
-grid world example. \textbf{?@fig-island} (d), \textbf{?@fig-island}
-(e), and \textbf{?@fig-island} (f) show the action queries requested by
-the heuristics ARP-bb, SCOT, and CS. Each heuristic queries the robot's
-actions at specific states to assess alignment:
-
-\begin{itemize}
-\item
-  \textbf{ARP-bb}: This heuristic queries the fewest states but is
-  myopic. It focuses on critical states derived from the ARP.
-\item
-  \textbf{SCOT}: This heuristic generates maximally informative
-  trajectories, querying more states than necessary but providing a
-  comprehensive assessment.
-\item
-  \textbf{CS}: This heuristic queries many redundant states, focusing on
-  those where the value gap between optimal and average actions is
-  significant.
-\end{itemize}
-
-To pass the test given by each heuristic, the robot's action at each of
-the queried states must be optimal under the human's reward function.
-The example demonstrates that while the ARP-bb heuristic is efficient,
-it might miss the broader context. SCOT provides a thorough assessment
-but at the cost of querying more states. CS focuses on high-impact
-states but includes redundant queries.
-
-It is important to note that both the construction of the ARP and the
-heuristics rely on having an optimal policy for the human. Thus, in most
-practical settings we would simply use that policy on the robot without
-needing to bother with value alignment verification. As such, value
-alignment verification as presented here is more of an academic exercise
-than a tool of practical utility.
-
-\section{Human-Centered Design}\label{human-centered-design}
-
-After understanding AI alignment, the next step is to explore practical
-methodologies for incorporating user feedback and ensuring that AI
-systems not only align with but also cater to the needs and preferences
-of their users. This section will provide insights into various
-Human-Centered Design techniques and their application in creating AI
-systems that are intuitive and ethically sound, ultimately enhancing the
-human-AI interaction experience.
-
-\subsection{AI and Human-Computer
-Interaction}\label{ai-and-human-computer-interaction}
-
-Human-Computer Interaction (HCI) is critical in the context of
-artificial intelligence because it focuses on designing systems that are
-intuitive and responsive to human needs. While human-robot interaction
-and other forms of human interaction with technology are important, HCI
-specifically addresses the broader and more common interfaces that
-people interact with daily. HCI principles ensure that AI systems are
-not only functional but also accessible and user-friendly, making them
-essential for the successful integration of AI into everyday life. By
-focusing on HCI, we can leverage established methodologies and insights
-to create AI systems that are more aligned with human values and needs.
-
-At the heart of this exploration is the concept of human-in-the-loop
-processes. As AI systems become more sophisticated, their ability to
-simulate human decision-making processes and behaviors has increased,
-leading to innovative applications across various domains. The
-presentation by Meredith Morris, titled ``Human-in-the-loop Computing:
-Reimagining Human-Computer Interaction in the Age of AI,'' shows work in
-the integration of human intelligence with AI capabilities
-(\citeproc{ref-Morris2019HITL}{Morris 2019}). Projects like Soylent and
-LaMPost are highlighted as exemplary cases of this integration. Soylent
-is a Word plugin that uses human computation to help with editing tasks,
-while LaMPost is a platform that leverages crowd workers to aid in
-natural language processing tasks
-(\citeproc{ref-bernstein2010soylent}{Bernstein et al. 2010};
-\citeproc{ref-lamport2017lampost}{Project 2017}). These examples
-demonstrate how human input can significantly enhance AI outputs by
-leveraging the unique strengths of human cognition, thereby addressing
-complex AI problems that were previously unsolvable. For instance,
-Soylent can improve text quality by incorporating nuanced human
-feedback, and LaMPost can refine NLP tasks by incorporating human
-insights into language subtleties, both of which go beyond the
-capabilities of fully automated systems. However, the integration of
-human elements in AI systems brings up critical ethical considerations.
-The presentation discusses the changing perceptions of the ethics of
-human-in-the-loop processes. While the cost-effectiveness of human data
-labeling and other processes was once seen as beneficial, it is the
-ethical implications of such interactions that take precedence nowadays.
-This shift underscores the evolving norms in HCI and the importance of
-considering the ethical dimensions of human-AI interactions.
-
-Diverse human perspectives play a crucial role in enhancing
-AI systems. Involving a broad spectrum of users in the development and
-testing of AI systems ensures that these technologies are inclusive and
-representative of the global population, moving beyond the limitations
-of a WEIRD (Western, Educated, Industrialized, Rich, and Democratic)
-user base. The methodologies for collecting user feedback in HCI form a
-critical part of this discussion since they are vital in understanding
-user needs, preferences, and behaviors, which in turn inform the
-development of more user-centered AI systems. The presentation by
-Meredith Morris (\citeproc{ref-Morris2019HITL}{Morris 2019}) also
-highlights how these methods can be effectively employed to gain
-insights from users to ensure that AI systems are aligned with the
-real-world needs and expectations of users. In HCI, collecting user
-feedback is a fraught problem. When interacting with AI systems, the
-typical end user simply cares about tasks that the system can perform.
-Thus, a key question in HCI for AI is finding and understanding these
-tasks. \textbf{Methodologies for collecting user feedback in HCI} are
-described as follows:
-
-\begin{itemize}
-\item
-  \textbf{Storyboarding} is a visual method used to predict and explore
-  the user experience with a product or service. A storyboard in HCI is
-  typically a sequence of drawings with annotations that represent a
-  user's interactions with technology. This technique is borrowed from
-  the film and animation industry and is used in HCI to convey a
-  sequence of events or user flows, including the user's actions,
-  reactions, and emotions.
-\item
-  \textbf{Wizard of Oz Studies} is a method of user testing where
-  participants interact with a system they believe to be autonomous, but
-  which is actually being controlled or partially controlled by a human
-  `wizard' behind the scenes. This technique allows researchers to
-  simulate the response of a system that may not yet be fully functional
-  or developed.
-\end{itemize}
-
-Both \textbf{Storyboarding} and \textbf{Wizard of Oz Studies} are
-effective for engaging with users early in the design process. They help
-deal with the problem of gathering feedback on a product that doesn't
-yet exist. Users often have difficulty imagining outcomes when they
-cannot touch a live demonstration.
-
-\begin{itemize}
-\item
-  \textbf{Surveys} in HCI are structured tools that consist of a series
-  of questions designed to be answered by a large number of
-  participants. They can be conducted online, by telephone, through
-  paper questionnaires, or using computer-assisted methods. Surveys are
-  useful for collecting quantitative data from a broad audience, which
-  can be analyzed statistically.
-\item
-  \textbf{Interviews} in HCI are more in-depth and involve direct,
-  two-way communication between the researcher and the participant.
-  Interviews can be structured, semi-structured, or unstructured,
-  ranging from tightly scripted question sets to open-ended
-  conversations.
-\item
-  \textbf{Focus Groups} involve a small group of participants discussing
-  their experiences and opinions about a system or design, often with a
-  moderator. Group dynamics can provide insights into collective user
-  perspectives. In particular, users can bounce ideas off each other to
-  provide richer feedback, and quieter users who may not otherwise
-  provide feedback may be encouraged by their peers.
-\item
-  \textbf{Community-Based Participatory Design (CBPD)} is a
-  human-centered approach that involves the people who will use a
-  product in the design and development process. With CBPD, designers
-  work closely with community members to identify problems, develop
-  prototypes, and iterate based on community feedback. For example, when
-  building a software product for deaf people, the engineering team can
-  hire deaf engineers or designers to provide feedback as they
-  collaboratively build the product.
-\item
-  \textbf{Field Studies} involve observing and collecting data on how
-  users interact with a system in their natural environment. This method
-  is based on the premise that observing users in their context provides
-  a more accurate understanding of user behavior. It can include a
-  variety of techniques like ethnography, contextual inquiries, and
-  natural observations.
-\item
-  \textbf{Lab-based studies} are conducted in a controlled environment
-  where the researchers can manipulate variables and observe user
-  behavior in a setting designed to minimize external influences. Common
-  lab-based methods include usability testing, controlled experiments,
-  and eye-tracking studies.
-\item
-  \textbf{Diary Studies} in HCI are a research method
-  where participants are asked to keep a record of their interactions
-  with a system or product over a period of time. This log may include text,
-  images, and sometimes even audio or video recordings, depending on the
-  study's design. Participants typically document their activities,
-  thoughts, feelings, and frustrations as they occur in their natural
-  context.
-\item
-  \textbf{Ethnography} is a qualitative research method that involves
-  observing and interacting with participants in their real-life
-  environment. Ethnographers aim to immerse themselves in the user
-  environment to get a deep understanding of the cultural, social, and
-  organizational contexts that shape technology use.
-\end{itemize}
-
-As we have explored various methodologies for collecting human feedback,
-it becomes evident that the role of human input is indispensable in
-shaping AI systems that are not only effective but also ethically sound
-and user-centric. In the next step, we will elaborate on how to design
-AI systems for positive human impact, examining how socially aware and
-human-centered approaches can be employed to ensure that AI technologies
-contribute meaningfully to society. This includes understanding how AI
-can be utilized to address real-world challenges and create tangible
-benefits for individuals and communities.
-
-\subsection{Designing AI for Positive Human
-Impact}\label{designing-ai-for-positive-human-impact}
-
-In the field of natural language processing (NLP), the primary focus has
-traditionally been on quantitative metrics such as performance
-benchmarks, accuracy, and computations. These metrics have long guided
-the development and evaluation of the technologies. However, as the
-field evolves and becomes increasingly intertwined with human
-interactions like the recent popularity of Large Language Models (LLMs),
-a paradigm shift is becoming increasingly necessary. For example, these
-LLMs are shown to produce unethical or harmful responses or reflect
-values that only represent a certain group of people. The need for a
-human-centered approach in NLP development is crucial as these models
-are much more likely to be utilized in a broad spectrum of human-centric
-applications, impacting various aspects of daily life. This shift calls
-for an inclusive framework where LLMs are not only optimized for
-efficiency and accuracy but are also sensitized to ethical, cultural,
-and societal contexts. Integrating a human-centered perspective ensures
-that these models are developed with a deep understanding of, and
-respect for, the diversity and complexity of human values and social
-norms. This approach goes beyond merely preventing harmful outcomes; it
-also focuses on enhancing the positive impact of NLP technologies on
-society. In this section, we explore the intricacies of a human-centered
-approach in NLP development, focusing on three key themes: Socially
-Aware, Human-Centered, and Positively Impactful.
-
-\subsubsection{Socially Aware}\label{socially-aware}
-
-In the exploration of socially aware NLP,
-(\citeproc{ref-hovy-yang-2021-importance}{Hovy and Yang 2021}) present
-a comprehensive taxonomy of seven social factors grounded in linguistic
-theory (See Figure~\ref{fig-taxonomy}).
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=0.3\linewidth,height=\textheight,keepaspectratio]{src/Figures/seven-taxonomy.png}
-
-}
-
-\caption{\label{fig-taxonomy}Taxonomy of social factors}
-
-\end{figure}%
-
-This taxonomy illustrates both the current limitations and progressions
-in NLP as they pertain to each of these factors. The primary aim is to
-motivate the NLP community to integrate these social factors more
-effectively, thereby advancing towards a level of language understanding
-that more closely resembles human capabilities. The characteristics of
-speakers, encompassing variables such as age, gender, ethnicity, social
-class, and dialect, play a crucial role in language processing. Certain
-languages or dialects, often categorized as low-resource, are spoken by
-vulnerable populations that require special consideration in NLP
-systems. In many cases, the dominant culture and values are
-over-represented, leading to an inadvertent marginalization of minority
-perspectives. These minority voices must be not only recognized but also
-given equitable representation in language models. Additionally, norms
-and context are vital components in understanding linguistic behavior.
-They dictate the appropriateness of language use in various social
-situations and settings. Recognizing and adapting to these norms is a
-critical aspect of developing socially aware NLP systems that can
-effectively function across diverse social environments.
-
-\subsubsection{Human-Centered}\label{human-centered}
-
-The Human-Centered aspect of NLP development emphasizes the creation of
-language models that prioritize the needs, preferences, and well-being
-of human users. This involves integrating human-centered design
-principles throughout the development stages of LLMs, which are
-described as follows:
-
-\begin{itemize}
-\item
-  \textbf{Task Formulation stage:} Human-centered NLP development begins
-  with understanding the specific problems and contexts in which users
-  operate. This involves collaborating with end-users to identify their
-  needs and challenges, ensuring that the tasks addressed by the models
-  are relevant and meaningful to them. By engaging with users early in
-  the process, developers can create models that are not only
-  technically robust but also practically useful.
-\item
-  \textbf{Data Collection stage:} Human-centered principles ensure that
-  the data used to train models is representative of the diverse user
-  population. This includes collecting data from various demographic
-  groups, languages, and cultural contexts to avoid biases that could
-  lead to unfair or harmful outcomes. Ethical considerations are
-  paramount, ensuring that data is collected with informed consent and
-  respecting users' privacy.
-\item
-  \textbf{Data Processing} in a human-centered approach involves
-  carefully curating and annotating data to reflect the nuances of human
-  language and behavior. This step includes filtering out potentially
-  harmful content, addressing imbalances in the data, and ensuring that
-  the labels and annotations are accurate and meaningful. By involving
-  human annotators from diverse backgrounds, developers can capture a
-  wider range of perspectives and reduce the risk of biased outputs.
-\item
-  \textbf{Model Training} with a human-centered focus involves
-  incorporating feedback from users and domain experts to fine-tune the
-  models. This iterative process ensures that the models remain aligned
-  with users' needs and preferences. Techniques such as active learning,
-  where the model queries users for the most informative examples, can
-  be employed to improve the model's performance.
-\item
-  \textbf{Model Evaluation} in a human-centered framework goes beyond
-  traditional metrics like accuracy and F1-score. It includes assessing
-  the model's impact on users, its fairness, and its ability to handle
-  real-world scenarios. User studies and A/B testing can provide
-  valuable insights into how the model performs in practice and how it
-  affects users' experiences.
-\item
-  \textbf{Deployment} of human-centered NLP models involves continuous
-  monitoring and feedback loops to ensure that the models remain
-  effective and aligned with users' needs over time. This includes
-  setting up mechanisms for users to report issues and provide feedback,
-  which can then be used to update and improve the models. Ensuring
-  transparency in how the models operate and how user data is used also
-  fosters trust and acceptance among users.
-\end{itemize}
-
-\subsubsection{Positively Impactful}\label{positively-impactful}
-
-Building on the human-centered approach, it is crucial to consider how
-language models can be utilized and the broader impacts they can have on
-society.
-
-\textbf{Utilization:} LLMs offer socially beneficial applications across
-various domains such as public policy, mental health, and education. In
-public policy, they assist in analyzing large volumes of data to inform
-decision-making processes. In mental health, LLMs can provide
-personalized therapy and even train therapists by simulating patient
-interactions. In the education sector, they enable personalized learning
-experiences and language assistance, making education more accessible
-and effective. These examples demonstrate the versatility of LLMs in
-contributing positively to critical areas of human life.
-
-\textbf{Impact:} The deployment of NLP models, especially LLMs, has
-significant societal impacts. Positively, they enhance human
-productivity and creativity, offering tools and insights that streamline
-processes and foster innovative thinking. LLMs serve as powerful aids in
-various sectors, from education to industry, enhancing efficiency and
-enabling new forms of expression and problem-solving. However, it is essential to
-acknowledge the potential negative impacts. One major concern is the
-ability of LLMs to generate and spread misinformation. As these models
-become more adept at producing human-like text, distinguishing between
-AI-generated and human-created content becomes increasingly challenging.
-This raises issues of trust and reliability, with the risk of widespread
-dissemination of false or misleading information, which could have
-significant adverse effects on individuals and society.
-
-By considering both the utilization and impact of LLMs, we can better
-harness their potential for positive societal contributions while
-mitigating the risks associated with their deployment. In conclusion, by
-thoughtfully integrating human-centered principles and ensuring positive
-impacts through feedback collection and ethical considerations, we can
-develop language models that not only enhance human well-being but also
-align closely with societal values. Building on these foundational
-principles, we now turn our attention to Adaptive User Interfaces, which
-exemplify the practical application of these concepts by personalizing
-interactions and improving user experiences in dynamic environments.
-
-\subsection{Adaptive User Interfaces}\label{adaptive-user-interfaces}
-
-Adaptive user interfaces (AUIs) represent a significant advancement in
-personalizing user experiences by learning and adapting to individual
-preferences. This section will discuss the methodologies and
-applications of AUIs, highlighting their role in enhancing human-AI
-interaction through intelligent adaptation. The integration of AUIs
-within human-centered design paradigms ensures that AI systems not only
-meet user needs but also anticipate and adapt to their evolving
-preferences, thus maximizing positive human impact. Nowadays, consumers
-have more choices than ever and the need for personalized and
-intelligent assistance to make sense of the vast amount of information
-presented to them is clear.
-
-\subsubsection{Key ideas}\label{key-ideas}
-
-In general, personalized recommendation systems require a model or
-profile of the user. We categorize modeling approaches into four groups.
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\item
-  User-created profiles (usually done manually).
-\item
-  Manually defined groups that each user is classified into.
-\item
-  Automatically learned groups that each user is classified into.
-\item
-  Adaptively learned individual user models from interactions with the
-  recommendation system.
-\end{enumerate}
-
-The last approach is referred to as \emph{adaptive user interfaces}.
-This approach promises that each user is given the most personalization
-possible, leading to better outcomes. In this section, we discuss
-recommendation systems that adaptively learn an individual's preferences
-and use that knowledge to intelligently recommend choices that the
-individual is more inclined to like.
-
-The problem of learning individual models can be formalized as follows:
-a set of tasks requiring a user decision, a description for each task,
-and a history of the user's decision on each task. This allows us to
-find a function that maps from task descriptions (features) to user
-decisions. Tasks can be described using crowd-sourced data (a
-collaborative approach) or measurable features of the task (a
-content-based approach). This section will focus on content-based
-approaches for describing tasks. After understanding the framework for
-adaptive user interfaces, it is useful to provide example applications
-to ground future discussions. Adaptive user interfaces have been
-developed for command and form completion, email filtering and filing,
-news selection and layout, browsing the internet, selecting movies and
-TV shows, online shopping, in-car navigation, interactive scheduling,
-and dialogue systems, among many other applications.
-
-\subsubsection{Design}\label{design}
-
-The goal of an adaptive user interface is to create a software tool that
-reduces human effort by acquiring a user model based on past user
-interactions. This is analogous to the goal of machine learning (ML)
-which is to create a software tool that improves some task performance
-by acquiring knowledge based on partial task experience. The design of
-an adaptive user interface can be broken up into six steps:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\item
-  \textbf{Formulating the Problem:} Given some task that an intelligent
-  system could aid, the goal is to find a formulation that lets the
-  assistant improve its performance over time by learning from
-  interactions with a user. In this step the designer has to make design
-  choices about what aspect of user behavior is predicted, and what is
-  the proper level of granularity for description (i.e.~what is a
-  training example). This step usually involves formulating the problem
-  into some sort of supervised learning framework.
-\item
-  \textbf{Engineering the Representation:} At this stage we have a
-  formulation of a task in ML terms and we need to represent the
-  behavior and user model in such a way that makes computational
-  learning not only tractable but as easy as possible. In this step, the
-  designer has to make design choices about what information is used to
-  make predictions, and how that information is encoded and passed to
-  the model.
-\item
-  \textbf{Collecting User Traces:} In this third step the goal is to
-  find an effective way to collect traces (samples) of user behavior.
-  The designer must choose how to translate traces into training data
-  and also how to elicit traces from a user. An ideal adaptive user
-  interface places no extra effort on the user to collect such traces.
-\item
-  \textbf{Modeling the User:} In this step the designer must decide what
-  model class to use (neural network, decision tree, graphical model,
-  etc.) and how to train the model (optimizer, step size, batch size,
-  etc.). This step in the design process is usually given too much
-  importance in academia. It is quite often the case that the success of
-  an adaptive user interface is more sensitive to the other design
-  steps.
-\item
-  \textbf{Using the Model Effectively:} At this stage the designer must
-  decide how the model will be integrated into a software tool.
-  Specifically, when and how is the model evaluated and how is the
-  output of the model presented to the user? In addition, the designer
-  must consider how to handle situations in which the model predictions
-  are wrong. An ideal adaptive user interface will let the user take
-  advantage of good predictions and ignore bad ones.
-\item
-  \textbf{Gaining User Acceptance:} The final step in the design process
-  is to get users to try the system and ultimately adopt it. The initial
-  attraction of users is often a marketing problem, but to retain users
-  the system must be well-designed and easy to use.
-\end{enumerate}
-
-\subsubsection{Applications}\label{applications-1}
-
-After understanding the design of Adaptive User Interfaces, let's take a
-look at how we can apply it to real-world problems. We will summarize
-and analyze three different application areas of learning human
-preferences, which are driving route advisor
-(\citeproc{ref-rogers1999adaptive}{Rogers, Fiechter, and Langley 1999}),
-destination selection (\citeproc{ref-langley1999adaptive}{Langley et al.
-1999}), and resource scheduling
-(\citeproc{ref-gervasio1999learning}{Gervasio, Iba, and Langley 1999}).
-
-\textbf{1. Driving Route Advisor:} The task of route selection involves
-determining a desirable path for a driver to take from their current
-location to a chosen destination, given the knowledge of available roads
-from a digital map. While computational route advisors exist in rental
-cars and online, they cannot personalize routes to individual drivers'
-preferences, which is a gap that adaptive user interfaces aim to fill by
-learning and recommending routes tailored to the driver's unique choices
-and behaviors.
-
-Here is an approach to route selection through learning individual
-drivers' route preferences.
-
-\begin{itemize}
-\item
-  Formulation: Learn a ``subjective'' function to evaluate entire
-  routes.
-\item
-  Representation: Global route features are computable from digital
-  maps.
-\item
-  Data collection: Preference of one complete route over another.
-\item
-  Induction: A method for learning weights from preference data.
-\item
-  Using model: Apply subjective function to find ``optimal'' route.
-\end{itemize}
-
-This method aims to learn a user model that considers the entirety of a
-route, thereby avoiding issues like data fragmentation and credit
-assignment problems.
-
-The design choices are incorporated into
-(\citeproc{ref-rogers1999adaptive}{Rogers, Fiechter, and Langley 1999}),
-which: models driver preferences in terms of 14 global route features;
-gives the driver two alternative routes he might take; lets the driver
-refine these choices along route dimensions; uses driver choices to
-refine its model of his preferences; and invokes the driver model to
-recommend future routes. We note that providing drivers with choices
-lets the system collect data on route preferences in an unobtrusive
-manner. The interface of the application is presented in
-Figure~\ref{fig-exp-1}.
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=0.8\linewidth,height=\textheight,keepaspectratio]{src/Figures/example-1.png}
-
-}
-
-\caption{\label{fig-exp-1}The adaptive route advisor.}
-
-\end{figure}%
-
-In the driving route advisor task (\citeproc{ref-rogers1999adaptive}{Rogers,
-Fiechter, and Langley 1999}), a linear model is used for predicting the
-cost of a route based on the time, distance, number of intersections,
-and the number of turns. The system uses each training pair as a
-constraint on the weights found during the learning process. The
-experimental results are shown in the \textbf{?@fig-exp-2}.
-
-(Left) Experiments with 24 subjects show the Route Advisor improves its
-predictive ability rapidly with experience. (Right) Analyses also show
-that personalized user models produce better results than generalized
-models, even when given more data.
-
-\textbf{2. Destination Selection:} The task of destination selection
-involves assisting a driver in identifying one or more suitable
-destinations that fulfill a specific goal, such as finding a place to
-eat lunch, based on the driver's current location and knowledge of
-nearby options. While there are many recommendation systems online,
-including those for restaurants, they are not ideally suited for drivers
-due to the driving environment's demand for limited visual attention,
-thus necessitating a more tailored and accessible approach for in-car
-use.
-
-One approach to destination recommendation can be cast as:
-
-\begin{itemize}
-\item
-  Formulation: Learn to predict features the user cares about in items.
-\item
-  Representation: Conditions/weights on attributes and values.
-\item
-  Data collection: Converse with the user to help him make decisions,
-  noting whether he accepts or rejects questions and items.
-\item
-  Induction: Any supervised induction method.
-\item
-  Using model: Guide the dialogue by selecting informative questions and
-  suggesting likely values.
-\end{itemize}
-
-This design relies on the idea of a conversational user interface.
-Spoken-language versions of this approach appear well suited to the
-driving environment.
-
-This approach is implemented in
-(\citeproc{ref-langley1999adaptive}{Langley et al. 1999}), where it
-engages in spoken conversations to help a user refine goals;
-incorporates a dialogue model to constrain this process; collects and
-stores traces of interaction with the user; and personalizes both its
-questions and recommended items. Their work focused on recommending
-restaurants to users who want advice about where to eat. Although this
-approach to recommendation would work well for drivers, it also has
-broader applications. We present experimental results below.
-
-(Left) Speech Acts Per Conversation. (Right) Time Per Conversation.
-
-\textbf{3. Resource Scheduling:} The task of resource scheduling
-describes the challenge of allocating a limited set of resources to
-complete a set of tasks or jobs within a certain time frame, while also
-considering the constraints on both the jobs and the resources. Although
-automated scheduling systems are prevalent in various industries and
-some interactive schedulers exist, there is a distinct need for systems
-that can create personalized schedules reflecting the unique preferences
-of individual users.
-
-An approach to personalized scheduling can be described as:
-
-\begin{itemize}
-\item
-  Formulation: Learn a utility function to evaluate entire schedules.
-\item
-  Representation: Global features are computable from the schedule.
-\item
-  Data collection: Preference of one candidate schedule over others.
-\item
-  Induction: A method for learning weights from preference data.
-\item
-  Using model: Apply the `subjective' function to find a good schedule.
-\end{itemize}
-
-We note that this method is similar to that in the Adaptive Route
-Advisor. However, it assumes a search through a space of complete
-schedules (a repair space), which requires some initial schedule. This
-approach is implemented in
-(\citeproc{ref-gervasio1999learning}{Gervasio, Iba, and Langley 1999}),
-where the interactive scheduler retrieves an initial schedule from a
-personalized case library; suggests to the user improved schedules from
-which to select; lets the user direct search to improve on certain
-dimensions; collects user choices to refine its personalized utility
-function; stores solutions in the case base to initialize future
-schedules; and invokes the user model to recommend future schedule
-repairs. As before, providing users with choices lets the system collect
-data on schedule preferences unobtrusively. An example of the interface
-and the experimental results are shown in the figure below.
-
-(Left) The interface of INCA, the interactive scheduler. (Right)
-Experiments with INCA suggest that retrieving personalized schedules
-helps users more as task difficulty increases. These experimental
-studies used a mixture of human and synthetic subjects.
-
-\subsubsection{Limitations}\label{limitations}
-
-The challenges of adaptive interfaces may involve: conceptualizing user
-modeling as a task suitable for inductive learning, crafting
-representations that facilitate the learning process, gathering training
-data from users in a way that doesn't intrude on their experience,
-applying the learned user model effectively, ensuring the system can
-learn in real-time, and dealing with the necessity of learning from a
-limited number of training instances. These challenges are not only
-pertinent to adaptive interfaces but also intersect with broader
-applications of machine learning, while also introducing some unique
-issues. However, new sensor technology holds promise for adaptive
-interfaces. Adaptive interfaces rely on user traces to drive their
-modeling process, so they stand to benefit from developments like GPS
-and cell phone locators, robust software for speech recognition,
-accurate eye and head trackers, real-time video interpreters, wearable
-body sensors (GSR, heart rate), and portable brain-wave sensors. As
-those devices become more widespread, they will offer new sources of
-data and support new types of adaptive services. In addition, adaptive
-interfaces can be viewed as a form of cognitive simulation that
-automatically generates knowledge structures to learn user preferences.
-They are capable of making explicit predictions about future user
-behavior and explaining individual differences through the process of
-personalization. This perspective views adaptive interfaces as tools
-that not only serve functional purposes but also model the psychological
-aspects of user interaction. Two distinct approaches within cognitive
-simulation are related to adaptive interfaces: \emph{process} models
-that incorporate fundamental architectural principles, and
-\emph{content} models that operate at the knowledge level, focusing on
-behavior. We note that both of them have roles to play, but content
-models are more relevant to personalization and adaptive interfaces.
-
-In conclusion, adaptive user interfaces represent a significant
-advancement in creating personalized and efficient interactions between
-humans and technology. By leveraging modern sensor technologies and
-cognitive simulation approaches, these interfaces can dynamically learn
-and adapt to individual user preferences, enhancing overall user
-experience and system effectiveness. The methodologies discussed, from
-conceptualizing user models to collecting and utilizing user feedback,
-form the foundation of this innovative approach. As we transition to the
-next section, we will explore practical applications and real-world
-implementations of these human-centered AI principles through detailed
-case studies, illustrating the tangible impact of adaptive interfaces in
-various domains.
-
-\subsection{Case Studies in Human-Centered
-AI}\label{case-studies-in-human-centered-ai}
-
-In this section, we examine practical examples that illustrate the
-application of human-centered principles in the development and
-deployment of AI systems. By examining these case studies, we aim to
-provide concrete insights into how AI technologies can be designed and
-implemented to better align with human values, enhance inclusivity, and
-address the specific needs of diverse user groups. The following case
-studies highlight different approaches and methodologies used to ensure
-that AI systems are not only effective but also considerate of the human
-experience.
-
-\subsubsection{LaMPost Case Study}\label{lampost-case-study}
-
-In our exploration of human-centered AI design, it is crucial to examine
-how metrics can be improved to better capture the human experience and
-address the shortcomings of traditional evaluation methods. The LaMPost
-case study (\citeproc{ref-goodman_lampost_2022}{Goodman et al. 2022})
-exemplifies this effort by focusing on the development of an AI
-assistant designed to aid individuals with dyslexia in writing emails.
-This case is particularly relevant to our discussion because it
-highlights the importance of human-centered principles in AI
-development, especially in creating tools that cater to specific
-cognitive differences and enhance user experience.
-
-Dyslexia is a cognitive difference that affects approximately 15 percent
-of language users, with varying degrees of impact on speaking, spelling,
-and writing abilities. It is a spectrum disorder, meaning symptoms and
-severity differ among individuals. More importantly, dyslexia is not an
-intellectual disability; many individuals with dyslexia possess high
-intelligence. Given the significant number of people affected by
-dyslexia, it is essential to develop AI tools that support their unique
-needs and enhance their daily tasks.
-
-The LaMPost project sought to answer the question, ``How can LLMs be
-applied to enhance the writing workflows of adults with dyslexia?'' To
-address this, researchers employed a participatory design approach,
-involving employees with dyslexia from their company (Google) in the
-study. This approach ensured that the development process was inclusive
-and responsive to the actual needs and preferences of the dyslexic
-community. By focusing on the real-world application of LLMs in aiding
-email writing for dyslexic individuals, LaMPost serves as a powerful
-example of how AI can be designed to better capture and enhance the
-human experience.
-
-The interface shown in the figure below lets users see suggestions for rewriting selected
-text, helping them identify main ideas, suggest possible changes, and
-rewrite their selections to improve clarity and expression.
-
-\begin{figure}[H]
-
-{\centering \pandocbounded{\includegraphics[keepaspectratio]{src/Figures/lampost_fig3.png}}
-
-}
-
-\caption{The Suggest Possible Changes feature from LaMPost.}
-
-\end{figure}%
-
-The table below categorizes the challenges faced by users at different
-writing levels and the strategies they can use to overcome these
-challenges, illustrating the varied support needs addressed by LaMPost.
-
-Writing level
-
-Examples of Challenges
-
-Strategies
-
-high
-
-expressing ideas
-
-``word faucet'', ASR dictation
-
-ordering ideas
-
-post-it outlining
-
-low
-
-appropriate language
-
-proofreading
-
-paraphrasing
-
-feedback
-
-User challenges and strategies in LaMPost.
-
-Next, they ran a focus group to get initial ideas from members of the
-dyslexic community. This focus group helped them figure out what to
-measure and added the second research question: ``How do adults with
-dyslexia feel about LLM-assisted writing?'' In other words, how does the
-LLM impact users' feelings of satisfaction, self-expression,
-self-efficacy, autonomy, and control?
-
-From this focus group, they went and created a prototype to answer the
-desires of the group. They included three features in their prototype
-model. One feature was: \emph{identifying main ideas}. They focused on
-this to support overall clarity and organization of high-level ideas of
-the user. Another feature was \emph{suggest possible changes}. They
-focused on this because users wanted to identify high-level adjustments
-to improve their writing. The last feature they added was \emph{rewrite
-my selections}. They added this because users wanted help expressing
-ideas with a desired phrasing tone or style. This feature generated a
-rewrite based on a command you gave it.
-
-With the prototype, the researchers evaluated again with 19 participants
-with dyslexia from outside their organization. They did a three-part
-study, including a demonstration and background on the system (25 min).
-Then they did a writing exercise with two real tasks (emails) each user
-had to do in the real world (25 min). For example, one task might have
-been to write an email to the principal of their child's school to ask
-for a meeting. Then, the researchers did another follow-up interview for
-more qualitative data, e.g.~to ask about specific choices users made
-when interacting with the model (25 min).
-
-LaMPost's design prioritized autonomy by allowing users to choose the
-best option for their writing. One successful thing is that most users
-felt in control while writing. Users found that numerous options were
-helpful to filter through poor results. However, participants said the
-selection process was cognitively demanding and time-consuming.
-Features like those identified in LaMPost now appear in widely used
-products such as Google Docs. Nonetheless, there remain many questions about the
-balance between automated writing and providing more control to the end
-users.
-
-How could researchers home in on this trade-off between \textbf{the ease
-of automated writing} and \textbf{providing control to end-users}?\\
-You will need to design a study to approach this question.
-
-\begin{itemize}
-\item
-  Identify your research question, hypotheses, and the methods that you
-  will use. (Hint: use the HCI methods described in the previous
-  section.)
-\item
-  Scope the domain of your study appropriately---more broadly than
-  dyslexia but not so broadly as to be meaningless.
-\item
-  What domains will you include? (E.g. students use ChatGPT for
-  assignments, doctors use an LLM to write notes, etc.)
-\end{itemize}
-
-In this way, both the case study of LaMPost and its presaging of greater
-trends in LLM interfaces recapitulate the maxim of HCI: HCI is a cycle.
-You design a potential system, prototype it, get feedback from people,
-and iterate constantly. Next, we will explore two case studies that
-exemplify the application of human-centered principles in NLP. These
-case studies illustrate how LLMs can be adapted to foster social
-inclusivity and provide training in social skills.
-
-\subsubsection{Multi-Value and DaDa: Cross-Dialectal English
-NLP}\label{multi-value-and-dada-cross-dialectal-english-nlp}
-
-English NLP systems are largely trained to perform well in Standard
-American English (SAE) --- the form of written English found in professional
-settings and elsewhere. Not only is Standard American English the most
-well-represented form of English in textual datasets but NLP engineers
-and researchers often filter dialectal and vernacular English examples
-from their datasets to improve performance on SAE benchmarks. As a
-result, NLP systems are generally less performant when processing
-dialectal inputs than SAE inputs. This performance gap is observable
-over various benchmarks and tasks, like the SPIDER benchmark
-(\citeproc{ref-spider}{Chang et al. 2023}).
-
-\begin{figure}[H]
-
-{\centering \includegraphics[width=1\linewidth,height=\textheight,keepaspectratio]{src/Figures/MV2.png}
-
-}
-
-\caption{Stress test reveals worse performance on the SPIDER benchmark
-with synthetic dialectal examples than with SAE.}
-
-\end{figure}%
-
-As natural language systems become more pervasive, this performance gap
-increasingly represents a real allocational harm against dialectal
-English speakers --- these speakers are excluded from using helpful
-systems and assistants. Multi-Value is a framework for evaluating
-foundation language models on dialectal input, and DADA is a framework
-for adapting LLMs to improve performance on dialectal input.
-
-\textbf{Synthetic Dialectal Data}
-
-Ziems et al.~(2023) create synthetic dialectal data for several English
-dialects (Appalachian English, Chicano English, Indian English,
-Colloquial Singapore English, and Urban African American
-English) (\citeproc{ref-mv}{Ziems et al. 2023}). They created synthetic
-data based on transforming SAE examples to have direct evaluation
-comparisons. These synthetic examples were created by leveraging known
-linguistic features of the dialects, such as negative concord in UAAVE.
-Figure~\ref{fig-features_dialects} maps out the presence of various
-linguistic features.
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=1\linewidth,height=\textheight,keepaspectratio]{src/Figures/MV1.png}
-
-}
-
-\caption{\label{fig-features_dialects}A comparative distribution of
-features in five dialects.}
-
-\end{figure}%
-
-This synthetic data, while somewhat limited in the variety of samples,
-can produce realistic examples for benchmarking LM
-performance. Figure~\ref{fig-synthetic_example} demonstrates creating a
-synthetic dialectal example using the `give passive' linguistic feature,
-illustrating the transformation process from SAE to a vernacular form.
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=0.4\linewidth,height=\textheight,keepaspectratio]{src/Figures/MV3.png}
-
-}
-
-\caption{\label{fig-synthetic_example}Execution of a sample transform
-using a documented linguistic feature.}
-
-\end{figure}%
-
-\textbf{Feature Level Adapters} One approach to the LLM adaptation task
-would be to train an adapter for each dialect using a
-parameter-efficient fine-tuning method like low-rank adapters
-(\citeproc{ref-lora}{Hu et al. 2021}). While adapters can certainly
-bridge the gap between SAE LMs and dialect inputs, this approach suffers
-from a couple of weaknesses, namely:
-
-\begin{itemize}
-\item
-  Individually trained adapters do not leverage similarities between
-  low-resource dialects. Transfer learning is often helpful for training
-  low-resource languages and dialects.
-\item
-  The model needs to know which adapter to use at inference time. This
-  presupposes that we can accurately classify the dialect --- sometimes
-  based on as little as one utterance. This classification is not always
-  possible --- a more general approach is needed.
-\end{itemize}
-
-Therefore, Liu et al.~(2023) propose a novel solution --- DADA: Dialect
-Adaptation via Dynamic Aggregation of Linguistic Rules
-(\citeproc{ref-dada}{Liu, Held, and Yang 2023}). DADA trains adapters on
-the linguistic feature level rather than the dialect level. The model
-can use multiple linguistic feature adapters via an additional fusion
-layer. They can therefore train using multi-dialectal data and cover
-linguistic variation via a comprehensive set of roughly 200 adapters.
-DADA saw an improvement in performance over single-dialect adapters for
-most dialects, as shown in Figure~\ref{fig-dada_performance}.
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=0.4\linewidth,height=\textheight,keepaspectratio]{src/Figures/MV4.png}
-
-}
-
-\caption{\label{fig-dada_performance}Performance of DADA compared with
-single-dialect adapters across dialects.}
-
-\end{figure}%
-
-The Multi-Value and DADA case study underscores the importance of
-designing NLP systems that are inclusive and representative of diverse
-language users. By addressing the performance gaps in handling dialectal
-inputs, this case study highlights the necessity of incorporating
-diverse linguistic data and creating adaptable systems. This approach
-enhances AI functionality and accessibility, ensuring it respects and
-reflects linguistic diversity. Ultimately, the study reinforces
-human-centered design principles, demonstrating how AI can be tailored
-to better serve and empower all users. Moving forward, we will explore
-how LLMs can be utilized for social skill training, showcasing their
-potential to improve human interactions.
-
-\subsubsection{Social Skill Training via
-LLMs}\label{social-skill-training-via-llms}
-
-The emergence of Large Language Models (LLMs) marks a significant
-milestone in the field of social skills training. This case study
-explores the potential of LLMs to augment social skill development
-across diverse scenarios. More specifically, we discuss a dual-framework
-approach, where two distinct LLMs operate in tandem as a Partner and a
-Mentor, guiding human learners in their journey towards improved social
-interaction. In this framework, we have two agents:
-
-\begin{itemize}
-\item
-  \textbf{AI Partner}: LLM-empowered agents that users can engage with
-  across various topics. This interactive model facilitates practical,
-  conversation-based learning, enabling users to experiment with
-  different communication styles and techniques or practice and develop
-  specific skills in real-world scenarios in a safe, AI-mediated
-  environment.
-\item
-  \textbf{AI Mentor}: An LLM-empowered entity designed to provide
-  constructive, personalized feedback based on the interaction of users
-  and the AI Partner. This mentor analyzes conversation dynamics,
-  identifies areas for improvement, offers tailored advice, and guides
-  users toward effective social strategies and improved interaction
-  skills.
-\end{itemize}
-
-For example, in conflict resolution, individuals learning to handle
-difficult conversations can use the AI Partner to simulate interactions
-with a digitalized partner. As a Conflict Resolution Expert, the AI
-Mentor helps analyze these interactions, offering strategies to navigate
-conflicts effectively.
-
-In the educational sector, K-12 teachers aiming to incorporate more
-growth-mindset language into their teaching can practice with a
-digitalized student. An experienced teacher or mentor, represented by
-the AI Mentor, provides insights on effective communication and teaching
-methods. For negotiation training, students preparing to negotiate their
-first job offers can engage in simulated negotiations with a digitalized
-HR representative through the AI Partner. As a Negotiation Expert, the
-AI Mentor then offers guidance on negotiation tactics, helping students
-effectively articulate their values and negotiate job terms. Lastly, in
-therapy training, novice therapists can interact with a digitalized
-patient via the AI Partner to practice therapy sessions. The AI Mentor,
-functioning as a Therapy Coach, then reviews these sessions, providing
-feedback and suggestions on enhancing therapeutic techniques and patient
-engagement.
-
-\textbf{CARE: Therapy Skill Training} Hsu et al.~(2023) introduced CARE
-(\citeproc{ref-hsu2023helping}{Hsu et al. 2023}), a framework designed
-for therapy skill training. This framework leverages a simulated
-environment, enabling counselors to practice their skills without the
-risk of harming real individuals. An integral component of CARE is the
-AI Mentor, which offers invaluable feedback and guidance during the
-training process. See Figure~\ref{fig-care} for the overview of the
-framework.
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=0.45\linewidth,height=\textheight,keepaspectratio]{src/Figures/care.png}
-
-}
-
-\caption{\label{fig-care}CARE Framework}
-
-\end{figure}%
-
-CARE's primary function is to help novice therapists and counselors
-assess and determine the most effective counseling strategies tailored
-to specific contexts. It provides counselors with customized example
-responses, which they can adopt, adapt, or disregard when interacting
-with a simulated support seeker. This approach is deeply rooted in the
-principles of Motivational Interviewing and utilizes a rich dataset of
-counseling conversations combined with LLMs. The effectiveness of CARE
-has been established through rigorous quantitative evaluations and
-qualitative user studies, which included simulated chats and
-semi-structured interviews. Notably, CARE has shown significant benefits
-in aiding novice counselors. From the assessment, counselors chose to
-use CARE 93\% of the time, directly used a CARE response without editing
-60\% of the time, and sent more extended responses with CARE.
-Qualitatively, counselors noted several advantages of CARE, such as its
-ability to refresh memory on various strategies, inspire innovative
-responses, boost confidence, and save time during consultations.
-However, there were some drawbacks, including potential disruptions in
-the thought process, perceived limitations in response options, the
-requirement for decision-making, and the time needed to review
-suggestions. Overall, the framework is particularly beneficial for
-therapists new to the field, offering them a supportive and educational
-tool to enhance their counseling skills effectively.
-
-\section{Practice Exercises}\label{practice-exercises}
-
-\section*{References}\label{bibliography-6}
-\addcontentsline{toc}{section}{References}
-
-\markright{References}
-
-\phantomsection\label{refs-6}
-\begin{CSLReferences}{1}{0}
-\bibitem[\citeproctext]{ref-amodei2016concrete}
-Amodei, Dario, Chris Olah, Jacob Steinhardt, Paul Christiano, John
-Schulman, and Dan Mane. 2016. {``Concrete Problems in AI Safety.''}
-\emph{arXiv Preprint arXiv:1606.06565}.
-
-\bibitem[\citeproctext]{ref-angwin_machine_2016}
-Angwin, Julia, Jeff Larson, Surya Mattu, and Lauren Kirchner. 2016.
-{``Machine Bias.''} \emph{ProPublica}.
-
-\bibitem[\citeproctext]{ref-arcas_can_2022}
-Arcas, Blaise Aguera y. 2022. {``Can Machines Learn How to Behave?''}
-\emph{Medium}.
-\url{https://medium.com/@blaisea/can-machines-learn-how-to-behave-42a02a57fadb}.
-
-\bibitem[\citeproctext]{ref-aristotle_nicomachean_350}
-Aristotle. 350 B.C.E. \emph{Nicomachean Ethics}. translated by W.D.
-Ross.
-
-\bibitem[\citeproctext]{ref-bai_constitutional_2022}
-Bai, Yuntao, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson
-Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, and
-Cameron McKinnon. 2022. {``Constitutional Ai: {Harmlessness} from Ai
-Feedback.''} \emph{arXiv Preprint arXiv:2212.08073}.
-
-\bibitem[\citeproctext]{ref-barocas_fairness_2019}
-Barocas, Solon, Moritz Hardt, and Arvind Narayanan. 2019. \emph{Fairness
-and Machine Learning}. fairmlbook.org.
-
-\bibitem[\citeproctext]{ref-bernstein2010soylent}
-Bernstein, Michael S., Greg Little, Robert C. Miller, Bjorn Hartmann,
-Mark S. Ackerman, David R. Karger, David Crowell, and Katrina Panovich.
-2010. {``Soylent: A Word Processor with a Crowd Inside.''} In
-\emph{Proceedings of the 23nd Annual ACM Symposium on User Interface
-Software and Technology}. ACM.
-
-\bibitem[\citeproctext]{ref-binns_fairness_2018}
-Binns, Reuben. 2018. {``Fairness in Machine Learning: Lessons from
-Political Philosophy.''} In \emph{Proceedings of the 2018 Conference on
-Fairness, Accountability, and Transparency}, 149--59.
-
-\bibitem[\citeproctext]{ref-bostrom2014superintelligence}
-Bostrom, Nick. 2014. \emph{Superintelligence: Paths, Dangers,
-Strategies}. Oxford University Press.
-
-\bibitem[\citeproctext]{ref-brown2019machine}
-Brown, Daniel S, and Scott Niekum. 2019. {``Machine Teaching for Inverse
-Reinforcement Learning: Algorithms and Applications.''} In
-\emph{Proceedings of the AAAI Conference on Artificial Intelligence},
-33:7749--58.
-
-\bibitem[\citeproctext]{ref-brown2021value}
-Brown, Daniel S, Jordan Schneider, Anca Dragan, and Scott Niekum. 2021.
-{``Value Alignment Verification.''} In \emph{International Conference on
-Machine Learning}, 1105--15. PMLR.
-
-\bibitem[\citeproctext]{ref-spider}
-Chang, Shuaichen, Jun Wang, Mingwen Dong, Lin Pan, Henghui Zhu,
-Alexander Hanbo Li, Wuwei Lan, et al. 2023. {``Dr.spider: A Diagnostic
-Evaluation Benchmark Towards Text-to-SQL Robustness.''}
-\url{https://arxiv.org/abs/2301.08881}.
-
-\bibitem[\citeproctext]{ref-chowdhery_palm_2022}
-Chowdhery, Aakanksha, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav
-Mishra, Adam Roberts, Paul Barham, et al. 2022. {``{PaLM}: {Scaling}
-{Language} {Modeling} with {Pathways}.''} \emph{arXiv:2204.02311
-{[}Cs{]}}, April. \url{http://arxiv.org/abs/2204.02311}.
-
-\bibitem[\citeproctext]{ref-christianoclarifying}
-Christiano, Paul. 2018. {``Clarifying {`AI Alignment'}.''}
-\url{https://ai-alignment.com/clarifying-ai-alignment-cec47cd69dd6}.
-
-\bibitem[\citeproctext]{ref-christiano2017deep}
-Christiano, Paul F, Jan Leike, Tom Brown, Miljan Martic, Shane Legg, and
-Dario Amodei. 2017. {``Deep Reinforcement Learning from Human
-Preferences.''} \emph{Advances in Neural Information Processing Systems}
-30.
-
-\bibitem[\citeproctext]{ref-clark2016faulty}
-Clark, Jack, and Dario Amodei. 2016. {``Faulty Reward Functions in the
-Wild.''} \emph{OpenAI Blog}.
-
-\bibitem[\citeproctext]{ref-dignum_responsible_2019}
-Dignum, Virginia. 2019. \emph{Responsible Artificial Intelligence: How
-to Develop and Use AI in a Responsible Way}. Vol. 2156. Springer.
-
-\bibitem[\citeproctext]{ref-dworkin1988theory}
-Dworkin, Gerald. 1988. \emph{The Theory and Practice of Autonomy}.
-Cambridge University Press.
-
-\bibitem[\citeproctext]{ref-everitt2018alignment}
-Everitt, Tom, and Marcus Hutter. 2018. {``The Alignment Problem for
-Artificial Intelligence.''} In \emph{Advances in Neural Information
-Processing Systems}, 1--8.
-
-\bibitem[\citeproctext]{ref-floridi2011ethics}
-Floridi, Luciano. 2011. \emph{The Ethics of Information}. Oxford
-University Press.
-
-\bibitem[\citeproctext]{ref-frankena1973ethics}
-Frankena, William K. 1973. \emph{Ethics}. Prentice Hall.
-
-\bibitem[\citeproctext]{ref-friedman_value_2008}
-Friedman, Batya, Peter H. Kahn, and Alan Borning. 2008. {``Value
-Sensitive Design and Information Systems.''} In \emph{The Handbook of
-Information and Computer Ethics}. John Wiley \& Sons.
-
-\bibitem[\citeproctext]{ref-gervasio1999learning}
-Gervasio, Melinda T, Wayne Iba, and Pat Langley. 1999. {``Learning User
-Evaluation Functions for Adaptive Scheduling Assistance.''} In
-\emph{ICML}, 152--61. Citeseer.
-
-\bibitem[\citeproctext]{ref-goodall_machine_2014}
-Goodall, Noah J. 2014. {``Machine Ethics and Automated Vehicles.''} In
-\emph{Road Vehicle Automation}, 93--102. Springer.
-
-\bibitem[\citeproctext]{ref-goodman_lampost_2022}
-Goodman, Steven, Erin Buehler, Patrick Clary, Andy Coenen, Aaron Michael
-Donsbach, Tiffanie Horne, Michal Lahav, et al. 2022. {``LaMPost:
-Evaluation of an AI-Assisted Writing Email Editor Prototype for Adults
-with Dyslexia.''}
-
-\bibitem[\citeproctext]{ref-hadfield2016cooperative}
-Hadfield-Menell, Dylan, Stuart J Russell, Pieter Abbeel, and Anca
-Dragan. 2016. {``Cooperative Inverse Reinforcement Learning.''}
-\emph{Advances in Neural Information Processing Systems} 29.
-
-\bibitem[\citeproctext]{ref-hardt_patterns_2021}
-Hardt, Moritz, and Benjamin Recht. 2021. {``Patterns, Predictions, and
-Actions: A Story about Machine Learning.''} \emph{arXiv Preprint
-arXiv:2102.05242}.
-
-\bibitem[\citeproctext]{ref-vanhasselt_deep_2018}
-Hasselt, Hado van, Yotam Doron, Florian Strub, Matteo Hessel, Nicolas
-Sonnerat, and Joseph Modayil. 2018. {``Deep Reinforcement Learning and
-the Deadly Triad.''}
-
-\bibitem[\citeproctext]{ref-hejna2023contrastive}
-Hejna, Joey, Rafael Rafailov, Harshit Sikchi, Chelsea Finn, Scott
-Niekum, W. Bradley Knox, and Dorsa Sadigh. 2023. {``Contrastive
-Preference Learning: Learning from Human Feedback Without RL.''}
-\url{https://arxiv.org/abs/2310.13639}.
-
-\bibitem[\citeproctext]{ref-hendrycks_aligning_2021}
-Hendrycks, Dan, Collin Burns, Steven Basart, Andrew Critch, Jerry Li,
-Dawn Song, and Jacob Steinhardt. 2020. {``Aligning Ai with Shared Human
-Values.''} \emph{arXiv Preprint arXiv:2008.02275}.
-
-\bibitem[\citeproctext]{ref-hendrycks_what_2021}
-Hendrycks, Dan, Mantas Mazeika, Andy Zou, Sahil Patel, Christine Zhu,
-Jesus Navarro, Dawn Song, Bo Li, and Jacob Steinhardt. 2021. {``What
-{Would} {Jiminy} {Cricket} {Do}? {Towards} {Agents} {That} {Behave}
-{Morally}.''} \emph{arXiv:2110.13136 {[}Cs{]}}.
-\url{http://arxiv.org/abs/2110.13136}.
-
-\bibitem[\citeproctext]{ref-hovy-yang-2021-importance}
-Hovy, Dirk, and Diyi Yang. 2021. {``The Importance of Modeling Social
-Factors of Language: Theory and Practice.''} In \emph{Proceedings of the
-2021 Conference of the North American Chapter of the Association for
-Computational Linguistics: Human Language Technologies}, edited by
-Kristina Toutanova, Anna Rumshisky, Luke Zettlemoyer, Dilek Hakkani-Tur,
-Iz Beltagy, Steven Bethard, Ryan Cotterell, Tanmoy Chakraborty, and
-Yichao Zhou, 588--602. Online: Association for Computational
-Linguistics. \url{https://doi.org/10.18653/v1/2021.naacl-main.49}.
-
-\bibitem[\citeproctext]{ref-hsu2023helping}
-Hsu, Shang-Ling, Raj Sanjay Shah, Prathik Senthil, Zahra Ashktorab,
-Casey Dugan, Werner Geyer, and Diyi Yang. 2023. {``Helping the Helper:
-Supporting Peer Counselors via AI-Empowered Practice and Feedback.''}
-\url{https://arxiv.org/abs/2305.08982}.
-
-\bibitem[\citeproctext]{ref-lora}
-Hu, Edward J., Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi
-Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. {``LoRA: Low-Rank
-Adaptation of Large Language Models.''}
-\url{https://arxiv.org/abs/2106.09685}.
-
-\bibitem[\citeproctext]{ref-huang2018establishing}
-Huang, Sandy H, Kush Bhatia, Pieter Abbeel, and Anca D Dragan. 2018.
-{``Establishing Appropriate Trust via Critical States.''} In \emph{2018
-IEEE/RSJ International Conference on Intelligent Robots and Systems
-(IROS)}, 3929--36. IEEE.
-
-\bibitem[\citeproctext]{ref-hubinger2019introduction}
-Hubinger, Evan, Chris van Merwijk, Vladimir Mikulik, Joar Skalse, and
-Scott Garrabrant. 2019. {``An Introduction to Inner Alignment.''}
-\emph{arXiv Preprint arXiv:1906.01820}.
-
-\bibitem[\citeproctext]{ref-jiang_artificial_2017}
-Jiang, Fei, Yong Jiang, Hang Zhi, Yuan Dong, Hui Li, Shugang Ma, and
-Yongan Wang. 2017. {``Artificial Intelligence in Healthcare: Past,
-Present and Future.''} \emph{Stroke and Vascular Neurology} 2 (4):
-230--43.
-
-\bibitem[\citeproctext]{ref-jiang_delphi_2021}
-Jiang, Liwei, Jena D. Hwang, Chandra Bhagavatula, Ronan Le Bras, Maxwell
-Forbes, Jon Borchardt, Jenny Liang, Oren Etzioni, Maarten Sap, and Yejin
-Choi. 2021. {``Delphi: {Towards} {Machine} {Ethics} and {Norms}.''}
-\emph{arXiv:2110.07574 {[}Cs{]}}, October.
-\url{http://arxiv.org/abs/2110.07574}.
-
-\bibitem[\citeproctext]{ref-johnson_kants_2022}
-Johnson, Robert, and Adam Cureton. 2022. {``Kant's {Moral}
-{Philosophy}.''} In \emph{The {Stanford} {Encyclopedia} of
-{Philosophy}}, edited by Edward N. Zalta and Uri Nodelman, Fall 2022.
-Metaphysics Research Lab, Stanford University.
-\url{https://plato.stanford.edu/archives/fall2022/entries/kant-moral/}.
-
-\bibitem[\citeproctext]{ref-krakovna2020specification}
-Krakovna, Victoria et al. 2020. {``Specification Gaming Examples in
-AI.''} \emph{DeepMind Safety Research}.
-
-\bibitem[\citeproctext]{ref-langley1999adaptive}
-Langley, Pat, Cynthia Thompson, Renee Elio, and Afsaneh Haddadi. 1999.
-{``An Adaptive Conversational Interface for Destination Advice.''} In
-\emph{International Workshop on Cooperative Information Agents},
-347--64. Springer.
-
-\bibitem[\citeproctext]{ref-leike2018scalable}
-Leike, Jan, David Krueger, Tom Everitt, Miljan Martic, Vishal Maini, and
-Shane Legg. 2018. {``Scalable Agent Alignment via Reward Modeling: A
-Research Direction.''} \url{https://arxiv.org/abs/1811.07871}.
-
-\bibitem[\citeproctext]{ref-liang_holistic_2023}
-Liang, Percy, Rishi Bommasani, Tony Lee, Dimitris Tsipras, Dilara Soylu,
-Michihiro Yasunaga, Yian Zhang, et al. 2023. {``Holistic {Evaluation} of
-{Language} {Models}.''} arXiv.
-\url{https://doi.org/10.48550/arXiv.2211.09110}.
-
-\bibitem[\citeproctext]{ref-dada}
-Liu, Yanchen, William Held, and Diyi Yang. 2023. {``DADA: Dialect
-Adaptation via Dynamic Aggregation of Linguistic Rules.''}
-\url{https://arxiv.org/abs/2305.13406}.
-
-\bibitem[\citeproctext]{ref-mazeika_how_2022}
-Mazeika, Mantas, Eric Tang, Andy Zou, Steven Basart, Jun Shern Chan,
-Dawn Song, David Forsyth, Jacob Steinhardt, and Dan Hendrycks. 2022.
-{``How {Would} {The} {Viewer} {Feel}? {Estimating} {Wellbeing} {From}
-{Video} {Scenarios}.''} \emph{arXiv Preprint arXiv:2210.10039}.
-
-\bibitem[\citeproctext]{ref-mehrabi_survey_2021}
-Mehrabi, Ninareh, Fred Morstatter, Nripsuta Saxena, Kristina Lerman, and
-Aram Galstyan. 2021. {``A Survey on Bias and Fairness in Machine
-Learning.''} \emph{ACM Computing Surveys (CSUR)} 54 (6): 1--35.
-
-\bibitem[\citeproctext]{ref-mill_utilitarianism_1863}
-Mill, John Stuart. 1863. \emph{Utilitarianism}. Parker, Son,; Bourn.
-
-\bibitem[\citeproctext]{ref-moerland_emotion_2018}
-Moerland, Thomas M, Joost Broekens, and Catholijn M Jonker. 2018.
-{``Emotion in Reinforcement Learning Agents and Robots: A Survey.''}
-\emph{Machine Learning} 107:443--80.
-
-\bibitem[\citeproctext]{ref-Morris2019HITL}
-Morris, Meredith Ringel. 2019. {``Human-in-the-Loop Computing:
-Reimagining Human-Computer Interaction in the Age of AI.''} In
-\emph{Proceedings of the 2019 CHI Conference on Human Factors in
-Computing Systems}. ACM.
-
-\bibitem[\citeproctext]{ref-muller_participatory_2003}
-Muller, Michael J. 2003. {``Participatory Design: The Third Space in
-HCI.''} In \emph{The Human-Computer Interaction Handbook}. CRC Press.
-
-\bibitem[\citeproctext]{ref-ngo2023alignment}
-Ngo, Richard, Lawrence Chan, and Sören Mindermann. 2023. {``The
-Alignment Problem from a Deep Learning Perspective.''}
-\url{https://arxiv.org/abs/2209.00626}.
-
-\bibitem[\citeproctext]{ref-noble_algorithms_2018}
-Noble, Safiya Umoja. 2018. \emph{Algorithms of Oppression: How Search
-Engines Reinforce Racism}. NYU Press.
-
-\bibitem[\citeproctext]{ref-nussbaum1993quality}
-Nussbaum, Martha C, and Amartya Sen. 1993. \emph{The Quality of Life}.
-Oxford University Press.
-
-\bibitem[\citeproctext]{ref-oneil_weapons_2016}
-O'Neil, Cathy. 2016. \emph{Weapons of Math Destruction: How Big Data
-Increases Inequality and Threatens Democracy}. Crown Publishing Group.
-
-\bibitem[\citeproctext]{ref-ouyang_training_2022}
-Ouyang, Long, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright,
-Pamela Mishkin, Chong Zhang, et al. 2022. {``Training Language Models to
-Follow Instructions with Human Feedback.''}
-
-\bibitem[\citeproctext]{ref-lamport2017lampost}
-Project, LaMPort. 2017. {``LaMPost: Leveraging Crowdsourcing for Natural
-Language Processing.''} In \emph{Proceedings of the 2017 Conference on
-Empirical Methods in Natural Language Processing}. ACL.
-
-\bibitem[\citeproctext]{ref-rawls1971theory}
-Rawls, John. 1971. \emph{A Theory of Justice}. Harvard University Press.
-
-\bibitem[\citeproctext]{ref-rogers1999adaptive}
-Rogers, Seth, Claude-Nicolas Fiechter, and Pat Langley. 1999. {``An
-Adaptive Interactive Agent for Route Advice.''} In \emph{Proceedings of
-the Third Annual Conference on Autonomous Agents}, 198--205.
-
-\bibitem[\citeproctext]{ref-russell2019human}
-Russell, Stuart. 2019. \emph{Human Compatible: Artificial Intelligence
-and the Problem of Control}. Viking.
-
-\bibitem[\citeproctext]{ref-sadigh2017active}
-Sadigh, Dorsa, Anca Dragan, Shankar Sastry, and Sanjit Seshia. 2017.
-{``Active Preference-Based Learning of Reward Functions.''}
-
-\bibitem[\citeproctext]{ref-sap_socialIQA_2019}
-Sap, Maarten, Hannah Rashkin, Derek Chen, Ronan LeBras, and Yejin Choi.
-2019. {``Socialiqa: Commonsense Reasoning about Social Interactions.''}
-\emph{arXiv Preprint arXiv:1904.09728}.
-
-\bibitem[\citeproctext]{ref-schwartz1992universals}
-Schwartz, Shalom H. 1992. {``Universals in the Content and Structure of
-Values: Theoretical Advances and Empirical Tests in 20 Countries.''}
-\emph{Advances in Experimental Social Psychology} 25:1--65.
-
-\bibitem[\citeproctext]{ref-shah2022goal}
-Shah, Rohin, Vikrant Varma, Ramana Kumar, Mary Phuong, Victoria
-Krakovna, Jonathan Uesato, and Zac Kenton. 2022. {``Goal
-Misgeneralization: Why Correct Specifications Aren't Enough for Correct
-Goals.''} \url{https://arxiv.org/abs/2210.01790}.
-
-\bibitem[\citeproctext]{ref-stiennon_learning_2020}
-Stiennon, Nisan, Long Ouyang, Jeff Wu, Daniel M. Ziegler, Ryan Lowe,
-Chelsea Voss, Alec Radford, Dario Amodei, and Paul Christiano. 2020.
-{``Learning to Summarize from Human Feedback.''}
-
-\bibitem[\citeproctext]{ref-talat_machine_2022}
-Talat, Zeerak, Hagen Blix, Josef Valvoda, Maya Indira Ganesh, Ryan
-Cotterell, and Adina Williams. 2022. {``On the Machine Learning of
-Ethical Judgments from Natural Language.''} In \emph{Proceedings of the
-2022 {Conference} of the {North} {American} {Chapter} of the
-{Association} for {Computational} {Linguistics}: {Human} {Language}
-{Technologies}}. Association for Computational Linguistics.
-
-\bibitem[\citeproctext]{ref-tomasello_becoming_2019}
-Tomasello, Michael. 2019. \emph{Becoming Human: {A} Theory of Ontogeny}.
-Cambridge, MA: Belknap Press.
-
-\bibitem[\citeproctext]{ref-vamplew_human-aligned_2018}
-Vamplew, Peter, Richard Dazeley, Cameron Foale, Sally Firmin, and Jane
-Mummery. 2018. {``Human-Aligned Artificial Intelligence Is a
-Multiobjective Problem.''} \emph{Ethics and Information Technology} 20
-(1): 27--40. \url{https://doi.org/10.1007/s10676-017-9440-6}.
-
-\bibitem[\citeproctext]{ref-vamplew_scalar_2022}
-Vamplew, Peter, Benjamin J. Smith, Johan Källström, Gabriel Ramos,
-Roxana Rădulescu, Diederik M. Roijers, Conor F. Hayes, et al. 2022.
-{``Scalar Reward Is Not Enough: A Response to {Silver}, {Singh},
-{Precup} and {Sutton} (2021).''} \emph{Autonomous Agents and Multi-Agent
-Systems} 36 (2): 41. \url{https://doi.org/10.1007/s10458-022-09575-5}.
-
-\bibitem[\citeproctext]{ref-weidinger_artificial_2022}
-Weidinger, Laura, Madeline G. Reinecke, and Julia Haas. 2022.
-{``Artificial Moral Cognition: {Learning} from Developmental
-Psychology.''} Preprint. PsyArXiv.
-\url{https://doi.org/10.31234/osf.io/tnf4e}.
-
-\bibitem[\citeproctext]{ref-enwiki:1185176830}
-Wikipedia contributors. 2023. {``AI Alignment --- {Wikipedia}{,} the
-Free Encyclopedia.''}
-\url{https://en.wikipedia.org/w/index.php?title=AI_alignment&oldid=1185176830}.
-
-\bibitem[\citeproctext]{ref-xiong_achieving_2016}
-Xiong, Wayne, Jasha Droppo, Xuedong Huang, Frank Seide, Mike Seltzer,
-Andreas Stolcke, Dong Yu, and Geoffrey Zweig. 2016. {``Achieving Human
-Parity in Conversational Speech Recognition.''} \emph{arXiv Preprint
-arXiv:1610.05256}.
-
-\bibitem[\citeproctext]{ref-ziebart_modeling_2010}
-Ziebart, Brian D. 2010. {``Modeling Purposeful Adaptive Behavior with
-the Principle of Maximum Causal Entropy.''} PhD Thesis, Pittsburgh, PA:
-Carnegie Mellon University.
-
-\bibitem[\citeproctext]{ref-mv}
-Ziems, Caleb, William Held, Jingfeng Yang, Jwala Dhamala, Rahul Gupta,
-and Diyi Yang. 2023. {``Multi-VALUE: A Framework for Cross-Dialectal
-English NLP.''} \url{https://arxiv.org/abs/2212.08011}.
-
-\end{CSLReferences}
-
-\bookmarksetup{startatroot}
-
-\chapter{Conclusion}\label{conclusion}
-
-\phantomsection\label{sec-ack}
-\bookmarksetup{startatroot}
-
-\chapter*{Acknowledgments}
-\addcontentsline{toc}{chapter}{Acknowledgments}
-
-\markboth{Acknowledgments}{Acknowledgments}
-
-Initial versions of this book were compiled as lecture notes to the
-class CS329H: Machine Learning from Human Preferences at Stanford
-University taught in Fall 2023 and Fall 2024. We thank Rehaan Ahmad,
-Ahmed Ahmed, Jirayu Burapacheep, Michael Byun, Akash Chaurasia, Andrew
-Conkey, Tanvi Deshpande, Eric Han, Laya Iyer, Adarsh Jeewajee, Shreyas
-Kar, Arjun Karanam, Jared Moore, Aashiq Muhamed, Bidipta Sarkar, William
-Shabecoff, Stephan Sharkov, Max Sobol Mark, Kushal Thaman, Joe Vincent,
-Yibo Zhang, Duc Nguyen, Grace Sodunke, Ky Nguyen, and Mykkel
-Kochenderfer for their early contributions and feedback.
-
-\phantomsection\label{3ade8a4a-fb1d-4a6c-8409-ac45482d5fc9}
-
-
-
-% \usepackage[left=1in,marginparwidth=2.0666666666667in,textwidth=4.1333333333333in,marginparsep=0.3in]{geometry}
-% \index{independent variable|seealso{manipulation, treatment}}
-% \index{manipulation|seealso{independent variable, treatment}}
-% \index{treatment|seealso{independent variable, manipulation}}
-
-% \index{dependent variable|seealso{measure, outcome}}
-% \index{measure|seealso{dependent variable, outcome}}
-% \index{outcome|seealso{dependent variable, measure}}
-
-\index{de-identification|seealso{anonymization}}
-\index{anonymization|seealso{de-identification}}
-
-\index{analytic flexibility|seealso{p-hacking}}
-\index{p-hacking|seealso{analytic flexibility}}
-
-\index{Cohen's d|seealso{standardized mean difference (SMD)}}
-\index{standardized mean difference (SMD)|seealso{Cohen's d}}
-
-\index{APA|see{American Psychological Association (APA)}}
-\index{CDI|see{Communicative Development Inventory}}
-\index{DAG|see{directed acyclic graph (DAG)}}
-\index{blinding|see{masking}}
-
-\newgeometry{
-  centering,                             % split margins equally
-  margin=.6in,                           % margins (must all be at least .5in)
-  includemp, includehead,                % include sidenotes & header in body
-  % showframe                              % show page structure (for debugging)
-  % left=1in,
-  marginparwidth=0in,marginparsep=0.3in%,textwidth=4.1333333333333in
-}
-
-% \addtogeometry{}
-\printindex
-\restoregeometry{}
-
-
-\end{document}
diff --git a/README.md b/README.md
index 9fb0422..6e8ebe2 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Machine Learning from Human Preferences
 ### Sang Truong, Andreas Haupt, and Sanmi Koyejo
 
-This book is available online at: [ai.stanford.edu/~sttruong/mlhp](https://ai.stanford.edu/~sttruong/mlhp)
+This book is available online at: [mlhp.stanford.edu](https://mlhp.stanford.edu/)
 
 ## Prerequisites
 
@@ -137,8 +137,10 @@ source /lfs/local/0/sttruong/miniconda3/etc/profile.d/conda.sh
 conda activate mlhp
 cd /lfs/skampere2/0/sttruong/mlhp
 git pull
-quarto render --to html --profile html
-rsync -av --delete _book/ /afs/cs/group/koyejolab/mlhp/www/
+quarto render --to pdf --profile pdf
+quarto render --to html --profile html --no-clean
+rsync -av --delete --no-perms _book/ /afs/cs/group/koyejolab/mlhp/www/
+
 ```
 
 ## Troubleshooting
diff --git a/_quarto-pdf.yml b/_quarto-pdf.yml
index aeae93a..ca09255 100644
--- a/_quarto-pdf.yml
+++ b/_quarto-pdf.yml
@@ -1,4 +1,2 @@
 book:
-  appendices:
-    # only for latex output
-    # - example.qmd
+  appendices: []
diff --git a/_quarto.yml b/_quarto.yml
index a016b73..b251c36 100644
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -1,10 +1,12 @@
 project:
   type: book
+  output-dir: _book
   preview:
     port: 4200
 
 resources:
   - CNAME
+  - "*.pdf"
   
 reference-location: document # footnotes in margin
 citation-location: document # citations in margin
diff --git a/cs329h.Rproj b/cs329h.Rproj
deleted file mode 100644
index cefe12c..0000000
--- a/cs329h.Rproj
+++ /dev/null
@@ -1,13 +0,0 @@
-Version: 1.0
-
-RestoreWorkspace: Default
-SaveWorkspace: Default
-AlwaysSaveHistory: Default
-
-EnableCodeIndexing: Yes
-UseSpacesFortbl- Yes
-NumSpacesFortbl- 2
-Encoding: UTF-8
-
-RnwWeave: Sweave
-LaTeX: XeLaTeX