the_annotated_transformer.py — 2 changes: 1 addition & 1 deletion

@@ -433,7 +433,7 @@ def forward(self, x, memory, src_mask, tgt_mask):
 #
 # We also modify the self-attention sub-layer in the decoder stack to
 # prevent positions from attending to subsequent positions. This
-# masking, combined with fact that the output embeddings are offset by
+# masking, combined with the fact that output embeddings are offset by
 # one position, ensures that the predictions for position $i$ can
 # depend only on the known outputs at positions less than $i$.

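For context on the masking the edited comment describes: a minimal sketch, assuming PyTorch, of a subsequent-position mask in the spirit of the `subsequent_mask` helper that the_annotated_transformer.py defines near this passage. The exact shapes and dtypes here are illustrative, not a claim about the file's implementation.

import torch

def subsequent_mask(size):
    "Mask out subsequent positions so position i attends only to positions <= i."
    attn_shape = (1, size, size)
    # The upper triangle above the diagonal marks the future positions to hide.
    hidden = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)
    return hidden == 0  # True where attention is allowed

# For size=3 the mask is lower-triangular: row i permits columns 0..i.
print(subsequent_mask(3))

Combined with shifting the output embeddings right by one position, this mask is what guarantees the autoregressive property the comment states: the prediction at position $i$ can only depend on known outputs at positions less than $i$.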