Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions elixir/bench/tokenize_word_bench.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Benchmark script for `Monkey.Lexer.init/1`.
#
# The sample below is a small Monkey program that touches every token kind the
# lexer knows about: one- and two-character operators, delimiters, keywords,
# integers, plain identifiers, and an identifier (`returnfoo`) that merely
# starts with a keyword.
input = """
let result = add(five, ten);
=+(){},;
let five = 5;
let ten = 10;
let add = fn(x, y) {
x + y;
};
let result = add(five, ten);
!-/*5;
5 < 10 > 5;
if (5 < 10) {
return true;
} else {
return false;
}
10 == 10;
11 != 10;
return 5;
return 10;
return add(15);
let returnfoo = 1;
return returnfoo;
"""

# Optional parity check between the old and the new lexer. Uncomment it together
# with the "OldLexer" job below once a `Monkey.OldLexer` module is available.
#
# expected_tokens = Monkey.OldLexer.init(input)
# actual_tokens = Monkey.Lexer.init(input)
#
# if actual_tokens != expected_tokens do
# Enum.zip([actual_tokens, expected_tokens])
# |> Enum.each(fn
# {t, t} -> IO.inspect(t)
# {actual, expected} ->
# IO.inspect([actual: actual, expected: expected])
# raise "mismatched output"
# end)
# end

# Jobs to measure, keyed by the label Benchee prints in its report.
lexer_jobs = %{
  # To compare against the previous implementation, copy + paste the old code
  # into `Monkey.OldLexer` (not checked in) and uncomment the next line.
  # "OldLexer" => fn -> Monkey.OldLexer.init(input) end,
  "Lexer" => fn -> Monkey.Lexer.init(input) end
}

# Measurement phases passed straight through to Benchee (see the Benchee
# configuration docs for the meaning and unit of each option).
bench_opts = [
  warmup: 20,
  time: 20,
  memory_time: 5,
  reduction_time: 5
]

Benchee.run(lexer_jobs, bench_opts)
160 changes: 93 additions & 67 deletions elixir/lib/lexer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ defmodule Monkey.Lexer do

## Example

iex> Lexer.init("let five = 5;")
iex> Monkey.Lexer.init("let five = 5;")
Copy link
Copy Markdown
Contributor

@ryanwinchester ryanwinchester Jun 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't need that if the alias is in the test ¯\_(ツ)_/¯, but fine with me either way

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's weird: it works fine in ExUnit, but when I had it in a Livebook, it complained about it. I think it's a bug in Livebook, though. Adding it makes it work either way 🤷‍♂️

[
:let,
{:ident, "five"},
Expand All @@ -68,84 +68,110 @@ defmodule Monkey.Lexer do
lex(input, [])
end

# Recursive base-case. When the input is empty, we add an EOF token.
#
# It is more efficient to prepend to a list and reverse it than to append to
# the list while building it. Lists here are linked-lists, so the whole list
# would be copied each time we append to it.
# See: https://www.erlang.org/doc/efficiency_guide/listhandling
@spec lex(input :: String.t(), [token()]) :: [token()]
defp lex(<<>>, tokens) do
[:eof | tokens] |> Enum.reverse()
def lex(input, tokens) do
# Tail-recursively walks `input`, tokenizing the character(s) at its front and
# prepending each token to `tokens`. When the input is exhausted, `:eof` is
# prepended and the accumulated list is reversed into source order.
#
# This is pure tail recursion, rather than a tail-recursive main function with
# a helper that extracts one token. Every branch is handed `tokens` and is
# responsible for calling `lex/2` again to keep iterating over `input`; nothing
# returns until the end of the input is reached.
# Compared with a helper that returns `{token, rest}`, this avoids:
# - building a `{token, rest}` tuple just to return two values
# - pattern matching on that tuple to take the values out again
# - allocating that tuple on the heap (and garbage-collecting it later)
# NOTE(review): the original notes also state that the BEAM then uses only
# `call_only` instructions, with no stack allocation or continuation-pointer
# management and all arguments passed via registers — confirm against the
# disassembled BEAM code before relying on it.
#
# All matching is done in this one place so that the compiler sees every
# pattern at once. If it were spread between a main function and a tokenizing
# helper, the input would have to be matched more than once.
# NOTE(review): the expected gain in generated lookup code comes from the
# author's benchmark notes, not from anything visible in this function.
#
# It is more efficient to prepend to a list and reverse it than to append to
# the list while building it. Lists here are linked-lists, so the whole list
# would be copied each time we append to it.
# See: https://www.erlang.org/doc/efficiency_guide/listhandling
#
# Uses binary pattern-matching to match on the first character(s).
# e.g. `<<c::8, rest::binary>>` matches on the first 8 bits and assigns it
# to `c`, then assigns the rest of the binary to `rest`.
# For more details, see: https://hexdocs.pm/elixir/Kernel.SpecialForms.html#<<>>/1
#
# The same logic could be written as multiple function heads instead of a case
# statement (per the original notes the compiled BEAM code is identical either
# way — TODO confirm). The benefit of the case statement is that each clause
# only has to describe the string match and does not have to repeat `tokens`,
# which is already bound on entry to the function.
#
# Clause order matters: earlier clauses win, so longer or more specific
# patterns must come before the patterns they overlap with.
case input do
# Recursive base-case. When the input is empty, we add an EOF token.
<<>> -> [:eof | tokens] |> Enum.reverse()
# Ignore whitespace.
<<c::8, rest::binary>> when is_whitespace(c) -> lex(rest, tokens)
# Two-character operators are tried before their one-character prefixes
# (`=` and `!`) further down.
<<"==", rest::binary>> -> lex(rest, [:equal | tokens])
<<"!=", rest::binary>> -> lex(rest, [:not_equal | tokens])
<<";", rest::binary>> -> lex(rest, [:semicolon | tokens])
<<",", rest::binary>> -> lex(rest, [:comma | tokens])
<<"(", rest::binary>> -> lex(rest, [:lparen | tokens])
<<")", rest::binary>> -> lex(rest, [:rparen | tokens])
<<"{", rest::binary>> -> lex(rest, [:lsquirly | tokens])
<<"}", rest::binary>> -> lex(rest, [:rsquirly | tokens])
<<"=", rest::binary>> -> lex(rest, [:assign | tokens])
<<"+", rest::binary>> -> lex(rest, [:plus | tokens])
<<"-", rest::binary>> -> lex(rest, [:minus | tokens])
<<"!", rest::binary>> -> lex(rest, [:bang | tokens])
<<"/", rest::binary>> -> lex(rest, [:slash | tokens])
<<"*", rest::binary>> -> lex(rest, [:asterisk | tokens])
<<">", rest::binary>> -> lex(rest, [:greater_than | tokens])
<<"<", rest::binary>> -> lex(rest, [:less_than | tokens])
# Keyword prefixes are tried before the generic letter clause. Each one passes
# the keyword's byte length and the untouched `input` to `maybe_keyword/5`,
# which decides whether this is the keyword itself or a longer identifier.
<<"fn", rest::binary>> -> maybe_keyword(rest, byte_size("fn"), input, :function, tokens)
<<"let", rest::binary>> -> maybe_keyword(rest, byte_size("let"), input, :let, tokens)
<<"if", rest::binary>> -> maybe_keyword(rest, byte_size("if"), input, :if, tokens)
<<"else", rest::binary>> -> maybe_keyword(rest, byte_size("else"), input, :else, tokens)
<<"true", rest::binary>> -> maybe_keyword(rest, byte_size("true"), input, true, tokens)
<<"false", rest::binary>> -> maybe_keyword(rest, byte_size("false"), input, false, tokens)
<<"return", rest::binary>> -> maybe_keyword(rest, byte_size("return"), input, :return, tokens)
# Identifiers and numbers start with one byte already consumed (length 1) and
# keep `input` so the whole lexeme can be sliced out once its end is known.
<<c::8, rest::binary>> when is_letter(c) -> identifier(rest, 1, input, tokens)
<<c::8, rest::binary>> when is_digit(c) -> number(rest, 1, input, tokens)
# Any other byte becomes an `:illegal` token holding that single byte.
<<c::8, rest::binary>> -> lex(rest, [{:illegal, <<c>>} | tokens])
end
end

# Ignore whitespace.
defp lex(<<c::8, rest::binary>>, tokens) when is_whitespace(c) do
lex(rest, tokens)
# Called once `lex/2` has matched the start of a keyword. `prefix_len` is the
# byte length of that keyword and `source` is the input beginning at the
# keyword itself. The next byte decides whether the keyword ends here: if it
# does, the keyword token can be emitted directly; if not, the word is a plain
# identifier that merely starts with the keyword.
#
# This clause handles the "not a keyword" case: another letter follows (for
# example `returnfoo`), so count that letter as part of the word and carry on
# reading it as an identifier.
@spec maybe_keyword(String.t(), integer(), String.t(), token(), [token()]) :: [token()]
defp maybe_keyword(<<next::8, tail::binary>>, prefix_len, source, _keyword, acc)
     when is_letter(next),
     do: identifier(tail, prefix_len + 1, source, acc)

# Tail-recursively go through the input, tokenizing the character(s).
defp lex(input, tokens) do
{token, rest} = tokenize(input)
lex(rest, [token | tokens])
# Fallback clause: whatever follows the keyword is not a letter (or nothing
# follows at all), so the keyword stands on its own. Emit it and resume lexing
# from the input after the keyword.
defp maybe_keyword(tail, _prefix_len, _source, keyword, acc),
  do: lex(tail, [keyword | acc])

# Reads exactly one token from the front of the input and returns it together
# with the remaining input as `{token, rest}`.
#
# Uses binary pattern-matching to match on the first character(s).
# e.g. `<<c::8, rest::binary>>` matches on the first 8 bits and assigns it
# to `c`, then assigns the rest of the binary to `rest`.
# For more details, see: https://hexdocs.pm/elixir/Kernel.SpecialForms.html#<<>>/1
#
# Clause order matters: the two-character operators `==` and `!=` come before
# the single-character `=` and `!` so that they are not split into two tokens.
# There is no clause for an empty binary; `lex/2` handles the empty input
# before this function is called.
@spec tokenize(input :: String.t()) :: {token(), rest :: String.t()}
defp tokenize(<<"==", rest::binary>>), do: {:equal, rest}
defp tokenize(<<"!=", rest::binary>>), do: {:not_equal, rest}
defp tokenize(<<";", rest::binary>>), do: {:semicolon, rest}
defp tokenize(<<",", rest::binary>>), do: {:comma, rest}
defp tokenize(<<"(", rest::binary>>), do: {:lparen, rest}
defp tokenize(<<")", rest::binary>>), do: {:rparen, rest}
defp tokenize(<<"{", rest::binary>>), do: {:lsquirly, rest}
defp tokenize(<<"}", rest::binary>>), do: {:rsquirly, rest}
defp tokenize(<<"=", rest::binary>>), do: {:assign, rest}
defp tokenize(<<"+", rest::binary>>), do: {:plus, rest}
defp tokenize(<<"-", rest::binary>>), do: {:minus, rest}
defp tokenize(<<"!", rest::binary>>), do: {:bang, rest}
defp tokenize(<<"/", rest::binary>>), do: {:slash, rest}
defp tokenize(<<"*", rest::binary>>), do: {:asterisk, rest}
defp tokenize(<<">", rest::binary>>), do: {:greater_than, rest}
defp tokenize(<<"<", rest::binary>>), do: {:less_than, rest}
# Words and numbers: the first byte seeds the accumulator handed to the reader.
defp tokenize(<<c::8, rest::binary>>) when is_letter(c), do: read_identifier(rest, <<c>>)
defp tokenize(<<c::8, rest::binary>>) when is_digit(c), do: read_number(rest, <<c>>)
# Any other byte is reported as an `:illegal` token holding that single byte.
defp tokenize(<<c::8, rest::binary>>), do: {{:illegal, <<c>>}, rest}

# Recursively read the input until we hit a non-letter character. Builds an
# iolist, then tokenizes the word.
@spec read_identifier(String.t(), iodata()) :: {token(), String.t()}
defp read_identifier(<<c::8, rest::binary>>, acc) when is_letter(c) do
read_identifier(rest, [acc | <<c>>])
# Walks forward over the input for as long as the next byte is a letter.
#
# Nothing is copied into an accumulator while scanning. Only a running byte
# count (`len`) is carried along, together with `source`, the input as it was
# where the identifier began. Once the end of the identifier is found, a single
# binary match of `len` bytes slices the identifier out of `source`. That
# result is a sub-binary: a reference into the original input rather than a
# copy (see the Erlang efficiency guide on binary handling), so scanning
# produces no per-character garbage.
@spec identifier(String.t(), integer(), String.t(), [token()]) :: [token()]
defp identifier(<<next::8, tail::binary>>, len, source, acc) when is_letter(next),
  do: identifier(tail, len + 1, source, acc)

defp read_identifier(rest, acc) do
{IO.iodata_to_binary(acc) |> tokenize_word(), rest}
# End of the identifier: the next byte is not a letter, or the input ran out.
# Slice the first `len` bytes off `source` in a single match, emit them as an
# `:ident` token, and continue lexing with whatever follows.
defp identifier(_unmatched, len, source, acc) do
  <<name::binary-size(len), remaining::binary>> = source
  lex(remaining, [{:ident, name} | acc])
end

# Recursively read the input until we hit a non-digit character. Builds an
# iolist, then tokenizes the number.
@spec read_number(String.t(), iodata()) :: {token(), String.t()}
defp read_number(<<c::8, rest::binary>>, acc) when is_digit(c) do
read_number(rest, [acc | <<c>>])
# Walks forward over the input for as long as the next byte is a digit,
# counting the bytes seen so far (`len`) and keeping `source`, the input as it
# was where the number began, so the digits can be sliced out in one go later.
@spec number(String.t(), integer(), String.t(), [token()]) :: [token()]
defp number(<<next::8, tail::binary>>, len, source, acc) when is_digit(next),
  do: number(tail, len + 1, source, acc)

defp read_number(rest, acc) do
{{:int, IO.iodata_to_binary(acc)}, rest}
# End of the number: the next byte is not a digit, or the input ran out.
# Slice the first `len` bytes off `source` in a single match, emit them
# (still as a string) in an `:int` token, and continue lexing what follows.
defp number(_unmatched, len, source, acc) do
  <<digits::binary-size(len), remaining::binary>> = source
  lex(remaining, [{:int, digits} | acc])
end

# Turns a fully-read word into its token. The reserved words map to their
# keyword tokens; every other word is wrapped as `{:ident, word}`.
@spec tokenize_word(String.t()) :: keyword_token() | {:ident, String.t()}
defp tokenize_word(word) do
  case word do
    "fn" -> :function
    "let" -> :let
    "if" -> :if
    "else" -> :else
    "true" -> true
    "false" -> false
    "return" -> :return
    other -> {:ident, other}
  end
end
end
1 change: 1 addition & 0 deletions elixir/mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ defmodule Monkey.MixProject do
# Run "mix help deps" to learn about dependencies.
defp deps do
[
{:benchee, "~> 1.1", only: [:dev, :test], runtime: false}
# {:dep_from_hexpm, "~> 0.3.0"},
# {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"}
]
Expand Down
5 changes: 5 additions & 0 deletions elixir/mix.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
%{
"benchee": {:hex, :benchee, "1.1.0", "f3a43817209a92a1fade36ef36b86e1052627fd8934a8b937ac9ab3a76c43062", [:mix], [{:deep_merge, "~> 1.0", [hex: :deep_merge, repo: "hexpm", optional: false]}, {:statistex, "~> 1.0", [hex: :statistex, repo: "hexpm", optional: false]}], "hexpm", "7da57d545003165a012b587077f6ba90b89210fd88074ce3c60ce239eb5e6d93"},
"deep_merge": {:hex, :deep_merge, "1.0.0", "b4aa1a0d1acac393bdf38b2291af38cb1d4a52806cf7a4906f718e1feb5ee961", [:mix], [], "hexpm", "ce708e5f094b9cd4e8f2be4f00d2f4250c4095be93f8cd6d018c753894885430"},
"statistex": {:hex, :statistex, "1.0.0", "f3dc93f3c0c6c92e5f291704cf62b99b553253d7969e9a5fa713e5481cd858a5", [:mix], [], "hexpm", "ff9d8bee7035028ab4742ff52fc80a2aa35cece833cf5319009b52f1b5a86c27"},
}