Skip to content

Commit 3041a23

Browse files
committed
feat(lexer): add string escape sequence handling
- Implement UnescapeString function for processing escape sequences
- Update string tokenization to handle escaped quotes and special characters
- Add comprehensive test cases for various escape sequences
- Fix multi-line string handling with proper position tracking
1 parent 0bae6ee commit 3041a23

2 files changed

Lines changed: 157 additions & 15 deletions

File tree

lexer/lexer.go

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package lexer
22

33
import (
4+
"fmt"
5+
"strconv"
46
"strings"
57
"unicode"
68

@@ -182,6 +184,34 @@ func IsBinaryOperator(r rune) bool {
182184
}
183185
}
184186

187+
// UnescapeString expands Go escape sequences (\n, \t, \", \uXXXX, ...)
// contained in s using strconv.Unquote.
//
// Two input shapes are accepted:
//   - a value already wrapped in double quotes: escapes are expanded and
//     the surrounding quotes are preserved in the result;
//   - a bare value: it is unescaped directly and returned without quotes.
//
// An empty s yields "" with no error. If the (possibly wrapped) input is
// not a valid Go string literal, the error from strconv.Unquote is returned
// along with an empty string.
func UnescapeString(s string) (string, error) {
	if s == "" {
		return "", nil
	}

	// "Quoted" means the raw input both begins and ends with a double quote.
	quoted := s[0] == '"' && s[len(s)-1] == '"'

	literal := s
	if !quoted {
		// Bare input: wrap it so strconv.Unquote will accept it.
		literal = fmt.Sprintf("\"%s\"", s)
	}

	unescaped, err := strconv.Unquote(literal)
	if err != nil {
		return "", err
	}

	if !quoted {
		return unescaped, nil
	}
	// Restore the delimiters that Unquote stripped.
	return fmt.Sprintf("\"%s\"", unescaped), nil
}
214+
185215
func IsComparisonOperator(r string) bool {
186216
switch r {
187217
case "==", ">", "<", "!=", "<=", ">=":
@@ -366,7 +396,7 @@ func Tokenize(srcCode string) ([]Token, *errors.LexerError) {
366396
for position < srcLen {
367397
loopRune := runes[position] // Current rune from source
368398

369-
if loopRune == quoteRune {
399+
if loopRune == quoteRune && runes[position-1] != '\\' { // Not an escaped quote
370400
position++ // Consume the closing quote from runes
371401
currentColumn++ // The closing quote itself advances column
372402
foundEndQuote = true
@@ -392,7 +422,24 @@ func Tokenize(srcCode string) ([]Token, *errors.LexerError) {
392422
}
393423

394424
if foundEndQuote {
395-
fullLiteral := openingQuoteStr + strContentBuilder.String() + string(quoteRune)
425+
stringified := strContentBuilder.String()
426+
finalBuilder := strings.Builder{}
427+
splitted := strings.Split(strings.ReplaceAll(strings.ReplaceAll(stringified, "\r\n", "\n"), "\r", "\n"), "\n")
428+
for i, line := range splitted {
429+
unescapedLiteral, err := UnescapeString(line)
430+
if err != nil {
431+
return nil, &errors.LexerError{
432+
Character: quoteRune,
433+
Pos: unclosedStringErrorPos,
434+
Message: err.Error(),
435+
}
436+
}
437+
finalBuilder.WriteString(unescapedLiteral)
438+
if i != len(splitted)-1 {
439+
finalBuilder.WriteString("\n")
440+
}
441+
}
442+
fullLiteral := openingQuoteStr + finalBuilder.String() + string(quoteRune)
396443
tokens = append(tokens, NewToken(fullLiteral, String, tokStartLine, tokStartCol, currentLine, currentColumn))
397444
} else {
398445
return nil, &errors.LexerError{

lexer/lexer_test.go

Lines changed: 108 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,110 @@ func TestTokenFactory(t *testing.T) {
2222
}
2323
}
2424

25+
func TestUnescapeString(t *testing.T) {
26+
tests := []struct {
27+
name string
28+
input string
29+
want string
30+
wantErr bool
31+
}{
32+
{
33+
name: "Simple Unescape",
34+
input: `"hello"`,
35+
want: `"hello"`,
36+
wantErr: false,
37+
},
38+
{
39+
name: "Direct Unescape",
40+
input: `hello`,
41+
want: `hello`,
42+
wantErr: false,
43+
},
44+
{
45+
name: "Escape Sequences",
46+
input: `"\n\t"`,
47+
want: "\"\n\t\"",
48+
wantErr: false,
49+
},
50+
{
51+
name: "Empty String",
52+
input: `""`,
53+
want: `""`,
54+
wantErr: false,
55+
},
56+
{
57+
name: "Invalid String",
58+
input: `"hello`,
59+
want: "",
60+
wantErr: true,
61+
},
62+
{
63+
name: "More Escape Sequences",
64+
input: `\n\t\r`,
65+
want: "\n\t\r",
66+
wantErr: false,
67+
},
68+
}
69+
70+
for i, tt := range tests {
71+
tt := tt
72+
t.Run(tt.name, func(t *testing.T) {
73+
// t.Parallel()
74+
got, err := lexer.UnescapeString(tt.input)
75+
if (err != nil) != tt.wantErr {
76+
t.Errorf("UnescapeString() %d error = %v, wantErr %v", i + 1, err, tt.wantErr)
77+
return
78+
}
79+
if got != tt.want {
80+
t.Errorf("UnescapeString() %d got = %v, want %v", i + 1, got, tt.want)
81+
}
82+
})
83+
}
84+
}
85+
2586
func TestTokenize(t *testing.T) {
2687
tests := []struct {
2788
name string
2889
input string
2990
want []lexer.Token
3091
wantErr bool
3192
}{
93+
{
94+
name: "String Escape Sequences",
95+
input: `"\n\t\r"`,
96+
want: []lexer.Token{
97+
lexer.NewToken("\"\n\t\r\"", lexer.String, 1, 1, 1, 9),
98+
lexer.NewToken("<EOF>", lexer.EOF, 1, 9, 1, 9),
99+
},
100+
wantErr: false,
101+
},
102+
{
103+
name: "Multiple Escape Sequences",
104+
input: `"Hello\nWorld\t!\"'"`,
105+
want: []lexer.Token{
106+
lexer.NewToken("\"Hello\nWorld\t!\"'\"", lexer.String, 1, 1, 1, 21),
107+
lexer.NewToken("<EOF>", lexer.EOF, 1, 21, 1, 21),
108+
},
109+
wantErr: false,
110+
},
111+
{
112+
name: "All Escape Sequences",
113+
input: `"\\\\n\t\r\b\f\\\"'"`,
114+
want: []lexer.Token{
115+
lexer.NewToken("\"\\\\n\t\r\b\f\\\"'\"", lexer.String, 1, 1, 1, 21),
116+
lexer.NewToken("<EOF>", lexer.EOF, 1, 21, 1, 21),
117+
},
118+
wantErr: false,
119+
},
120+
{
121+
name: "Unicode Escape Sequence",
122+
input: `"\u2388 <- UNICODE"`,
123+
want: []lexer.Token{
124+
lexer.NewToken("\"\u2388 <- UNICODE\"", lexer.String, 1, 1, 1, 20),
125+
lexer.NewToken("<EOF>", lexer.EOF, 1, 20, 1, 20),
126+
},
127+
wantErr: false,
128+
},
32129
{
33130
name: "Simple Arithmetic Tokenization",
34131
input: "(4+2)*3",
@@ -247,8 +344,6 @@ func TestTokenize(t *testing.T) {
247344
},
248345
{
249346
name: "String with newline",
250-
// CURRENT LEXER BEHAVIOR: Treats '\\' and 'n' as two separate characters.
251-
// It does NOT process "\\n" into an actual newline character.
252347
input: "let s = \"line1\\nline2\";",
253348
want: []lexer.Token{
254349
lexer.NewToken("let", lexer.Let, 1, 1, 1, 4),
@@ -263,7 +358,7 @@ func TestTokenize(t *testing.T) {
263358
// "line2" (5 chars) -> curL=1, curC=22
264359
// Consume ". curL=1, curC=23.
265360
// So, EndLine=1, EndCol=23.
266-
lexer.NewToken("\"line1\\nline2\"", lexer.String, 1, 9, 1, 23),
361+
lexer.NewToken("\"line1\nline2\"", lexer.String, 1, 9, 1, 23),
267362
lexer.NewToken(";", lexer.SemiColon, 1, 23, 1, 24),
268363
lexer.NewToken("<EOF>", lexer.EOF, 1, 24, 1, 24),
269364
},
@@ -289,34 +384,34 @@ func TestTokenize(t *testing.T) {
289384
// d -> curL=2, curC=3
290385
// ' Consume '. curL=2, curC=4
291386
// So, EndLine=2, EndCol=4
292-
lexer.NewToken("'ab\r\ncd'", lexer.String, 1, 9, 2, 4),
387+
lexer.NewToken("'ab\ncd'", lexer.String, 1, 9, 2, 4),
293388
lexer.NewToken(";", lexer.SemiColon, 2, 4, 2, 5),
294389
lexer.NewToken("<EOF>", lexer.EOF, 2, 5, 2, 5),
295390
},
296391
wantErr: false,
297392
},
298393
}
299394

300-
for _, tt := range tests {
395+
for i, tt := range tests {
301396
tt := tt
302397
t.Run(tt.name, func(t *testing.T) {
303-
t.Parallel()
398+
// t.Parallel()
304399

305400
got, err := lexer.Tokenize(tt.input)
306401

307402
if (err != nil) != tt.wantErr {
308-
t.Fatalf("Tokenize() error = %v, wantErr %v", err, tt.wantErr)
403+
t.Fatalf("Tokenize() %d error = %v, wantErr %v", i + 1, err, tt.wantErr)
309404
}
310405

311406
if !tt.wantErr {
312407
if !reflect.DeepEqual(got, tt.want) {
313408
if len(got) != len(tt.want) {
314-
t.Errorf("Tokenize() token count mismatch:\nexpected: %d tokens\n got: %d tokens", len(tt.want), len(got))
409+
t.Errorf("Tokenize() %d token count mismatch:\nexpected: %d tokens\n got: %d tokens", i + 1, len(tt.want), len(got))
315410
t.Logf("Expected tokens: %+v", tt.want)
316411
t.Logf("Got tokens: %+v", got)
317412
for i := 0; i < min(len(got), len(tt.want)); i++ {
318413
if !reflect.DeepEqual(got[i], tt.want[i]) {
319-
t.Errorf("Token %d mismatch (first diff):\nexpected: %+v\n got: %+v", i, tt.want[i], got[i])
414+
t.Errorf("Tokenize() %d mismatch (first diff):\nexpected: %+v\n got: %+v", i + 1, tt.want[i], got[i])
320415
}
321416
}
322417
if len(got) > len(tt.want) {
@@ -325,10 +420,10 @@ func TestTokenize(t *testing.T) {
325420
t.Errorf("Missing tokens, expected: %+v", tt.want[len(got):])
326421
}
327422
} else {
328-
for i := range got {
329-
if !reflect.DeepEqual(got[i], tt.want[i]) {
330-
t.Errorf("Tokenize() mismatch at index %d:\nexpected: %+v (%s)\n got: %+v (%s)",
331-
i, tt.want[i], lexer.Stringify(tt.want[i].Type), got[i], lexer.Stringify(got[i].Type))
423+
for j := range got {
424+
if !reflect.DeepEqual(got[j], tt.want[j]) {
425+
t.Errorf("Tokenize() %d mismatch at index %d:\nexpected: %+v (%s)\n got: %+v (%s)",
426+
i + 1, j, tt.want[j], lexer.Stringify(tt.want[j].Type), got[j], lexer.Stringify(got[j].Type))
332427
break
333428
}
334429
}

0 commit comments

Comments (0)