diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/ExpressionToken.java b/src/main/java/com/hubspot/jinjava/tree/parse/ExpressionToken.java index d8d9996d5..1c0d679c0 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/ExpressionToken.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/ExpressionToken.java @@ -54,7 +54,14 @@ public int getType() { @Override protected void parse() { - this.expr = WhitespaceUtils.unwrap(image, "{{", "}}"); + // Use the symbols-derived delimiter strings instead of the hardcoded "{{" / "}}" + // so that custom delimiters (e.g. "\VAR{" / "}") are stripped correctly. + this.expr = + WhitespaceUtils.unwrap( + image, + getSymbols().getExpressionStart(), + getSymbols().getExpressionEnd() + ); this.expr = handleTrim(expr); this.expr = StringUtils.trimToEmpty(this.expr); } diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/NoteToken.java b/src/main/java/com/hubspot/jinjava/tree/parse/NoteToken.java index 3f5360e67..450f9ccbd 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/NoteToken.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/NoteToken.java @@ -48,8 +48,11 @@ public int getType() { */ @Override protected void parse() { - if (image.length() > 4) { // {# #} - handleTrim(image.substring(2, image.length() - 2)); + int startLen = getSymbols().getCommentStartLength(); + int endLen = getSymbols().getCommentEndLength(); + + if (image.length() > startLen + endLen) { + handleTrim(image.substring(startLen, image.length() - endLen)); } content = ""; } diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbols.java b/src/main/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbols.java new file mode 100644 index 000000000..242abd241 --- /dev/null +++ b/src/main/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbols.java @@ -0,0 +1,269 @@ +/********************************************************************** + * Copyright (c) 2014 HubSpot Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **********************************************************************/ +package com.hubspot.jinjava.tree.parse; + +/** + * A {@link TokenScannerSymbols} implementation that supports arbitrary multi-character + * delimiter strings, addressing + * issue #195. + * + *

<p>Unlike {@link DefaultTokenScannerSymbols}, which is constrained to single-character + * prefixes and postfixes, this class allows any non-empty string for each of the six + * delimiter roles. The delimiters do not need to share a common prefix character. + * + *

<p>{@link TokenScanner} detects this class via {@link #isStringBased()} and activates + * a string-matching scan path. {@link ExpressionToken}, {@link TagToken}, and + * {@link NoteToken} use the length accessors on {@link TokenScannerSymbols} (e.g. + * {@link #getExpressionStartLength()}) to strip delimiters correctly regardless of length. + * + *

<p>The single-character abstract methods inherited from {@link TokenScannerSymbols} + * return private Unicode Private-Use-Area sentinel values. These are used only as + * token-kind discriminators inside {@link Token#newToken} and must never be used for + * scanning template text. + * + *

<h2>Example</h2>

+ * <pre>{@code
+ * JinjavaConfig config = JinjavaConfig.newBuilder()
+ *     .withTokenScannerSymbols(StringTokenScannerSymbols.builder()
+ *         .withVariableStartString("\\VAR{")
+ *         .withVariableEndString("}")
+ *         .withBlockStartString("\\BLOCK{")
+ *         .withBlockEndString("}")
+ *         .withCommentStartString("\\#{")
+ *         .withCommentEndString("}")
+ *         .build())
+ *     .build();
+ * }</pre>
+ */ +public class StringTokenScannerSymbols extends TokenScannerSymbols { + + private static final long serialVersionUID = 1L; + + // ── Internal sentinel chars ──────────────────────────────────────────────── + // Unicode Private Use Area values — guaranteed never to appear in real template + // text, so Token.newToken()'s if-chain dispatches to the right Token subclass. + static final char SENTINEL_FIXED = '\uE000'; + static final char SENTINEL_NOTE = '\uE001'; + static final char SENTINEL_TAG = '\uE002'; + static final char SENTINEL_EXPR_START = '\uE003'; + static final char SENTINEL_EXPR_END = '\uE004'; + static final char SENTINEL_PREFIX = '\uE005'; // unused for scanning + static final char SENTINEL_POSTFIX = '\uE006'; // unused for scanning + static final char SENTINEL_NEWLINE = '\n'; // real newline for line tracking + static final char SENTINEL_TRIM = '-'; // real trim char + + // ── The configured string delimiters ────────────────────────────────────── + private final String variableStartString; + private final String variableEndString; + private final String blockStartString; + private final String blockEndString; + private final String commentStartString; + private final String commentEndString; + // Optional; null means disabled. 
+ private final String lineStatementPrefix; + private final String lineCommentPrefix; + + private StringTokenScannerSymbols(Builder builder) { + this.variableStartString = builder.variableStartString; + this.variableEndString = builder.variableEndString; + this.blockStartString = builder.blockStartString; + this.blockEndString = builder.blockEndString; + this.commentStartString = builder.commentStartString; + this.commentEndString = builder.commentEndString; + this.lineStatementPrefix = builder.lineStatementPrefix; + this.lineCommentPrefix = builder.lineCommentPrefix; + } + + // ── Abstract char contract — returns sentinels only ─────────────────────── + + @Override + public char getPrefixChar() { + return SENTINEL_PREFIX; + } + + @Override + public char getPostfixChar() { + return SENTINEL_POSTFIX; + } + + @Override + public char getFixedChar() { + return SENTINEL_FIXED; + } + + @Override + public char getNoteChar() { + return SENTINEL_NOTE; + } + + @Override + public char getTagChar() { + return SENTINEL_TAG; + } + + @Override + public char getExprStartChar() { + return SENTINEL_EXPR_START; + } + + @Override + public char getExprEndChar() { + return SENTINEL_EXPR_END; + } + + @Override + public char getNewlineChar() { + return SENTINEL_NEWLINE; + } + + @Override + public char getTrimChar() { + return SENTINEL_TRIM; + } + + // ── String-level getters: MUST override the base-class lazy cache ────────── + // The base class builds these from the char methods above, which would produce + // garbage sentinel strings. We override them to return the real delimiters so + // that ExpressionToken, TagToken, and NoteToken strip content correctly. 
+ + @Override + public String getExpressionStart() { + return variableStartString; + } + + @Override + public String getExpressionEnd() { + return variableEndString; + } + + @Override + public String getExpressionStartWithTag() { + return blockStartString; + } + + @Override + public String getExpressionEndWithTag() { + return blockEndString; + } + + @Override + public String getOpeningComment() { + return commentStartString; + } + + @Override + public String getClosingComment() { + return commentEndString; + } + + @Override + public String getLineStatementPrefix() { + return lineStatementPrefix; + } + + @Override + public String getLineCommentPrefix() { + return lineCommentPrefix; + } + + // ── isStringBased flag ──────────────────────────────────────────────────── + + @Override + public boolean isStringBased() { + return true; + } + + // ── Builder ──────────────────────────────────────────────────────────────── + + public static Builder builder() { + return new Builder(); + } + + public static final class Builder { + + // Defaults mirror the standard Jinja2 delimiters, so building with no + // overrides behaves identically to DefaultTokenScannerSymbols. 
+ private String variableStartString = "{{"; + private String variableEndString = "}}"; + private String blockStartString = "{%"; + private String blockEndString = "%}"; + private String commentStartString = "{#"; + private String commentEndString = "#}"; + private String lineStatementPrefix = null; // disabled by default + private String lineCommentPrefix = null; // disabled by default + + public Builder withVariableStartString(String s) { + this.variableStartString = requireNonEmpty(s, "variableStartString"); + return this; + } + + public Builder withVariableEndString(String s) { + this.variableEndString = requireNonEmpty(s, "variableEndString"); + return this; + } + + public Builder withBlockStartString(String s) { + this.blockStartString = requireNonEmpty(s, "blockStartString"); + return this; + } + + public Builder withBlockEndString(String s) { + this.blockEndString = requireNonEmpty(s, "blockEndString"); + return this; + } + + public Builder withCommentStartString(String s) { + this.commentStartString = requireNonEmpty(s, "commentStartString"); + return this; + } + + public Builder withCommentEndString(String s) { + this.commentEndString = requireNonEmpty(s, "commentEndString"); + return this; + } + + /** + * Sets the line statement prefix (e.g. {@code "%%"}). A line beginning with + * this prefix is treated as a block tag, equivalent to wrapping its content + * in the configured block delimiters. Pass {@code null} to disable (default). + */ + public Builder withLineStatementPrefix(String s) { + this.lineStatementPrefix = s; + return this; + } + + /** + * Sets the line comment prefix (e.g. {@code "%#"}). A line beginning with + * this prefix is stripped entirely from the output. Pass {@code null} to + * disable (default). 
+ */ + public Builder withLineCommentPrefix(String s) { + this.lineCommentPrefix = s; + return this; + } + + public StringTokenScannerSymbols build() { + return new StringTokenScannerSymbols(this); + } + + private static String requireNonEmpty(String value, String name) { + if (value == null || value.isEmpty()) { + throw new IllegalArgumentException(name + " must not be null or empty"); + } + return value; + } + } +} diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/TagToken.java b/src/main/java/com/hubspot/jinjava/tree/parse/TagToken.java index a737dd96c..0c500c145 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/TagToken.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/TagToken.java @@ -54,7 +54,10 @@ public int getType() { */ @Override protected void parse() { - if (image.length() < 4) { + int startLen = getSymbols().getTagStartLength(); + int endLen = getSymbols().getTagEndLength(); + + if (image.length() < startLen + endLen) { throw new TemplateSyntaxException( image, "Malformed tag token", @@ -63,7 +66,7 @@ protected void parse() { ); } - content = image.substring(2, image.length() - 2); + content = image.substring(startLen, image.length() - endLen); content = handleTrim(content); int nameStart = -1, pos = 0, len = content.length(); diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java index 7e53b295a..fc203ef21 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java @@ -42,6 +42,24 @@ public class TokenScanner extends AbstractIterator { private final TokenScannerSymbols symbols; private final WhitespaceControlParser whitespaceControlParser; + // String-based path state — only populated when symbols.isStringBased() == true. 
+ private final boolean stringBased; + private final char[] varStart; + private final char[] varEnd; + private final char[] blkStart; + private final char[] blkEnd; + private final char[] cmtStart; + private final char[] cmtEnd; + + // Optional line-oriented prefixes; null when not configured. + private final char[] lineStmtPrefix; + private final char[] lineCommentPrefix; + + // Remembers where the current opening delimiter began so the emitted block/comment + // token image starts from the opener (not the content), letting parse() strip the + // correct number of delimiter characters from both ends. + private int blockOpenerStart = 0; + public TokenScanner(String input, JinjavaConfig config) { this.config = config; @@ -58,15 +76,523 @@ public TokenScanner(String input, JinjavaConfig config) { inQuote = 0; currLine = 1; lastNewlinePos = 0; + blockOpenerStart = 0; symbols = config.getTokenScannerSymbols(); + stringBased = symbols.isStringBased(); whitespaceControlParser = config.getLegacyOverrides().isParseWhitespaceControlStrictly() ? WhitespaceControlParser.STRICT : WhitespaceControlParser.LENIENT; + + if (stringBased) { + varStart = symbols.getExpressionStart().toCharArray(); + varEnd = symbols.getExpressionEnd().toCharArray(); + blkStart = symbols.getExpressionStartWithTag().toCharArray(); + blkEnd = symbols.getExpressionEndWithTag().toCharArray(); + cmtStart = symbols.getOpeningComment().toCharArray(); + cmtEnd = symbols.getClosingComment().toCharArray(); + + String lsp = symbols.getLineStatementPrefix(); + lineStmtPrefix = (lsp != null && !lsp.isEmpty()) ? lsp.toCharArray() : null; + + String lcp = symbols.getLineCommentPrefix(); + lineCommentPrefix = (lcp != null && !lcp.isEmpty()) ? 
lcp.toCharArray() : null; + } else { + varStart = varEnd = blkStart = blkEnd = cmtStart = cmtEnd = null; + lineStmtPrefix = null; + lineCommentPrefix = null; + } } + // ── Dispatch ─────────────────────────────────────────────────────────────── + private Token getNextToken() { + return stringBased ? getNextTokenStringBased() : getNextTokenCharBased(); + } + + // ── String-based scanning path ───────────────────────────────────────────── + // + // tokenStart — start of the next text region to buffer. + // blockOpenerStart — position of the current opening delimiter; the emitted + // block/comment token image begins here. + // lastStart / tokenLength — the slice passed to Token.newToken(). + // + // Two-phase emission: + // 1. Opener detected → flush buffered plain text as TEXT, record + // blockOpenerStart, advance tokenStart/currPost past the opener into + // the block content, set inBlock/inComment. + // 2. Closer detected → emit is[blockOpenerStart .. closerEnd) as the + // appropriate token type; advance tokenStart = currPost = closerEnd. + + // Sentinel returned by scan helpers to mean "a delimiter was matched and + // scanner state was updated — loop again without advancing currPost". + // Any non-null return from a helper that is NOT this sentinel is a real token. 
+ private static final Token DELIMITER_MATCHED = new TextToken( + "", + 0, + 0, + new DefaultTokenScannerSymbols() + ); + + private Token getNextTokenStringBased() { + while (currPost < length) { + char c = is[currPost]; + + if (c == '\n') { + currLine++; + lastNewlinePos = currPost + 1; + } + + if (inComment > 0) { + Token t = scanInsideComment(); + if (t != null) { + return t; + } + continue; // scanInsideComment advanced currPost + } + + if (inBlock > 0) { + Token t = scanInsideBlock(c); + if (t == DELIMITER_MATCHED) { + continue; // closer not yet found, currPost already advanced + } + if (t != null) { + return t; + } + continue; + } + + if (inRaw == 0) { + Token t = scanPlainText(c); + if (t == DELIMITER_MATCHED) { + continue; // opener matched, state updated, no pending text + } + if (t != null) { + return t; // pending text flushed, or line-statement token + } + // null means nothing matched — fall through to advance + } else { + Token t = scanRawMode(); + if (t == DELIMITER_MATCHED) { + continue; + } + if (t != null) { + return t; + } + } + + currPost++; + } + + if (currPost > tokenStart) { + return getEndTokenStringBased(); + } + return null; + } + + /** Scans one character while inside a comment block; advances {@code currPost}. */ + private Token scanInsideComment() { + if (regionMatches(currPost, cmtEnd)) { + lastStart = blockOpenerStart; + tokenLength = currPost + cmtEnd.length - blockOpenerStart; + tokenStart = currPost + cmtEnd.length; + currPost = tokenStart; + inComment = 0; + int kind = tokenKind; + tokenKind = symbols.getFixed(); + return emitStringToken(kind); + } + currPost++; + return null; + } + + /** + * Scans one character while inside a variable or tag block; advances + * {@code currPost}. Returns a real token when the closer is found, or + * {@link #DELIMITER_MATCHED} (meaning "keep looping") otherwise. 
+ */ + private Token scanInsideBlock(char c) { + if (inQuote != 0) { + // Inside a quoted string: a backslash escapes the next character so a + // delimiter or quote character following it does not prematurely close + // the block or the string. + if (c == '\\') { + currPost += (currPost + 1 < length) ? 2 : 1; + return DELIMITER_MATCHED; + } + if (c == inQuote) { + inQuote = 0; + } + currPost++; + return DELIMITER_MATCHED; + } + // Outside a quoted string: a backslash escapes the next character. + if (c == '\\') { + currPost += (currPost + 1 < length) ? 2 : 1; + return DELIMITER_MATCHED; + } + if (c == '\'' || c == '"') { + inQuote = c; + currPost++; + return DELIMITER_MATCHED; + } + // Check for the closing delimiter matching the current block type. + char[] closeDelim = closingDelimFor(tokenKind); + if (closeDelim != null && regionMatches(currPost, closeDelim)) { + lastStart = blockOpenerStart; + tokenLength = currPost + closeDelim.length - blockOpenerStart; + tokenStart = currPost + closeDelim.length; + currPost = tokenStart; + inBlock = 0; + int kind = tokenKind; + tokenKind = symbols.getFixed(); + return emitStringToken(kind); + } + currPost++; + return DELIMITER_MATCHED; + } + + /** + * Scans for openers while in normal (non-raw) plain-text mode. + * Returns a real token when one is ready to emit, {@link #DELIMITER_MATCHED} + * when an opener was matched with no pending text, or {@code null} when + * nothing matched (caller should advance {@code currPost}). + */ + private Token scanPlainText(char c) { + // ── Line statement prefix (e.g. "%% if foo") ────────────────────────── + if ( + lineStmtPrefix != null && + isStartOfLine(currPost) && + regionMatches(currPost, lineStmtPrefix) + ) { + return handleLineStatement(); + } + // ── Line comment prefix (e.g. 
"%# this is ignored") ─────────────────── + if ( + lineCommentPrefix != null && + isStartOfLine(currPost) && + regionMatches(currPost, lineCommentPrefix) + ) { + return handleLineComment(); + } + // ── Variable opener e.g. "{{" or "\VAR{" ────────────────────────────── + if (regionMatches(currPost, varStart)) { + return openBlock(varStart, symbols.getExprStart(), false); + } + // ── Block opener e.g. "{%" or "\BLOCK{" ─────────────────────────────── + if (regionMatches(currPost, blkStart)) { + return openBlock(blkStart, symbols.getTag(), false); + } + // ── Comment opener e.g. "{#" or "\#{" ───────────────────────────────── + if (regionMatches(currPost, cmtStart)) { + return openBlock(cmtStart, symbols.getNote(), true); + } + return null; // nothing matched + } + + /** + * Scans for the endraw block opener while in raw mode. + * Returns a real token, {@link #DELIMITER_MATCHED}, or {@code null}. + */ + private Token scanRawMode() { + if (regionMatches(currPost, blkStart)) { + int contentStart = currPost + blkStart.length; + int pos = contentStart; + while (pos < length && Character.isWhitespace(is[pos])) { + pos++; + } + if (charArrayRegionMatches(is, pos, "endraw")) { + Token pending = flushTextBefore(currPost); + blockOpenerStart = currPost; + tokenStart = contentStart; + currPost = tokenStart; + tokenKind = symbols.getTag(); + inBlock = 1; + if (pending != null) { + return pending; + } + return DELIMITER_MATCHED; + } + } + return null; + } + + /** + * Opens a variable or tag block (sets {@code inBlock}) or a comment block + * (sets {@code inComment}). Flushes any pending text first. + * Returns the pending text token if one exists, {@link #DELIMITER_MATCHED} otherwise. 
+ */ + private Token openBlock(char[] opener, int kind, boolean isComment) { + Token pending = flushTextBefore(currPost); + blockOpenerStart = currPost; + tokenStart = currPost + opener.length; + currPost = tokenStart; + tokenKind = kind; + if (isComment) { + inComment = 1; + } else { + inBlock = 1; + } + return (pending != null) ? pending : DELIMITER_MATCHED; + } + + /** + * Handles a line statement prefix: consumes the line, builds a synthetic block + * tag token, and returns appropriately (stashing the tag if text was pending). + */ + private Token handleLineStatement() { + Token pending = flushTextBefore(lineIndentStart(currPost)); + + int contentStart = currPost + lineStmtPrefix.length; + while (contentStart < length && is[contentStart] == ' ') { + contentStart++; + } + int contentEnd = contentStart; + while (contentEnd < length && is[contentEnd] != '\n') { + contentEnd++; + } + String inner = String.valueOf(is, contentStart, contentEnd - contentStart).trim(); + String syntheticImage = + symbols.getExpressionStartWithTag() + + " " + + inner + + " " + + symbols.getExpressionEndWithTag(); + + int next = contentEnd; + if (next < length && is[next] == '\n') { + next++; + currLine++; + lastNewlinePos = next; + } + tokenStart = next; + currPost = next; + + Token stmtToken = Token.newToken( + symbols.getTag(), + symbols, + whitespaceControlParser, + syntheticImage, + currLine, + 1 + ); + if (pending != null) { + pendingToken = stmtToken; + return pending; + } + return stmtToken; + } + + /** + * Handles a line comment prefix: consumes the entire line (including newline) + * and returns any pending text token, or {@link #DELIMITER_MATCHED} if none. 
+ */ + private Token handleLineComment() { + Token pending = flushTextBefore(lineIndentStart(currPost)); + + int end = currPost + lineCommentPrefix.length; + while (end < length && is[end] != '\n') { + end++; + } + int next = end; + if (next < length && is[next] == '\n') { + next++; + currLine++; + lastNewlinePos = next; + } + tokenStart = next; + currPost = next; + + // The comment itself produces no token. Return pending text if any, + // otherwise DELIMITER_MATCHED so the caller loops without advancing currPost. + return (pending != null) ? pending : DELIMITER_MATCHED; + } + + /** + * Returns the position of the first character of the indentation on the line + * containing {@code pos} — i.e. the position just after the preceding newline + * (or 0 if at the start of input). This is used to exclude leading horizontal + * whitespace from the text token flushed before a line prefix match, so that + * indented line statements and line comments don't leave whitespace in the output. + */ + private int lineIndentStart(int pos) { + // Walk back past the horizontal whitespace that isStartOfLine already accepted. + int p = pos - 1; + while (p >= 0 && (is[p] == ' ' || is[p] == '\t')) { + p--; + } + // p is now at the newline before the indentation, or at -1. + return p + 1; + } + + // ── One-slot stash for the synthetic tag after a line-statement ───────── + // When a line-statement prefix is found and there is pending text to flush + // first, we return the text token immediately and stash the synthetic tag + // here so computeNext() picks it up on the very next call. + private Token pendingToken = null; + + @Override + protected Token computeNext() { + // Drain any stashed token first. 
+ if (pendingToken != null) { + Token t = pendingToken; + pendingToken = null; + return t; + } + + Token t = getNextToken(); + if (t == null) { + return endOfData(); + } + return t; + } + + // ── Helpers ─────────────────────────────────────────────────────────────── + + /** + * Returns true when {@code pos} is at the start of a line — i.e. it is either + * the very first character of the input, or the character immediately after a + * newline (accounting for any leading whitespace that lstripBlocks may allow). + */ + private boolean isStartOfLine(int pos) { + if (pos == 0) { + return true; + } + // Walk backwards past any horizontal whitespace (spaces/tabs). + int p = pos - 1; + while (p >= 0 && (is[p] == ' ' || is[p] == '\t')) { + p--; + } + // True if we hit the beginning of the input or a newline. + return p < 0 || is[p] == '\n'; + } + + /** + * If {@code is[tokenStart..upTo)} contains un-emitted plain text, captures it + * as a TEXT token and returns it. Returns {@code null} for zero-length regions. + * Does NOT update {@code tokenStart} — the caller sets it after returning. + */ + private Token flushTextBefore(int upTo) { + int textLen = upTo - tokenStart; + if (textLen <= 0) { + return null; + } + lastStart = tokenStart; + tokenLength = textLen; + return emitStringToken(symbols.getFixed()); + } + + /** Returns the closing delimiter for the currently open block kind. */ + private char[] closingDelimFor(int currentKind) { + if (currentKind == symbols.getExprStart()) { + return varEnd; + } + if (currentKind == symbols.getTag()) { + return blkEnd; + } + if (currentKind == symbols.getNote()) { + return cmtEnd; + } + return null; + } + + /** + * Constructs a token from {@code lastStart}/{@code tokenLength}, then applies + * trimBlocks and raw-mode post-processing identical to the char-based path. 
+ */ + private Token emitStringToken(int kind) { + Token t = Token.newToken( + kind, + symbols, + whitespaceControlParser, + String.valueOf(is, lastStart, tokenLength), + currLine, + lastStart - lastNewlinePos + 1 + ); + + if ( + (t instanceof TagToken || t instanceof NoteToken) && + config.isTrimBlocks() && + currPost < length && + is[currPost] == '\n' + ) { + lastNewlinePos = currPost + 1; + ++currPost; + ++tokenStart; + } + + if (t instanceof TagToken) { + TagToken tt = (TagToken) t; + if ("raw".equals(tt.getTagName())) { + inRaw = 1; + return tt; + } else if ("endraw".equals(tt.getTagName())) { + inRaw = 0; + return tt; + } + } + + if (inRaw > 0 && t.getType() != symbols.getFixed()) { + return Token.newToken( + symbols.getFixed(), + symbols, + whitespaceControlParser, + t.image, + currLine, + lastStart - lastNewlinePos + 1 + ); + } + + return t; + } + + /** + * Emits whatever remains at end-of-input. + * Advances {@code tokenStart = currPost} so subsequent calls return null. + */ + private Token getEndTokenStringBased() { + tokenLength = currPost - tokenStart; + lastStart = tokenStart; + tokenStart = currPost; + int type = symbols.getFixed(); + if (inComment > 0) { + type = symbols.getNote(); + } else if (inBlock > 0) { + return new UnclosedToken( + String.valueOf(is, lastStart, tokenLength), + currLine, + lastStart - lastNewlinePos + 1, + symbols, + whitespaceControlParser + ); + } + return Token.newToken( + type, + symbols, + whitespaceControlParser, + String.valueOf(is, lastStart, tokenLength), + currLine, + lastStart - lastNewlinePos + 1 + ); + } + + /** Returns true if {@code is[pos..]} starts with {@code pattern}. 
*/ + private boolean regionMatches(int pos, char[] pattern) { + if (pos + pattern.length > length) { + return false; + } + for (int i = 0; i < pattern.length; i++) { + if (is[pos + i] != pattern[i]) { + return false; + } + } + return true; + } + + // ── Original char-based scanning path (completely unchanged) ────────────── + + private Token getNextTokenCharBased() { char c; while (currPost < length) { c = is[currPost++]; @@ -311,15 +837,4 @@ private boolean matchToken(char kind) { return kind == tokenKind; } } - - @Override - protected Token computeNext() { - Token t = getNextToken(); - - if (t == null) { - return endOfData(); - } - - return t; - } } diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScannerSymbols.java b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScannerSymbols.java index 771dbda41..638220853 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScannerSymbols.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScannerSymbols.java @@ -129,4 +129,90 @@ public static boolean isNoteTagOrExprChar(TokenScannerSymbols symbols, char c) { c == symbols.getNote() || c == symbols.getTag() || c == symbols.getExprStartChar() ); } + + // ── New API ──────────────────────────────────────────────────────────────── + + /** + * Returns {@code true} if this instance uses arbitrary string delimiters that + * require the string-matching scan path in {@link TokenScanner}. + * + *

The default returns {@code false}, so all existing subclasses are unaffected. + * {@link StringTokenScannerSymbols} overrides this to return {@code true}. + */ + public boolean isStringBased() { + return false; + } + + /** + * Length of the variable/expression opening delimiter (e.g. 2 for {@code "{{"}), + * used by {@link ExpressionToken#parse()} instead of the hardcoded constant 2. + */ + public int getExpressionStartLength() { + return getExpressionStart().length(); + } + + /** + * Length of the variable/expression closing delimiter (e.g. 2 for {@code "}}"}), + * used by {@link ExpressionToken#parse()} instead of the hardcoded constant 2. + */ + public int getExpressionEndLength() { + return getExpressionEnd().length(); + } + + /** + * Length of the block/tag opening delimiter (e.g. 2 for {@code "{%"}), + * used by {@link TagToken#parse()} instead of the hardcoded constant 2. + */ + public int getTagStartLength() { + return getExpressionStartWithTag().length(); + } + + /** + * Length of the block/tag closing delimiter (e.g. 2 for {@code "%}"}), + * used by {@link TagToken#parse()} instead of the hardcoded constant 2. + */ + public int getTagEndLength() { + return getExpressionEndWithTag().length(); + } + + /** + * Length of the comment opening delimiter (e.g. 2 for {@code "{#"}), + * used by {@link NoteToken#parse()} instead of the hardcoded constant 2. + */ + public int getCommentStartLength() { + return getOpeningComment().length(); + } + + /** + * Length of the comment closing delimiter (e.g. 2 for {@code "#}"}), + * used by {@link NoteToken#parse()} instead of the hardcoded constant 2. + */ + public int getCommentEndLength() { + return getClosingComment().length(); + } + + /** + * Optional line statement prefix (e.g. {@code "%%"}). When non-null, any line + * that begins with this prefix (after optional horizontal whitespace) is treated + * as a block tag statement, equivalent to wrapping its content in the block + * delimiters. 
Returns {@code null} by default (feature disabled). + * + *

Only used by {@link StringTokenScannerSymbols}; has no effect in the
+   * char-based path.
+   */
+  public String getLineStatementPrefix() {
+    return null;
+  }
+
+  /**
+   * Optional line comment prefix (e.g. {@code "%#"}). When non-null, any line
+   * that begins with this prefix (after optional horizontal whitespace) is stripped
+   * entirely from the output. Returns {@code null} by default (feature disabled).
+   *
+   * <p>Only used by {@link StringTokenScannerSymbols}; has no effect in the
+   * char-based path.
+   */
+  public String getLineCommentPrefix() {
+    return null;
+  }
 }
diff --git a/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java b/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java
new file mode 100644
index 000000000..50affae8c
--- /dev/null
+++ b/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java
@@ -0,0 +1,346 @@
+package com.hubspot.jinjava.tree.parse;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+import com.hubspot.jinjava.BaseJinjavaTest;
+import com.hubspot.jinjava.Jinjava;
+import com.hubspot.jinjava.JinjavaConfig;
+import com.hubspot.jinjava.lib.filter.JoinFilterTest.User;
+import java.util.HashMap;
+import org.junit.Before;
+import org.junit.Test;
+
+public class StringTokenScannerSymbolsTest {
+
+  // ── Shared symbol configurations ───────────────────────────────────────────
+
+  /** LaTeX-style delimiters as used in the original issue #195 example. */
+  private static final StringTokenScannerSymbols LATEX_SYMBOLS = StringTokenScannerSymbols
+    .builder()
+    .withVariableStartString("\\VAR{")
+    .withVariableEndString("}")
+    .withBlockStartString("\\BLOCK{")
+    .withBlockEndString("}")
+    .withCommentStartString("\\#{")
+    .withCommentEndString("}")
+    .build();
+
+  /** Angle-bracket style — same delimiters as the existing CustomTokenScannerSymbolsTest. */
+  private static final StringTokenScannerSymbols ANGLE_SYMBOLS = StringTokenScannerSymbols
+    .builder()
+    .withVariableStartString("<<")
+    .withVariableEndString(">>")
+    .withBlockStartString("<%")
+    .withBlockEndString("%>")
+    .withCommentStartString("<#")
+    .withCommentEndString("#>")
+    .build();
+
+  private Jinjava latexJinjava;
+  private Jinjava angleJinjava;
+
+  @Before
+  public void setup() {
+    latexJinjava =
+      new Jinjava(
+        BaseJinjavaTest.newConfigBuilder().withTokenScannerSymbols(LATEX_SYMBOLS).build()
+      );
+    latexJinjava
+      .getGlobalContext()
+      .put("numbers", Lists.newArrayList(1L, 2L, 3L, 4L, 5L));
+
+    angleJinjava =
+      new Jinjava(
+        BaseJinjavaTest.newConfigBuilder().withTokenScannerSymbols(ANGLE_SYMBOLS).build()
+      );
+    angleJinjava
+      .getGlobalContext()
+      .put("numbers", Lists.newArrayList(1L, 2L, 3L, 4L, 5L));
+  }
+
+  // ── Plain text ─────────────────────────────────────────────────────────────
+
+  @Test
+  public void itRendersPlainText() {
+    String template = "jinjava interpreter works correctly";
+    assertThat(latexJinjava.render(template, new HashMap<>())).isEqualTo(template);
+    assertThat(angleJinjava.render(template, new HashMap<>())).isEqualTo(template);
+  }
+
+  // ── Variable expressions ───────────────────────────────────────────────────
+
+  @Test
+  public void itRendersVariablesWithLatexSymbols() {
+    assertThat(latexJinjava.render("\\VAR{ name }", ImmutableMap.of("name", "World")))
+      .isEqualTo("World");
+  }
+
+  @Test
+  public void itRendersVariablesWithAngleSymbols() {
+    assertThat(angleJinjava.render("<< name >>", ImmutableMap.of("name", "World")))
+      .isEqualTo("World");
+  }
+
+  // ── Default delimiters pass through as literal text ────────────────────────
+
+  @Test
+  public void itPassesThroughDefaultCurlyBracesAsLiteralText() {
+    // With custom delimiters, {{ }} must be treated as plain text, not expressions.
+    assertThat(
+      latexJinjava.render(
+        "{{ not a variable }} \\VAR{ name }",
+        ImmutableMap.of("name", "Jorge")
+      )
+    )
+      .isEqualTo("{{ not a variable }} Jorge");
+
+    assertThat(
+      angleJinjava.render(
+        "{{ not a variable }} << name >>",
+        ImmutableMap.of("name", "Jorge")
+      )
+    )
+      .isEqualTo("{{ not a variable }} Jorge");
+  }
+
+  // ── Block tags ─────────────────────────────────────────────────────────────
+
+  @Test
+  public void itRendersIfBlockWithLatexSymbols() {
+    assertThat(
+      latexJinjava.render(
+        "\\BLOCK{ if show }hello\\BLOCK{ endif }",
+        ImmutableMap.of("show", true)
+      )
+    )
+      .isEqualTo("hello");
+
+    assertThat(
+      latexJinjava.render(
+        "\\BLOCK{ if show }hello\\BLOCK{ endif }",
+        ImmutableMap.of("show", false)
+      )
+    )
+      .isEqualTo("");
+  }
+
+  @Test
+  public void itRendersSetBlockWithAngleSymbols() {
+    assertThat(
+      angleJinjava.render(
+        "<% set d=d | default(\"some random value\") %><< d >>",
+        new HashMap<>()
+      )
+    )
+      .isEqualTo("some random value");
+  }
+
+  // ── Comments ───────────────────────────────────────────────────────────────
+
+  @Test
+  public void itStripsCommentsWithLatexSymbols() {
+    assertThat(latexJinjava.render("before\\#{ this is ignored }after", new HashMap<>()))
+      .isEqualTo("beforeafter");
+  }
+
+  @Test
+  public void itStripsCommentsWithAngleSymbols() {
+    assertThat(angleJinjava.render("before<# this is ignored #>after", new HashMap<>()))
+      .isEqualTo("beforeafter");
+  }
+
+  // ── Filters ────────────────────────────────────────────────────────────────
+
+  @Test
+  public void itRendersFiltersWithLatexSymbols() {
+    assertThat(latexJinjava.render("\\VAR{ [1, 2, 3, 3]|union(null) }", new HashMap<>()))
+      .isEqualTo("[1, 2, 3]");
+    assertThat(
+      latexJinjava.render("\\VAR{ numbers|select('equalto', 3) }", new HashMap<>())
+    )
+      .isEqualTo("[3]");
+  }
+
+  @Test
+  public void itRendersFiltersWithAngleSymbols() {
+    assertThat(angleJinjava.render("<< [1, 2, 3, 3]|union(null) >>", new HashMap<>()))
+      .isEqualTo("[1, 2, 3]");
+    assertThat(angleJinjava.render("<< numbers|select('equalto', 3) >>", new HashMap<>()))
+      .isEqualTo("[3]");
+  }
+
+  @Test
+  public void itRendersMapFilterWithLatexSymbols() {
+    assertThat(
+      latexJinjava.render(
+        "\\VAR{ users|map(attribute='username')|join(', ') }",
+        ImmutableMap.of(
+          "users",
+          (Object) Lists.newArrayList(new User("foo"), new User("bar"))
+        )
+      )
+    )
+      .isEqualTo("foo, bar");
+  }
+
+  @Test
+  public void itRendersMapFilterWithAngleSymbols() {
+    assertThat(
+      angleJinjava.render(
+        "<< users|map(attribute='username')|join(', ') >>",
+        ImmutableMap.of(
+          "users",
+          (Object) Lists.newArrayList(new User("foo"), new User("bar"))
+        )
+      )
+    )
+      .isEqualTo("foo, bar");
+  }
+
+  // ── Delimiter characters inside string literals in expressions ─────────────
+
+  @Test
+  public void itHandlesClosingDelimiterInsideQuotedString() {
+    // The "}" inside the default string must not prematurely close \VAR{
+    assertThat(latexJinjava.render("\\VAR{ name | default(\"}\") }", new HashMap<>()))
+      .isEqualTo("}");
+  }
+
+  @Test
+  public void itHandlesClosingDelimiterInsideQuotedStringAngle() {
+    // ">>" inside a quoted string must not close the << expression
+    assertThat(angleJinjava.render("<< name | default(\">>\") >>", new HashMap<>()))
+      .isEqualTo(">>");
+  }
+
+  // ── Builder defaults produce same behaviour as DefaultTokenScannerSymbols ──
+
+  @Test
+  public void defaultBuilderBehavesLikeDefaultSymbols() {
+    Jinjava defaultJinjava = new Jinjava();
+    Jinjava stringBasedDefaultJinjava = new Jinjava(
+      JinjavaConfig
+        .newBuilder()
+        .withTokenScannerSymbols(StringTokenScannerSymbols.builder().build())
+        .build()
+    );
+    String template = "{{ greeting }}, {{ name }}!";
+    ImmutableMap<String, String> ctx = ImmutableMap.of(
+      "greeting",
+      "Hello",
+      "name",
+      "World"
+    );
+    assertThat(stringBasedDefaultJinjava.render(template, ctx))
+      .isEqualTo(defaultJinjava.render(template, ctx));
+  }
+
+  // ── Builder validation ─────────────────────────────────────────────────────
+
+  @Test
+  public void builderRejectsEmptyDelimiter() {
+    assertThatThrownBy(() ->
+      StringTokenScannerSymbols.builder().withVariableStartString("").build()
+    )
+      .isInstanceOf(IllegalArgumentException.class);
+  }
+
+  @Test
+  public void builderRejectsNullDelimiter() {
+    assertThatThrownBy(() ->
+      StringTokenScannerSymbols.builder().withBlockEndString(null).build()
+    )
+      .isInstanceOf(IllegalArgumentException.class);
+  }
+
+  // ── Line statement prefix ──────────────────────────────────────────────────
+
+  @Test
+  public void itRendersLineStatementPrefix() {
+    Jinjava j = jinjavaWith(
+      StringTokenScannerSymbols.builder().withLineStatementPrefix("%%").build()
+    );
+    // "%% if show" is equivalent to "{% if show %}"
+    String template = "%% if show\nhello\n%% endif";
+    assertThat(j.render(template, ImmutableMap.of("show", true))).isEqualTo("hello\n");
+    assertThat(j.render(template, ImmutableMap.of("show", false))).isEqualTo("");
+  }
+
+  @Test
+  public void itRendersLineStatementPrefixWithLeadingWhitespace() {
+    Jinjava j = jinjavaWith(
+      StringTokenScannerSymbols.builder().withLineStatementPrefix("%%").build()
+    );
+    // Leading spaces before the prefix are allowed
+    String template = " %% if show\nhello\n %% endif";
+    assertThat(j.render(template, ImmutableMap.of("show", true))).isEqualTo("hello\n");
+  }
+
+  @Test
+  public void itRendersLineStatementMixedWithBlockDelimiters() {
+    Jinjava j = jinjavaWith(
+      StringTokenScannerSymbols
+        .builder()
+        .withVariableStartString("<<")
+        .withVariableEndString(">>")
+        .withBlockStartString("<%")
+        .withBlockEndString("%>")
+        .withCommentStartString("<#")
+        .withCommentEndString("#>")
+        .withLineStatementPrefix("%%")
+        .build()
+    );
+    String template = "%% set x = 42\n<< x >>";
+    assertThat(j.render(template, new HashMap<>())).isEqualTo("42");
+  }
+
+  // ── Line comment prefix ────────────────────────────────────────────────────
+
+  @Test
+  public void itStripsLineCommentPrefix() {
+    Jinjava j = jinjavaWith(
+      StringTokenScannerSymbols.builder().withLineCommentPrefix("%#").build()
+    );
+    String template = "before\n%# this whole line is a comment\nafter";
+    assertThat(j.render(template, new HashMap<>())).isEqualTo("before\nafter");
+  }
+
+  @Test
+  public void itStripsLineCommentWithLeadingWhitespace() {
+    Jinjava j = jinjavaWith(
+      StringTokenScannerSymbols.builder().withLineCommentPrefix("%#").build()
+    );
+    String template = "before\n %# indented comment\nafter";
+    assertThat(j.render(template, new HashMap<>())).isEqualTo("before\nafter");
+  }
+
+  @Test
+  public void itHandlesBothLinePrefixesTogether() {
+    Jinjava j = jinjavaWith(
+      StringTokenScannerSymbols
+        .builder()
+        .withVariableStartString("<<")
+        .withVariableEndString(">>")
+        .withBlockStartString("<%")
+        .withBlockEndString("%>")
+        .withCommentStartString("<#")
+        .withCommentEndString("#>")
+        .withLineStatementPrefix("%%")
+        .withLineCommentPrefix("%#")
+        .build()
+    );
+    String template = "%# this is stripped\n%% set x = 7\n<< x >>";
+    assertThat(j.render(template, new HashMap<>())).isEqualTo("7");
+  }
+
+  // ── Helper ────────────────────────────────────────────────────────────────
+
+  private Jinjava jinjavaWith(StringTokenScannerSymbols symbols) {
+    return new Jinjava(
+      BaseJinjavaTest.newConfigBuilder().withTokenScannerSymbols(symbols).build()
+    );
+  }
+}