Skip to content

Commit 081a33a

Browse files
authored
Revert "feat(markdown)!: strict math delimiters, robust inline code parsing; drop plain parentheses as math"
1 parent 4c883ca commit 081a33a

File tree

8 files changed

+89 additions, -223 deletions (lines changed)

package.json

Lines changed: 1 addition & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -112,17 +112,7 @@
112112
},
113113
"dependencies": {
114114
"@floating-ui/dom": "^1.7.4",
115-
"markdown-it": "^14.1.0",
116-
"markdown-it-container": "^4.0.0",
117-
"markdown-it-emoji": "^3.0.0",
118-
"markdown-it-footnote": "^4.0.0",
119-
"markdown-it-ins": "^4.0.0",
120-
"markdown-it-mark": "^4.0.0",
121-
"markdown-it-sub": "^2.0.0",
122-
"markdown-it-sup": "^2.0.0",
123-
"markdown-it-task-checkbox": "^1.0.6",
124-
"markdown-it-ts": "0.0.2-beta.4",
125-
"stream-markdown-parser": "workspace:^"
115+
"stream-markdown-parser": "^0.0.30"
126116
},
127117
"devDependencies": {
128118
"@antfu/eslint-config": "^5.4.1",

packages/markdown-parser/src/config.ts

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -12,13 +12,6 @@ export interface MathOptions {
1212
commands?: readonly string[]
1313
/** Whether to escape standalone '!' (default: true). */
1414
escapeExclamation?: boolean
15-
/**
16-
* Strict delimiter mode.
17-
* - When true, only explicit TeX delimiters are recognized as math:
18-
* inline: `$...$` and `\(...\)`; block: `$$...$$` and `\[...\]`.
19-
* - Heuristics and mid-state (unclosed) math detection are disabled.
20-
*/
21-
strictDelimiters?: boolean
2215
}
2316

2417
let defaultMathOptions: MathOptions | undefined

packages/markdown-parser/src/parser/inline-parsers/index.ts

Lines changed: 40 additions & 124 deletions
Original file line number | Diff line number | Diff line change
@@ -46,22 +46,6 @@ export function parseInlineTokens(tokens: MarkdownToken[], raw?: string, pPreTok
4646
const result: ParsedNode[] = []
4747
let currentTextNode: TextNode | null = null
4848

49-
// Fallback: markdown-it sometimes mis-tokenizes code_inline around CJK text
50-
// yielding code_inline nodes that contain non-ASCII prose (e.g. "方法会根据").
51-
// When detected, prefer re-parsing from the original raw string to rebuild
52-
// inline_code spans and basic strong emphasis segments in a stable way.
53-
const hasSuspiciousCodeInline = tokens.some(t => t.type === 'code_inline' && /[^\x00-\x7F]/.test(String(t.content ?? '')))
54-
const hasBackticksInRaw = typeof raw === 'string' && /`/.test(raw)
55-
const codeInlineCount = tokens.reduce((n, t) => n + (t.type === 'code_inline' ? 1 : 0), 0)
56-
const rawBacktickCount = typeof raw === 'string' ? ((raw.match(/`/g) || []).length) : 0
57-
// When backtick count is even, expected inline-code spans ~= backtickCount/2.
58-
const expectedSpans = rawBacktickCount % 2 === 0 ? rawBacktickCount / 2 : 0
59-
const mismatchedSpanCount = expectedSpans > 0 && codeInlineCount !== expectedSpans
60-
const hasMathInlineUsingBackticks = tokens.some(t => t.type === 'math_inline' && /`/.test(String((t as any).raw ?? t.content ?? '')))
61-
if (hasBackticksInRaw && (hasSuspiciousCodeInline || mismatchedSpanCount || hasMathInlineUsingBackticks)) {
62-
return parseFromRawWithCodeAndStrong(String(raw ?? ''))
63-
}
64-
6549
let i = 0
6650
// Note: strong-token normalization and list-item normalization are
6751
// applied during markdown-it parsing via core rules (plugins that
@@ -190,50 +174,58 @@ export function parseInlineTokens(tokens: MarkdownToken[], raw?: string, pPreTok
190174
if (!/`[^`]*/.test(content))
191175
return false
192176

177+
// Close any current text node and handle inline code
178+
resetCurrentTextNode()
193179
const code_start = content.indexOf('`')
194180
const code_end = content.indexOf('`', code_start + 1)
195-
196-
// If we don't have a closing backtick within this token, don't emit a partial
197-
// inline_code node. Instead, merge the rest of inline tokens into a single
198-
// string and re-run parsing so the code span is handled atomically.
199-
if (code_end === -1) {
200-
let merged = content
201-
for (let j = i + 1; j < tokens.length; j++)
202-
merged += String((tokens[j].content ?? '') + (tokens[j].markup ?? ''))
203-
204-
// Consume to the end since we've merged remaining tokens
205-
i = tokens.length - 1
206-
handleToken({ type: 'text', content: merged, raw: merged } as unknown as MarkdownToken)
207-
i++
208-
return true
209-
}
210-
211-
// Close any current text node and handle the text before the code span
212-
resetCurrentTextNode()
213-
const beforeText = content.slice(0, code_start)
214-
const codeContent = content.slice(code_start + 1, code_end)
215-
const after = content.slice(code_end + 1)
216-
217-
if (beforeText) {
218-
// Try to parse emphasis/strong inside the pre-code fragment, without
219-
// advancing the outer token index `i` permanently.
220-
const handled = handleEmphasisAndStrikethrough(beforeText, _token)
221-
if (!handled)
222-
pushText(beforeText, beforeText)
223-
else
181+
const _text = content.slice(0, code_start)
182+
const codeContent = code_end === -1 ? content.slice(code_start) : content.slice(code_start, code_end)
183+
const after = code_end === -1 ? '' : content.slice(code_end + 1)
184+
if (_text) {
185+
// Try to re-run emphasis/strong parsing on the fragment before the code span
186+
// but avoid mutating the outer token index `i` (handlers sometimes increment it).
187+
const handled = handleEmphasisAndStrikethrough(_text, _token)
188+
// restore index so we don't skip tokens in the outer loop
189+
if (!handled) {
190+
pushText(_text, _text)
191+
}
192+
else {
224193
i--
194+
}
225195
}
226196

197+
const code = codeContent.replace(/`/g, '')
227198
pushParsed({
228199
type: 'inline_code',
229-
code: codeContent,
230-
raw: String(codeContent ?? ''),
200+
code,
201+
raw: String(code ?? ''),
231202
} as ParsedNode)
232203

204+
// afterCode 可能也存在很多情况包括多个 code,我们递归处理 --- IGNORE ---
233205
if (after) {
234-
handleToken({ type: 'text', content: after, raw: after } as unknown as MarkdownToken)
206+
handleToken({
207+
type: 'text',
208+
content: after,
209+
raw: String(after ?? ''),
210+
})
235211
i--
236212
}
213+
else if (code_end === -1) {
214+
// 要把下一个 token 也合并进来,把类型变成 text
215+
const nextToken = tokens[i + 1]
216+
if (nextToken) {
217+
let fixedAfter = after
218+
for (let j = i + 1; j < tokens.length; j++) {
219+
fixedAfter += String(((tokens[j].content ?? '') + (tokens[j].markup ?? '')))
220+
}
221+
i = tokens.length - 1
222+
handleToken({
223+
type: 'text',
224+
content: fixedAfter,
225+
raw: String(fixedAfter ?? ''),
226+
})
227+
}
228+
}
237229
i++
238230
return true
239231
}
@@ -871,79 +863,3 @@ export function parseInlineTokens(tokens: MarkdownToken[], raw?: string, pPreTok
871863

872864
return result
873865
}
874-
875-
// Minimal, robust fallback parser: split raw by backticks into
876-
// text and inline_code, and parse simple **strong** inside text.
877-
function parseFromRawWithCodeAndStrong(raw: string): ParsedNode[] {
878-
// Tokenize raw handling two constructs: backticks `...` and strong **...**
879-
// Build a small AST allowing code inside strong and vice versa.
880-
const root: ParsedNode[] = []
881-
const stack: Array<{ type: 'root' | 'strong', children: ParsedNode[] }> = [{ type: 'root', children: root }]
882-
let i = 0
883-
884-
function cur() { return stack[stack.length - 1].children }
885-
886-
function pushText(s: string) {
887-
if (!s)
888-
return
889-
const last = cur()[cur().length - 1]
890-
if (last && last.type === 'text') {
891-
(last as any).content += s
892-
;(last as any).raw += s
893-
}
894-
else {
895-
cur().push({ type: 'text', content: s, raw: s } as ParsedNode)
896-
}
897-
}
898-
899-
while (i < raw.length) {
900-
// strong open/close
901-
if (raw[i] === '*' && raw[i + 1] === '*') {
902-
// If already inside strong, close it; otherwise open new strong
903-
const isClosing = stack.length > 1 && stack[stack.length - 1].type === 'strong'
904-
i += 2
905-
if (isClosing) {
906-
const nodeChildren = stack.pop()!.children
907-
cur().push({ type: 'strong', children: nodeChildren, raw: `**${nodeChildren.map(n => (n as any).raw ?? '').join('')}**` } as ParsedNode)
908-
}
909-
else {
910-
stack.push({ type: 'strong', children: [] })
911-
}
912-
continue
913-
}
914-
915-
// inline code
916-
if (raw[i] === '`') {
917-
const start = i
918-
const close = raw.indexOf('`', i + 1)
919-
if (close === -1) {
920-
// no closing tick; treat as text
921-
pushText(raw.slice(i))
922-
break
923-
}
924-
const code = raw.slice(i + 1, close)
925-
cur().push({ type: 'inline_code', code, raw: code } as ParsedNode)
926-
i = close + 1
927-
continue
928-
}
929-
930-
// regular text: read until next special
931-
let next = raw.indexOf('`', i)
932-
const nextStrong = raw.indexOf('**', i)
933-
if (nextStrong !== -1 && (next === -1 || nextStrong < next))
934-
next = nextStrong
935-
if (next === -1)
936-
next = raw.length
937-
pushText(raw.slice(i, next))
938-
i = next
939-
}
940-
941-
// If there are unclosed strongs, degrade them into plain text with ** markers
942-
while (stack.length > 1) {
943-
const dangling = stack.pop()!
944-
const content = dangling.children.map(n => (n as any).raw ?? (n as any).content ?? '').join('')
945-
pushText(`**${content}`)
946-
}
947-
948-
return root
949-
}

packages/markdown-parser/src/parser/node-parsers/list-parser.ts

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,11 @@ export function parseList(
6363
tokens[k].type === 'bullet_list_open'
6464
|| tokens[k].type === 'ordered_list_open'
6565
) {
66-
// Parse nested list (do not skip '*' — treat all bullet types consistently)
66+
if (tokens[k].markup === '*') {
67+
k++
68+
continue
69+
}
70+
// Parse nested list
6771
const [nestedListNode, newIndex] = parseNestedList(tokens, k)
6872
itemChildren.push(nestedListNode)
6973
k = newIndex
@@ -202,6 +206,11 @@ function parseNestedList(
202206
tokens[k].type === 'bullet_list_open'
203207
|| tokens[k].type === 'ordered_list_open'
204208
) {
209+
if (tokens[k].markup === '*') {
210+
k++
211+
continue
212+
}
213+
205214
// Handle deeper nested lists
206215
const [deeperNestedListNode, newIndex] = parseNestedList(tokens, k)
207216
itemChildren.push(deeperNestedListNode)

packages/markdown-parser/src/plugins/isMathLike.ts

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -90,11 +90,9 @@ export function isMathLike(s: string) {
9090
const funcCall = FUNC_CALL_RE.test(norm)
9191
// common math words
9292
const words = WORDS_RE.test(norm)
93-
// 纯单个英文字母也渲染成数学公式(常见变量/元素符号)
94-
// e.g. (w) (x) (y) (z) 或 $H$, $x$ 等
95-
const pureWord = /^\([a-z]\)$/i.test(stripped) || /^[a-z]$/i.test(stripped)
96-
// 简单的化学式/下标:如 H_2O, CO_2, CH_3CH_2OH, CH_3COOH
97-
const chemicalLike = /^(?:[A-Z][a-z]?(_\{?[A-Za-z0-9]+\}?|\^[A-Za-z0-9]+)?)+$/i.test(stripped)
93+
// 纯单个英文字母也渲染成数学公式
94+
// e.g. (w) (x) (y) (z)
95+
const pureWord = /^\([a-z]\)$/i.test(stripped)
9896

99-
return texCmd || texCmdWithBraces || texBraceStart || texSpecific || superSub || ops || funcCall || words || pureWord || chemicalLike
97+
return texCmd || texCmdWithBraces || texBraceStart || texSpecific || superSub || ops || funcCall || words || pureWord
10098
}

packages/markdown-parser/src/plugins/math.ts

Lines changed: 8 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -198,18 +198,14 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
198198
// Inline rule for \(...\) and $$...$$ and $...$
199199
const mathInline = (state: unknown, silent: boolean) => {
200200
const s = state as any
201-
const strict = !!mathOpts?.strictDelimiters
202201

203202
if (/^\*[^*]+/.test(s.src)) {
204203
return false
205204
}
206205
const delimiters: [string, string][] = [
207206
['$', '$'],
208-
// Support explicit TeX inline delimiters only: \( ... \)
209207
['\\(', '\\)'],
210-
// Do NOT treat plain parentheses as math delimiters. Using ['\(', '\)']
211-
// accidentally becomes ['(', ')'] in JS/TS strings and over-matches
212-
// regular text like "(0 <= t < S-1)", causing false math detection.
208+
['\(', '\)'],
213209
]
214210

215211
let searchPos = 0
@@ -280,8 +276,7 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
280276
continue
281277
}
282278
if (endIdx === -1) {
283-
// Do not treat segments containing inline code as math
284-
if (!strict && isMathLike(content) && !content.includes('`')) {
279+
if (isMathLike(content)) {
285280
searchPos = index + open.length
286281
foundAny = true
287282
if (!silent) {
@@ -326,13 +321,7 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
326321
}
327322
}
328323
const content = src.slice(index + open.length, endIdx)
329-
// Skip treating as math when the content contains inline-code backticks
330-
// Always accept explicit dollar-delimited math ($...$) even if the
331-
// heuristic deems it not math-like (to support cases like $H$, $CO_2$).
332-
const hasBacktick = content.includes('`')
333-
const isDollar = open === '$'
334-
const shouldSkip = strict ? hasBacktick : (hasBacktick || (!isDollar && !isMathLike(content)))
335-
if (shouldSkip) {
324+
if (!isMathLike(content)) {
336325
// push remaining text after last match
337326
// not math-like; skip this match and continue scanning
338327
searchPos = endIdx + close.length
@@ -438,17 +427,11 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
438427
silent: boolean,
439428
) => {
440429
const s = state as any
441-
const strict = !!mathOpts?.strictDelimiters
442-
const delimiters: [string, string][] = strict
443-
? [
444-
['\\[', '\\]'],
445-
['$$', '$$'],
446-
]
447-
: [
448-
['\\[', '\\]'],
449-
['\[', '\]'],
450-
['$$', '$$'],
451-
]
430+
const delimiters: [string, string][] = [
431+
['\\[', '\\]'],
432+
['\[', '\]'],
433+
['$$', '$$'],
434+
]
452435
const startPos = s.bMarks[startLine] + s.tShift[startLine]
453436
const lineText = s.src.slice(startPos, s.eMarks[startLine]).trim()
454437
let matched = false
@@ -542,10 +525,6 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
542525
}
543526
}
544527

545-
// In strict mode, do not emit mid-state (unclosed) block math
546-
if (strict && !found)
547-
return false
548-
549528
const token: any = s.push('math_block', 'math', 0)
550529
token.content = normalizeStandaloneBackslashT(content)
551530
token.markup

0 commit comments

Comments
 (0)