Skip to content

Commit 4c883ca

Browse files
authored
Merge pull request #148 from neoragex2002/PR
feat(markdown)!: strict math delimiters, robust inline code parsing; drop plain parentheses as math
2 parents a3fd3b5 + 8890aab commit 4c883ca

File tree

8 files changed

+223
-89
lines changed

8 files changed

+223
-89
lines changed

package.json

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,17 @@
112112
},
113113
"dependencies": {
114114
"@floating-ui/dom": "^1.7.4",
115-
"stream-markdown-parser": "^0.0.30"
115+
"markdown-it": "^14.1.0",
116+
"markdown-it-container": "^4.0.0",
117+
"markdown-it-emoji": "^3.0.0",
118+
"markdown-it-footnote": "^4.0.0",
119+
"markdown-it-ins": "^4.0.0",
120+
"markdown-it-mark": "^4.0.0",
121+
"markdown-it-sub": "^2.0.0",
122+
"markdown-it-sup": "^2.0.0",
123+
"markdown-it-task-checkbox": "^1.0.6",
124+
"markdown-it-ts": "0.0.2-beta.4",
125+
"stream-markdown-parser": "workspace:^"
116126
},
117127
"devDependencies": {
118128
"@antfu/eslint-config": "^5.4.1",

packages/markdown-parser/src/config.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,13 @@ export interface MathOptions {
1212
commands?: readonly string[]
1313
/** Whether to escape standalone '!' (default: true). */
1414
escapeExclamation?: boolean
15+
/**
16+
* Strict delimiter mode.
17+
* - When true, only explicit TeX delimiters are recognized as math:
18+
* inline: `$...$` and `\(...\)`; block: `$$...$$` and `\[...\]`.
19+
* - Heuristics and mid-state (unclosed) math detection are disabled.
20+
*/
21+
strictDelimiters?: boolean
1522
}
1623

1724
let defaultMathOptions: MathOptions | undefined

packages/markdown-parser/src/parser/inline-parsers/index.ts

Lines changed: 124 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,22 @@ export function parseInlineTokens(tokens: MarkdownToken[], raw?: string, pPreTok
4646
const result: ParsedNode[] = []
4747
let currentTextNode: TextNode | null = null
4848

49+
// Fallback: markdown-it sometimes mis-tokenizes code_inline around CJK text
50+
// yielding code_inline nodes that contain non-ASCII prose (e.g. "方法会根据").
51+
// When detected, prefer re-parsing from the original raw string to rebuild
52+
// inline_code spans and basic strong emphasis segments in a stable way.
53+
const hasSuspiciousCodeInline = tokens.some(t => t.type === 'code_inline' && /[^\x00-\x7F]/.test(String(t.content ?? '')))
54+
const hasBackticksInRaw = typeof raw === 'string' && /`/.test(raw)
55+
const codeInlineCount = tokens.reduce((n, t) => n + (t.type === 'code_inline' ? 1 : 0), 0)
56+
const rawBacktickCount = typeof raw === 'string' ? ((raw.match(/`/g) || []).length) : 0
57+
// When backtick count is even, expected inline-code spans ~= backtickCount/2.
58+
const expectedSpans = rawBacktickCount % 2 === 0 ? rawBacktickCount / 2 : 0
59+
const mismatchedSpanCount = expectedSpans > 0 && codeInlineCount !== expectedSpans
60+
const hasMathInlineUsingBackticks = tokens.some(t => t.type === 'math_inline' && /`/.test(String((t as any).raw ?? t.content ?? '')))
61+
if (hasBackticksInRaw && (hasSuspiciousCodeInline || mismatchedSpanCount || hasMathInlineUsingBackticks)) {
62+
return parseFromRawWithCodeAndStrong(String(raw ?? ''))
63+
}
64+
4965
let i = 0
5066
// Note: strong-token normalization and list-item normalization are
5167
// applied during markdown-it parsing via core rules (plugins that
@@ -174,58 +190,50 @@ export function parseInlineTokens(tokens: MarkdownToken[], raw?: string, pPreTok
174190
if (!/`[^`]*/.test(content))
175191
return false
176192

177-
// Close any current text node and handle inline code
178-
resetCurrentTextNode()
179193
const code_start = content.indexOf('`')
180194
const code_end = content.indexOf('`', code_start + 1)
181-
const _text = content.slice(0, code_start)
182-
const codeContent = code_end === -1 ? content.slice(code_start) : content.slice(code_start, code_end)
183-
const after = code_end === -1 ? '' : content.slice(code_end + 1)
184-
if (_text) {
185-
// Try to re-run emphasis/strong parsing on the fragment before the code span
186-
// but avoid mutating the outer token index `i` (handlers sometimes increment it).
187-
const handled = handleEmphasisAndStrikethrough(_text, _token)
188-
// restore index so we don't skip tokens in the outer loop
189-
if (!handled) {
190-
pushText(_text, _text)
191-
}
192-
else {
195+
196+
// If we don't have a closing backtick within this token, don't emit a partial
197+
// inline_code node. Instead, merge the rest of inline tokens into a single
198+
// string and re-run parsing so the code span is handled atomically.
199+
if (code_end === -1) {
200+
let merged = content
201+
for (let j = i + 1; j < tokens.length; j++)
202+
merged += String((tokens[j].content ?? '') + (tokens[j].markup ?? ''))
203+
204+
// Consume to the end since we've merged remaining tokens
205+
i = tokens.length - 1
206+
handleToken({ type: 'text', content: merged, raw: merged } as unknown as MarkdownToken)
207+
i++
208+
return true
209+
}
210+
211+
// Close any current text node and handle the text before the code span
212+
resetCurrentTextNode()
213+
const beforeText = content.slice(0, code_start)
214+
const codeContent = content.slice(code_start + 1, code_end)
215+
const after = content.slice(code_end + 1)
216+
217+
if (beforeText) {
218+
// Try to parse emphasis/strong inside the pre-code fragment, without
219+
// advancing the outer token index `i` permanently.
220+
const handled = handleEmphasisAndStrikethrough(beforeText, _token)
221+
if (!handled)
222+
pushText(beforeText, beforeText)
223+
else
193224
i--
194-
}
195225
}
196226

197-
const code = codeContent.replace(/`/g, '')
198227
pushParsed({
199228
type: 'inline_code',
200-
code,
201-
raw: String(code ?? ''),
229+
code: codeContent,
230+
raw: String(codeContent ?? ''),
202231
} as ParsedNode)
203232

204-
// afterCode 可能也存在很多情况包括多个 code,我们递归处理 --- IGNORE ---
205233
if (after) {
206-
handleToken({
207-
type: 'text',
208-
content: after,
209-
raw: String(after ?? ''),
210-
})
234+
handleToken({ type: 'text', content: after, raw: after } as unknown as MarkdownToken)
211235
i--
212236
}
213-
else if (code_end === -1) {
214-
// 要把下一个 token 也合并进来,把类型变成 text
215-
const nextToken = tokens[i + 1]
216-
if (nextToken) {
217-
let fixedAfter = after
218-
for (let j = i + 1; j < tokens.length; j++) {
219-
fixedAfter += String(((tokens[j].content ?? '') + (tokens[j].markup ?? '')))
220-
}
221-
i = tokens.length - 1
222-
handleToken({
223-
type: 'text',
224-
content: fixedAfter,
225-
raw: String(fixedAfter ?? ''),
226-
})
227-
}
228-
}
229237
i++
230238
return true
231239
}
@@ -863,3 +871,79 @@ export function parseInlineTokens(tokens: MarkdownToken[], raw?: string, pPreTok
863871

864872
return result
865873
}
874+
875+
/**
 * Minimal fallback inline parser that works directly on the raw source string
 * instead of on markdown-it tokens.
 *
 * Only two constructs are recognized:
 * - single-backtick inline code `` `...` `` -> `{ type: 'inline_code', code, raw }`
 * - `**...**` strong emphasis -> `{ type: 'strong', children, raw }`
 *
 * Everything else is emitted as `{ type: 'text', content, raw }`, with adjacent
 * text fragments merged into a single node. Inline code may appear inside a
 * strong span; a `**` encountered while a strong span is open always closes it,
 * so strong spans never nest.
 *
 * @param raw - The raw inline markdown source.
 * @returns The parsed nodes in source order.
 */
function parseFromRawWithCodeAndStrong(raw: string): ParsedNode[] {
  // Loose structural view of the text nodes this function builds itself.
  interface TextLike { type: string, content: string, raw: string }

  const root: ParsedNode[] = []
  // The bottom frame is the root; at most one `strong` frame sits on top of it.
  const stack: Array<{ type: 'root' | 'strong', children: ParsedNode[] }> = [{ type: 'root', children: root }]
  let i = 0

  /** Children array of the innermost open container. */
  function cur(): ParsedNode[] {
    return stack[stack.length - 1].children
  }

  /** Append text to the current container, merging with a trailing text node. */
  function pushText(s: string) {
    if (!s)
      return
    const siblings = cur()
    const last = siblings[siblings.length - 1] as unknown as TextLike | undefined
    if (last && last.type === 'text') {
      last.content += s
      last.raw += s
    }
    else {
      siblings.push({ type: 'text', content: s, raw: s } as ParsedNode)
    }
  }

  while (i < raw.length) {
    // strong open/close
    if (raw[i] === '*' && raw[i + 1] === '*') {
      i += 2
      const top = stack[stack.length - 1]
      if (stack.length > 1 && top.type === 'strong') {
        // Already inside strong: this marker closes it.
        stack.pop()
        const nodeChildren = top.children
        const innerRaw = nodeChildren.map(n => (n as unknown as Partial<TextLike>).raw ?? '').join('')
        cur().push({ type: 'strong', children: nodeChildren, raw: `**${innerRaw}**` } as ParsedNode)
      }
      else {
        stack.push({ type: 'strong', children: [] })
      }
      continue
    }

    // inline code
    if (raw[i] === '`') {
      const close = raw.indexOf('`', i + 1)
      if (close === -1) {
        // No closing tick: the rest of the input is plain text.
        pushText(raw.slice(i))
        break
      }
      const code = raw.slice(i + 1, close)
      cur().push({ type: 'inline_code', code, raw: code } as ParsedNode)
      i = close + 1
      continue
    }

    // regular text: read until the next backtick or `**`, whichever is first
    let next = raw.indexOf('`', i)
    const nextStrong = raw.indexOf('**', i)
    if (nextStrong !== -1 && (next === -1 || nextStrong < next))
      next = nextStrong
    if (next === -1)
      next = raw.length
    pushText(raw.slice(i, next))
    i = next
  }

  // Unclosed strong: degrade the opening marker to literal `**` text and
  // re-attach the collected children to the parent unchanged. Flattening the
  // children to their `raw` strings would drop the backticks of inline code
  // (its `raw` is the bare code) and turn code spans into plain text.
  while (stack.length > 1) {
    const dangling = stack.pop()
    if (!dangling)
      break
    pushText('**')
    for (const child of dangling.children) {
      const textChild = child as unknown as TextLike
      if (textChild.type === 'text')
        pushText(textChild.content)
      else
        cur().push(child)
    }
  }

  return root
}

packages/markdown-parser/src/parser/node-parsers/list-parser.ts

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,7 @@ export function parseList(
6363
tokens[k].type === 'bullet_list_open'
6464
|| tokens[k].type === 'ordered_list_open'
6565
) {
66-
if (tokens[k].markup === '*') {
67-
k++
68-
continue
69-
}
70-
// Parse nested list
66+
// Parse nested list (do not skip '*' — treat all bullet types consistently)
7167
const [nestedListNode, newIndex] = parseNestedList(tokens, k)
7268
itemChildren.push(nestedListNode)
7369
k = newIndex
@@ -206,11 +202,6 @@ function parseNestedList(
206202
tokens[k].type === 'bullet_list_open'
207203
|| tokens[k].type === 'ordered_list_open'
208204
) {
209-
if (tokens[k].markup === '*') {
210-
k++
211-
continue
212-
}
213-
214205
// Handle deeper nested lists
215206
const [deeperNestedListNode, newIndex] = parseNestedList(tokens, k)
216207
itemChildren.push(deeperNestedListNode)

packages/markdown-parser/src/plugins/isMathLike.ts

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,11 @@ export function isMathLike(s: string) {
9090
const funcCall = FUNC_CALL_RE.test(norm)
9191
// common math words
9292
const words = WORDS_RE.test(norm)
93-
// 纯单个英文字母也渲染成数学公式
94-
// e.g. (w) (x) (y) (z)
95-
const pureWord = /^\([a-z]\)$/i.test(stripped)
93+
// 纯单个英文字母也渲染成数学公式(常见变量/元素符号)
94+
// e.g. (w) (x) (y) (z) 或 $H$, $x$ 等
95+
const pureWord = /^\([a-z]\)$/i.test(stripped) || /^[a-z]$/i.test(stripped)
96+
// 简单的化学式/下标:如 H_2O, CO_2, CH_3CH_2OH, CH_3COOH
97+
const chemicalLike = /^(?:[A-Z][a-z]?(_\{?[A-Za-z0-9]+\}?|\^[A-Za-z0-9]+)?)+$/i.test(stripped)
9698

97-
return texCmd || texCmdWithBraces || texBraceStart || texSpecific || superSub || ops || funcCall || words || pureWord
99+
return texCmd || texCmdWithBraces || texBraceStart || texSpecific || superSub || ops || funcCall || words || pureWord || chemicalLike
98100
}

packages/markdown-parser/src/plugins/math.ts

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -198,14 +198,18 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
198198
// Inline rule for \(...\) and $$...$$ and $...$
199199
const mathInline = (state: unknown, silent: boolean) => {
200200
const s = state as any
201+
const strict = !!mathOpts?.strictDelimiters
201202

202203
if (/^\*[^*]+/.test(s.src)) {
203204
return false
204205
}
205206
const delimiters: [string, string][] = [
206207
['$', '$'],
208+
// Support explicit TeX inline delimiters only: \( ... \)
207209
['\\(', '\\)'],
208-
['\(', '\)'],
210+
// Do NOT treat plain parentheses as math delimiters. Using ['\(', '\)']
211+
// accidentally becomes ['(', ')'] in JS/TS strings and over-matches
212+
// regular text like "(0 <= t < S-1)", causing false math detection.
209213
]
210214

211215
let searchPos = 0
@@ -276,7 +280,8 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
276280
continue
277281
}
278282
if (endIdx === -1) {
279-
if (isMathLike(content)) {
283+
// Do not treat segments containing inline code as math
284+
if (!strict && isMathLike(content) && !content.includes('`')) {
280285
searchPos = index + open.length
281286
foundAny = true
282287
if (!silent) {
@@ -321,7 +326,13 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
321326
}
322327
}
323328
const content = src.slice(index + open.length, endIdx)
324-
if (!isMathLike(content)) {
329+
// Skip treating as math when the content contains inline-code backticks
330+
// Always accept explicit dollar-delimited math ($...$) even if the
331+
// heuristic deems it not math-like (to support cases like $H$, $CO_2$).
332+
const hasBacktick = content.includes('`')
333+
const isDollar = open === '$'
334+
const shouldSkip = strict ? hasBacktick : (hasBacktick || (!isDollar && !isMathLike(content)))
335+
if (shouldSkip) {
325336
// push remaining text after last match
326337
// not math-like; skip this match and continue scanning
327338
searchPos = endIdx + close.length
@@ -427,11 +438,17 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
427438
silent: boolean,
428439
) => {
429440
const s = state as any
430-
const delimiters: [string, string][] = [
431-
['\\[', '\\]'],
432-
['\[', '\]'],
433-
['$$', '$$'],
434-
]
441+
const strict = !!mathOpts?.strictDelimiters
442+
const delimiters: [string, string][] = strict
443+
? [
444+
['\\[', '\\]'],
445+
['$$', '$$'],
446+
]
447+
: [
448+
['\\[', '\\]'],
449+
['\[', '\]'],
450+
['$$', '$$'],
451+
]
435452
const startPos = s.bMarks[startLine] + s.tShift[startLine]
436453
const lineText = s.src.slice(startPos, s.eMarks[startLine]).trim()
437454
let matched = false
@@ -525,6 +542,10 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
525542
}
526543
}
527544

545+
// In strict mode, do not emit mid-state (unclosed) block math
546+
if (strict && !found)
547+
return false
548+
528549
const token: any = s.push('math_block', 'math', 0)
529550
token.content = normalizeStandaloneBackslashT(content)
530551
token.markup

0 commit comments

Comments
 (0)