Merge pull request #148 from neoragex2002/PR

Simon-He95 · web-flow · commit 4c883cabbd7b · 2025-11-24T09:59:46.000+08:00
feat(markdown)!: strict math delimiters, robust inline code parsing; drop plain parentheses as math
diff --git a/package.json b/package.json
@@ -112,7 +112,17 @@
   },
   "dependencies": {
     "@floating-ui/dom": "^1.7.4",
-    "stream-markdown-parser": "^0.0.30"
+    "markdown-it": "^14.1.0",
+    "markdown-it-container": "^4.0.0",
+    "markdown-it-emoji": "^3.0.0",
+    "markdown-it-footnote": "^4.0.0",
+    "markdown-it-ins": "^4.0.0",
+    "markdown-it-mark": "^4.0.0",
+    "markdown-it-sub": "^2.0.0",
+    "markdown-it-sup": "^2.0.0",
+    "markdown-it-task-checkbox": "^1.0.6",
+    "markdown-it-ts": "0.0.2-beta.4",
+    "stream-markdown-parser": "workspace:^"
   },
   "devDependencies": {
     "@antfu/eslint-config": "^5.4.1",
diff --git a/packages/markdown-parser/src/config.ts b/packages/markdown-parser/src/config.ts
@@ -12,6 +12,13 @@ export interface MathOptions {
   commands?: readonly string[]
   /** Whether to escape standalone '!' (default: true). */
   escapeExclamation?: boolean
+  /**
+   * Strict delimiter mode.
+   * - When true, only explicit TeX delimiters are recognized as math:
+   *   inline: `$...$` and `\(...\)`; block: `$$...$$` and `\[...\]`.
+   * - Heuristics and mid-state (unclosed) math detection are disabled.
+   */
+  strictDelimiters?: boolean
 }
 
 let defaultMathOptions: MathOptions | undefined
diff --git a/packages/markdown-parser/src/parser/inline-parsers/index.ts b/packages/markdown-parser/src/parser/inline-parsers/index.ts
@@ -46,6 +46,22 @@ export function parseInlineTokens(tokens: MarkdownToken[], raw?: string, pPreTok
   const result: ParsedNode[] = []
   let currentTextNode: TextNode | null = null
 
+  // Fallback: markdown-it sometimes mis-tokenizes code_inline around CJK text
+  // yielding code_inline nodes that contain non-ASCII prose (e.g. "方法会根据").
+  // When detected, prefer re-parsing from the original raw string to rebuild
+  // inline_code spans and basic strong emphasis segments in a stable way.
+  const hasSuspiciousCodeInline = tokens.some(t => t.type === 'code_inline' && /[^\x00-\x7F]/.test(String(t.content ?? '')))
+  const hasBackticksInRaw = typeof raw === 'string' && /`/.test(raw)
+  const codeInlineCount = tokens.reduce((n, t) => n + (t.type === 'code_inline' ? 1 : 0), 0)
+  const rawBacktickCount = typeof raw === 'string' ? ((raw.match(/`/g) || []).length) : 0
+  // When backtick count is even, expected inline-code spans ~= backtickCount/2.
+  const expectedSpans = rawBacktickCount % 2 === 0 ? rawBacktickCount / 2 : 0
+  const mismatchedSpanCount = expectedSpans > 0 && codeInlineCount !== expectedSpans
+  const hasMathInlineUsingBackticks = tokens.some(t => t.type === 'math_inline' && /`/.test(String((t as any).raw ?? t.content ?? '')))
+  if (hasBackticksInRaw && (hasSuspiciousCodeInline || mismatchedSpanCount || hasMathInlineUsingBackticks)) {
+    return parseFromRawWithCodeAndStrong(String(raw ?? ''))
+  }
+
   let i = 0
   // Note: strong-token normalization and list-item normalization are
   // applied during markdown-it parsing via core rules (plugins that
@@ -174,58 +190,50 @@ export function parseInlineTokens(tokens: MarkdownToken[], raw?: string, pPreTok
     if (!/`[^`]*/.test(content))
       return false
 
-    // Close any current text node and handle inline code
-    resetCurrentTextNode()
     const code_start = content.indexOf('`')
     const code_end = content.indexOf('`', code_start + 1)
-    const _text = content.slice(0, code_start)
-    const codeContent = code_end === -1 ? content.slice(code_start) : content.slice(code_start, code_end)
-    const after = code_end === -1 ? '' : content.slice(code_end + 1)
-    if (_text) {
-      // Try to re-run emphasis/strong parsing on the fragment before the code span
-      // but avoid mutating the outer token index `i` (handlers sometimes increment it).
-      const handled = handleEmphasisAndStrikethrough(_text, _token)
-      // restore index so we don't skip tokens in the outer loop
-      if (!handled) {
-        pushText(_text, _text)
-      }
-      else {
+
+    // If we don't have a closing backtick within this token, don't emit a partial
+    // inline_code node. Instead, merge the rest of inline tokens into a single
+    // string and re-run parsing so the code span is handled atomically.
+    if (code_end === -1) {
+      let merged = content
+      for (let j = i + 1; j < tokens.length; j++)
+        merged += String((tokens[j].content ?? '') + (tokens[j].markup ?? ''))
+
+      // Consume to the end since we've merged remaining tokens
+      i = tokens.length - 1
+      handleToken({ type: 'text', content: merged, raw: merged } as unknown as MarkdownToken)
+      i++
+      return true
+    }
+
+    // Close any current text node and handle the text before the code span
+    resetCurrentTextNode()
+    const beforeText = content.slice(0, code_start)
+    const codeContent = content.slice(code_start + 1, code_end)
+    const after = content.slice(code_end + 1)
+
+    if (beforeText) {
+      // Try to parse emphasis/strong inside the pre-code fragment, without
+      // advancing the outer token index `i` permanently.
+      const handled = handleEmphasisAndStrikethrough(beforeText, _token)
+      if (!handled)
+        pushText(beforeText, beforeText)
+      else
         i--
-      }
     }
 
-    const code = codeContent.replace(/`/g, '')
     pushParsed({
       type: 'inline_code',
-      code,
-      raw: String(code ?? ''),
+      code: codeContent,
+      raw: String(codeContent ?? ''),
     } as ParsedNode)
 
-    // afterCode 可能也存在很多情况包括多个 code，我们递归处理 --- IGNORE ---
     if (after) {
-      handleToken({
-        type: 'text',
-        content: after,
-        raw: String(after ?? ''),
-      })
+      handleToken({ type: 'text', content: after, raw: after } as unknown as MarkdownToken)
       i--
     }
-    else if (code_end === -1) {
-      // 要把下一个 token 也合并进来，把类型变成 text
-      const nextToken = tokens[i + 1]
-      if (nextToken) {
-        let fixedAfter = after
-        for (let j = i + 1; j < tokens.length; j++) {
-          fixedAfter += String(((tokens[j].content ?? '') + (tokens[j].markup ?? '')))
-        }
-        i = tokens.length - 1
-        handleToken({
-          type: 'text',
-          content: fixedAfter,
-          raw: String(fixedAfter ?? ''),
-        })
-      }
-    }
     i++
     return true
   }
@@ -863,3 +871,79 @@ export function parseInlineTokens(tokens: MarkdownToken[], raw?: string, pPreTok
 
   return result
 }
+
/**
 * Minimal fallback inline parser used when the markdown-it token stream looks
 * unreliable around backticks. It scans the raw string directly and recognises
 * exactly two constructs:
 *
 * - single-backtick code spans `` `code` `` -> `inline_code` nodes
 * - `**strong**` -> `strong` nodes, whose children may be text or inline code
 *
 * Everything else is emitted as `text`. Adjacent text is merged into one node.
 *
 * Known limits (by design of this fallback):
 * - only single backticks delimit code; multi-backtick fences are not matched
 * - a `**` seen while a strong is open always closes it, so strong never nests
 * - `**` inside a code span is kept verbatim as code
 *
 * @param raw Original inline source text.
 * @returns Parsed nodes in source order.
 */
function parseFromRawWithCodeAndStrong(raw: string): ParsedNode[] {
  // Local, fully-typed view of the three node shapes this function can emit.
  type FallbackNode =
    | { type: 'text', content: string, raw: string }
    | { type: 'inline_code', code: string, raw: string }
    | { type: 'strong', children: FallbackNode[], raw: string }

  interface Frame {
    /** Nodes collected for this container so far. */
    children: FallbackNode[]
    /** Index in `raw` of the opening `**` (0 for the root frame). */
    openedAt: number
  }

  const root: FallbackNode[] = []
  // stack[0] is the root; any further entry is a currently open **strong**.
  const stack: Frame[] = [{ children: root, openedAt: 0 }]

  const cur = (): FallbackNode[] => stack[stack.length - 1].children

  // Append text to the current container, merging with a trailing text node.
  function pushText(s: string): void {
    if (!s)
      return
    const siblings = cur()
    const last = siblings[siblings.length - 1]
    if (last !== undefined && last.type === 'text') {
      last.content += s
      last.raw += s
    }
    else {
      siblings.push({ type: 'text', content: s, raw: s })
    }
  }

  let i = 0
  while (i < raw.length) {
    // strong open/close
    if (raw.startsWith('**', i)) {
      if (stack.length > 1) {
        const frame = stack.pop()!
        i += 2
        if (frame.children.length === 0) {
          // "****" has no content: keep it as literal text instead of
          // emitting an empty strong node.
          pushText('****')
        }
        else {
          // Use the exact source slice so `raw` keeps backticks of nested code.
          cur().push({ type: 'strong', children: frame.children, raw: raw.slice(frame.openedAt, i) })
        }
      }
      else {
        stack.push({ children: [], openedAt: i })
        i += 2
      }
      continue
    }

    // inline code
    if (raw[i] === '`') {
      const close = raw.indexOf('`', i + 1)
      if (close === -1) {
        // No closing tick: the rest of the input is plain text (no further
        // strong parsing, the span may still be streaming in).
        pushText(raw.slice(i))
        break
      }
      const code = raw.slice(i + 1, close)
      // `raw` mirrors `code` (no backticks), matching the main inline parser.
      cur().push({ type: 'inline_code', code, raw: code })
      i = close + 1
      continue
    }

    // regular text: read until the next backtick or `**`, whichever is first
    const stops = [raw.indexOf('`', i), raw.indexOf('**', i)].filter(pos => pos !== -1)
    const next = stops.length > 0 ? Math.min(...stops) : raw.length
    pushText(raw.slice(i, next))
    i = next
  }

  // Unclosed strong: emit the `**` marker as literal text and hand the
  // collected children to the parent unchanged, so inline_code nodes inside
  // the dangling segment survive instead of being flattened into text.
  while (stack.length > 1) {
    const dangling = stack.pop()!
    pushText('**')
    for (const child of dangling.children) {
      if (child.type === 'text')
        pushText(child.content)
      else
        cur().push(child)
    }
  }

  return root as ParsedNode[]
}
diff --git a/packages/markdown-parser/src/parser/node-parsers/list-parser.ts b/packages/markdown-parser/src/parser/node-parsers/list-parser.ts
@@ -63,11 +63,7 @@ export function parseList(
           tokens[k].type === 'bullet_list_open'
           || tokens[k].type === 'ordered_list_open'
         ) {
-          if (tokens[k].markup === '*') {
-            k++
-            continue
-          }
-          // Parse nested list
+          // Parse nested list (do not skip '*' — treat all bullet types consistently)
           const [nestedListNode, newIndex] = parseNestedList(tokens, k)
           itemChildren.push(nestedListNode)
           k = newIndex
@@ -206,11 +202,6 @@ function parseNestedList(
           tokens[k].type === 'bullet_list_open'
           || tokens[k].type === 'ordered_list_open'
         ) {
-          if (tokens[k].markup === '*') {
-            k++
-            continue
-          }
-
           // Handle deeper nested lists
           const [deeperNestedListNode, newIndex] = parseNestedList(tokens, k)
           itemChildren.push(deeperNestedListNode)
diff --git a/packages/markdown-parser/src/plugins/isMathLike.ts b/packages/markdown-parser/src/plugins/isMathLike.ts
@@ -90,9 +90,11 @@ export function isMathLike(s: string) {
   const funcCall = FUNC_CALL_RE.test(norm)
   // common math words
   const words = WORDS_RE.test(norm)
-  // 纯单个英文字母也渲染成数学公式
-  // e.g. (w) (x) (y) (z)
-  const pureWord = /^\([a-z]\)$/i.test(stripped)
+  // 纯单个英文字母也渲染成数学公式（常见变量/元素符号）
+  // e.g. (w) (x) (y) (z) 或 $H$, $x$ 等
+  const pureWord = /^\([a-z]\)$/i.test(stripped) || /^[a-z]$/i.test(stripped)
+  // 简单的化学式/下标：如 H_2O, CO_2, CH_3CH_2OH, CH_3COOH
+  const chemicalLike = /^(?:[A-Z][a-z]?(_\{?[A-Za-z0-9]+\}?|\^[A-Za-z0-9]+)?)+$/i.test(stripped)
 
-  return texCmd || texCmdWithBraces || texBraceStart || texSpecific || superSub || ops || funcCall || words || pureWord
+  return texCmd || texCmdWithBraces || texBraceStart || texSpecific || superSub || ops || funcCall || words || pureWord || chemicalLike
 }
diff --git a/packages/markdown-parser/src/plugins/math.ts b/packages/markdown-parser/src/plugins/math.ts
@@ -198,14 +198,18 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
   // Inline rule for \(...\) and $$...$$ and $...$
   const mathInline = (state: unknown, silent: boolean) => {
     const s = state as any
+    const strict = !!mathOpts?.strictDelimiters
 
     if (/^\*[^*]+/.test(s.src)) {
       return false
     }
     const delimiters: [string, string][] = [
       ['$', '$'],
+      // Support explicit TeX inline delimiters only: \( ... \)
       ['\\(', '\\)'],
-      ['\(', '\)'],
+      // Do NOT treat plain parentheses as math delimiters. Using ['\(', '\)']
+      // accidentally becomes ['(', ')'] in JS/TS strings and over-matches
+      // regular text like "(0 <= t < S-1)", causing false math detection.
     ]
 
     let searchPos = 0
@@ -276,7 +280,8 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
             continue
           }
           if (endIdx === -1) {
-            if (isMathLike(content)) {
+            // Do not treat segments containing inline code as math
+            if (!strict && isMathLike(content) && !content.includes('`')) {
               searchPos = index + open.length
               foundAny = true
               if (!silent) {
@@ -321,7 +326,13 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
           }
         }
         const content = src.slice(index + open.length, endIdx)
-        if (!isMathLike(content)) {
+        // Skip treating as math when the content contains inline-code backticks
+        // Always accept explicit dollar-delimited math ($...$) even if the
+        // heuristic deems it not math-like (to support cases like $H$, $CO_2$).
+        const hasBacktick = content.includes('`')
+        const isDollar = open === '$'
+        const shouldSkip = strict ? hasBacktick : (hasBacktick || (!isDollar && !isMathLike(content)))
+        if (shouldSkip) {
           // push remaining text after last match
           // not math-like; skip this match and continue scanning
           searchPos = endIdx + close.length
@@ -427,11 +438,17 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
     silent: boolean,
   ) => {
     const s = state as any
-    const delimiters: [string, string][] = [
-      ['\\[', '\\]'],
-      ['\[', '\]'],
-      ['$$', '$$'],
-    ]
+    const strict = !!mathOpts?.strictDelimiters
+    const delimiters: [string, string][] = strict
+      ? [
+          ['\\[', '\\]'],
+          ['$$', '$$'],
+        ]
+      : [
+          ['\\[', '\\]'],
+          ['\[', '\]'],
+          ['$$', '$$'],
+        ]
     const startPos = s.bMarks[startLine] + s.tShift[startLine]
     const lineText = s.src.slice(startPos, s.eMarks[startLine]).trim()
     let matched = false
@@ -525,6 +542,10 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
       }
     }
 
+    // In strict mode, do not emit mid-state (unclosed) block math
+    if (strict && !found)
+      return false
+
     const token: any = s.push('math_block', 'math', 0)
     token.content = normalizeStandaloneBackslashT(content)
     token.markup
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
diff --git a/vite.config.ts b/vite.config.ts