Revert "feat(markdown)!: strict math delimiters, robust inline code parsing; drop plain parentheses as math"

Simon-He95 · web-flow · commit 081a33a08296 · 2025-11-24T10:08:34.000+08:00
diff --git a/package.json b/package.json
@@ -112,17 +112,7 @@
   },
   "dependencies": {
     "@floating-ui/dom": "^1.7.4",
-    "markdown-it": "^14.1.0",
-    "markdown-it-container": "^4.0.0",
-    "markdown-it-emoji": "^3.0.0",
-    "markdown-it-footnote": "^4.0.0",
-    "markdown-it-ins": "^4.0.0",
-    "markdown-it-mark": "^4.0.0",
-    "markdown-it-sub": "^2.0.0",
-    "markdown-it-sup": "^2.0.0",
-    "markdown-it-task-checkbox": "^1.0.6",
-    "markdown-it-ts": "0.0.2-beta.4",
-    "stream-markdown-parser": "workspace:^"
+    "stream-markdown-parser": "^0.0.30"
   },
   "devDependencies": {
     "@antfu/eslint-config": "^5.4.1",
diff --git a/packages/markdown-parser/src/config.ts b/packages/markdown-parser/src/config.ts
@@ -12,13 +12,6 @@ export interface MathOptions {
   commands?: readonly string[]
   /** Whether to escape standalone '!' (default: true). */
   escapeExclamation?: boolean
-  /**
-   * Strict delimiter mode.
-   * - When true, only explicit TeX delimiters are recognized as math:
-   *   inline: `$...$` and `\(...\)`; block: `$$...$$` and `\[...\]`.
-   * - Heuristics and mid-state (unclosed) math detection are disabled.
-   */
-  strictDelimiters?: boolean
 }
 
 let defaultMathOptions: MathOptions | undefined
diff --git a/packages/markdown-parser/src/parser/inline-parsers/index.ts b/packages/markdown-parser/src/parser/inline-parsers/index.ts
@@ -46,22 +46,6 @@ export function parseInlineTokens(tokens: MarkdownToken[], raw?: string, pPreTok
   const result: ParsedNode[] = []
   let currentTextNode: TextNode | null = null
 
-  // Fallback: markdown-it sometimes mis-tokenizes code_inline around CJK text
-  // yielding code_inline nodes that contain non-ASCII prose (e.g. "方法会根据").
-  // When detected, prefer re-parsing from the original raw string to rebuild
-  // inline_code spans and basic strong emphasis segments in a stable way.
-  const hasSuspiciousCodeInline = tokens.some(t => t.type === 'code_inline' && /[^\x00-\x7F]/.test(String(t.content ?? '')))
-  const hasBackticksInRaw = typeof raw === 'string' && /`/.test(raw)
-  const codeInlineCount = tokens.reduce((n, t) => n + (t.type === 'code_inline' ? 1 : 0), 0)
-  const rawBacktickCount = typeof raw === 'string' ? ((raw.match(/`/g) || []).length) : 0
-  // When backtick count is even, expected inline-code spans ~= backtickCount/2.
-  const expectedSpans = rawBacktickCount % 2 === 0 ? rawBacktickCount / 2 : 0
-  const mismatchedSpanCount = expectedSpans > 0 && codeInlineCount !== expectedSpans
-  const hasMathInlineUsingBackticks = tokens.some(t => t.type === 'math_inline' && /`/.test(String((t as any).raw ?? t.content ?? '')))
-  if (hasBackticksInRaw && (hasSuspiciousCodeInline || mismatchedSpanCount || hasMathInlineUsingBackticks)) {
-    return parseFromRawWithCodeAndStrong(String(raw ?? ''))
-  }
-
   let i = 0
   // Note: strong-token normalization and list-item normalization are
   // applied during markdown-it parsing via core rules (plugins that
@@ -190,50 +174,58 @@ export function parseInlineTokens(tokens: MarkdownToken[], raw?: string, pPreTok
     if (!/`[^`]*/.test(content))
       return false
 
+    // Close any current text node and handle inline code
+    resetCurrentTextNode()
     const code_start = content.indexOf('`')
     const code_end = content.indexOf('`', code_start + 1)
-
-    // If we don't have a closing backtick within this token, don't emit a partial
-    // inline_code node. Instead, merge the rest of inline tokens into a single
-    // string and re-run parsing so the code span is handled atomically.
-    if (code_end === -1) {
-      let merged = content
-      for (let j = i + 1; j < tokens.length; j++)
-        merged += String((tokens[j].content ?? '') + (tokens[j].markup ?? ''))
-
-      // Consume to the end since we've merged remaining tokens
-      i = tokens.length - 1
-      handleToken({ type: 'text', content: merged, raw: merged } as unknown as MarkdownToken)
-      i++
-      return true
-    }
-
-    // Close any current text node and handle the text before the code span
-    resetCurrentTextNode()
-    const beforeText = content.slice(0, code_start)
-    const codeContent = content.slice(code_start + 1, code_end)
-    const after = content.slice(code_end + 1)
-
-    if (beforeText) {
-      // Try to parse emphasis/strong inside the pre-code fragment, without
-      // advancing the outer token index `i` permanently.
-      const handled = handleEmphasisAndStrikethrough(beforeText, _token)
-      if (!handled)
-        pushText(beforeText, beforeText)
-      else
+    const _text = content.slice(0, code_start)
+    const codeContent = code_end === -1 ? content.slice(code_start) : content.slice(code_start, code_end)
+    const after = code_end === -1 ? '' : content.slice(code_end + 1)
+    if (_text) {
+      // Try to re-run emphasis/strong parsing on the fragment before the code span
+      // but avoid mutating the outer token index `i` (handlers sometimes increment it).
+      const handled = handleEmphasisAndStrikethrough(_text, _token)
+      // restore index so we don't skip tokens in the outer loop
+      if (!handled) {
+        pushText(_text, _text)
+      }
+      else {
         i--
+      }
     }
 
+    const code = codeContent.replace(/`/g, '')
     pushParsed({
       type: 'inline_code',
-      code: codeContent,
-      raw: String(codeContent ?? ''),
+      code,
+      raw: String(code ?? ''),
     } as ParsedNode)
 
+    // afterCode 可能也存在很多情况包括多个 code，我们递归处理 --- IGNORE ---
     if (after) {
-      handleToken({ type: 'text', content: after, raw: after } as unknown as MarkdownToken)
+      handleToken({
+        type: 'text',
+        content: after,
+        raw: String(after ?? ''),
+      })
       i--
     }
+    else if (code_end === -1) {
+      // 要把下一个 token 也合并进来，把类型变成 text
+      const nextToken = tokens[i + 1]
+      if (nextToken) {
+        let fixedAfter = after
+        for (let j = i + 1; j < tokens.length; j++) {
+          fixedAfter += String(((tokens[j].content ?? '') + (tokens[j].markup ?? '')))
+        }
+        i = tokens.length - 1
+        handleToken({
+          type: 'text',
+          content: fixedAfter,
+          raw: String(fixedAfter ?? ''),
+        })
+      }
+    }
     i++
     return true
   }
@@ -871,79 +863,3 @@ export function parseInlineTokens(tokens: MarkdownToken[], raw?: string, pPreTok
 
   return result
 }
-
-// Minimal, robust fallback parser: split raw by backticks into
-// text and inline_code, and parse simple **strong** inside text.
-function parseFromRawWithCodeAndStrong(raw: string): ParsedNode[] {
-  // Tokenize raw handling two constructs: backticks `...` and strong **...**
-  // Build a small AST allowing code inside strong and vice versa.
-  const root: ParsedNode[] = []
-  const stack: Array<{ type: 'root' | 'strong', children: ParsedNode[] }> = [{ type: 'root', children: root }]
-  let i = 0
-
-  function cur() { return stack[stack.length - 1].children }
-
-  function pushText(s: string) {
-    if (!s)
-      return
-    const last = cur()[cur().length - 1]
-    if (last && last.type === 'text') {
-      (last as any).content += s
-      ;(last as any).raw += s
-    }
-    else {
-      cur().push({ type: 'text', content: s, raw: s } as ParsedNode)
-    }
-  }
-
-  while (i < raw.length) {
-    // strong open/close
-    if (raw[i] === '*' && raw[i + 1] === '*') {
-      // If already inside strong, close it; otherwise open new strong
-      const isClosing = stack.length > 1 && stack[stack.length - 1].type === 'strong'
-      i += 2
-      if (isClosing) {
-        const nodeChildren = stack.pop()!.children
-        cur().push({ type: 'strong', children: nodeChildren, raw: `**${nodeChildren.map(n => (n as any).raw ?? '').join('')}**` } as ParsedNode)
-      }
-      else {
-        stack.push({ type: 'strong', children: [] })
-      }
-      continue
-    }
-
-    // inline code
-    if (raw[i] === '`') {
-      const start = i
-      const close = raw.indexOf('`', i + 1)
-      if (close === -1) {
-        // no closing tick; treat as text
-        pushText(raw.slice(i))
-        break
-      }
-      const code = raw.slice(i + 1, close)
-      cur().push({ type: 'inline_code', code, raw: code } as ParsedNode)
-      i = close + 1
-      continue
-    }
-
-    // regular text: read until next special
-    let next = raw.indexOf('`', i)
-    const nextStrong = raw.indexOf('**', i)
-    if (nextStrong !== -1 && (next === -1 || nextStrong < next))
-      next = nextStrong
-    if (next === -1)
-      next = raw.length
-    pushText(raw.slice(i, next))
-    i = next
-  }
-
-  // If there are unclosed strongs, degrade them into plain text with ** markers
-  while (stack.length > 1) {
-    const dangling = stack.pop()!
-    const content = dangling.children.map(n => (n as any).raw ?? (n as any).content ?? '').join('')
-    pushText(`**${content}`)
-  }
-
-  return root
-}
diff --git a/packages/markdown-parser/src/parser/node-parsers/list-parser.ts b/packages/markdown-parser/src/parser/node-parsers/list-parser.ts
@@ -63,7 +63,11 @@ export function parseList(
           tokens[k].type === 'bullet_list_open'
           || tokens[k].type === 'ordered_list_open'
         ) {
-          // Parse nested list (do not skip '*' — treat all bullet types consistently)
+          if (tokens[k].markup === '*') {
+            k++
+            continue
+          }
+          // Parse nested list
           const [nestedListNode, newIndex] = parseNestedList(tokens, k)
           itemChildren.push(nestedListNode)
           k = newIndex
@@ -202,6 +206,11 @@ function parseNestedList(
           tokens[k].type === 'bullet_list_open'
           || tokens[k].type === 'ordered_list_open'
         ) {
+          if (tokens[k].markup === '*') {
+            k++
+            continue
+          }
+
           // Handle deeper nested lists
           const [deeperNestedListNode, newIndex] = parseNestedList(tokens, k)
           itemChildren.push(deeperNestedListNode)
diff --git a/packages/markdown-parser/src/plugins/isMathLike.ts b/packages/markdown-parser/src/plugins/isMathLike.ts
@@ -90,11 +90,9 @@ export function isMathLike(s: string) {
   const funcCall = FUNC_CALL_RE.test(norm)
   // common math words
   const words = WORDS_RE.test(norm)
-  // 纯单个英文字母也渲染成数学公式（常见变量/元素符号）
-  // e.g. (w) (x) (y) (z) 或 $H$, $x$ 等
-  const pureWord = /^\([a-z]\)$/i.test(stripped) || /^[a-z]$/i.test(stripped)
-  // 简单的化学式/下标：如 H_2O, CO_2, CH_3CH_2OH, CH_3COOH
-  const chemicalLike = /^(?:[A-Z][a-z]?(_\{?[A-Za-z0-9]+\}?|\^[A-Za-z0-9]+)?)+$/i.test(stripped)
+  // 纯单个英文字母也渲染成数学公式
+  // e.g. (w) (x) (y) (z)
+  const pureWord = /^\([a-z]\)$/i.test(stripped)
 
-  return texCmd || texCmdWithBraces || texBraceStart || texSpecific || superSub || ops || funcCall || words || pureWord || chemicalLike
+  return texCmd || texCmdWithBraces || texBraceStart || texSpecific || superSub || ops || funcCall || words || pureWord
 }
diff --git a/packages/markdown-parser/src/plugins/math.ts b/packages/markdown-parser/src/plugins/math.ts
@@ -198,18 +198,14 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
   // Inline rule for \(...\) and $$...$$ and $...$
   const mathInline = (state: unknown, silent: boolean) => {
     const s = state as any
-    const strict = !!mathOpts?.strictDelimiters
 
     if (/^\*[^*]+/.test(s.src)) {
       return false
     }
     const delimiters: [string, string][] = [
       ['$', '$'],
-      // Support explicit TeX inline delimiters only: \( ... \)
       ['\\(', '\\)'],
-      // Do NOT treat plain parentheses as math delimiters. Using ['\(', '\)']
-      // accidentally becomes ['(', ')'] in JS/TS strings and over-matches
-      // regular text like "(0 <= t < S-1)", causing false math detection.
+      ['\(', '\)'],
     ]
 
     let searchPos = 0
@@ -280,8 +276,7 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
             continue
           }
           if (endIdx === -1) {
-            // Do not treat segments containing inline code as math
-            if (!strict && isMathLike(content) && !content.includes('`')) {
+            if (isMathLike(content)) {
               searchPos = index + open.length
               foundAny = true
               if (!silent) {
@@ -326,13 +321,7 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
           }
         }
         const content = src.slice(index + open.length, endIdx)
-        // Skip treating as math when the content contains inline-code backticks
-        // Always accept explicit dollar-delimited math ($...$) even if the
-        // heuristic deems it not math-like (to support cases like $H$, $CO_2$).
-        const hasBacktick = content.includes('`')
-        const isDollar = open === '$'
-        const shouldSkip = strict ? hasBacktick : (hasBacktick || (!isDollar && !isMathLike(content)))
-        if (shouldSkip) {
+        if (!isMathLike(content)) {
           // push remaining text after last match
           // not math-like; skip this match and continue scanning
           searchPos = endIdx + close.length
@@ -438,17 +427,11 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
     silent: boolean,
   ) => {
     const s = state as any
-    const strict = !!mathOpts?.strictDelimiters
-    const delimiters: [string, string][] = strict
-      ? [
-          ['\\[', '\\]'],
-          ['$$', '$$'],
-        ]
-      : [
-          ['\\[', '\\]'],
-          ['\[', '\]'],
-          ['$$', '$$'],
-        ]
+    const delimiters: [string, string][] = [
+      ['\\[', '\\]'],
+      ['\[', '\]'],
+      ['$$', '$$'],
+    ]
     const startPos = s.bMarks[startLine] + s.tShift[startLine]
     const lineText = s.src.slice(startPos, s.eMarks[startLine]).trim()
     let matched = false
@@ -542,10 +525,6 @@ export function applyMath(md: MarkdownIt, mathOpts?: MathOptions) {
       }
     }
 
-    // In strict mode, do not emit mid-state (unclosed) block math
-    if (strict && !found)
-      return false
-
     const token: any = s.push('math_block', 'math', 0)
     token.content = normalizeStandaloneBackslashT(content)
     token.markup
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
diff --git a/vite.config.ts b/vite.config.ts