Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions crawl4ai/html2text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,18 +715,20 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None:
self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
self.o(" \n")
if tag in ["td", "th"] and start:
if self.split_next_td:
self.o("| ")
# Always output pipe before every cell (GFM compliance)
self.o("| ")
self.split_next_td = True

if tag == "tr" and start:
self.td_count = 0
if tag == "tr" and not start:
# Add trailing pipe for GFM compliance
self.o(" |")
self.split_next_td = False
self.soft_br()
if tag == "tr" and not start and self.table_start:
# Underline table header
self.o("|".join(["---"] * self.td_count))
# Underline table header with proper GFM format
self.o("| " + " | ".join(["---"] * self.td_count) + " |")
self.soft_br()
self.table_start = False
if tag in ["td", "th"] and start:
Expand Down
153 changes: 153 additions & 0 deletions tests/test_table_gfm_compliance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
"""
Unit tests for GFM-compliant markdown table generation.

Tests that html2text generates tables with proper leading and trailing
pipe delimiters as per GitHub Flavored Markdown specification.

Fixes: https://github.com/unclecode/crawl4ai/issues/1731
"""

import pytest
import sys
import os

# Add parent directory to path to import html2text
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawl4ai.html2text import HTML2Text


class TestTableGFMCompliance:
    """Test suite for GFM-compliant table generation.

    Each test converts an HTML table with ``HTML2Text`` and checks that the
    emitted markdown rows start and end with a pipe character.
    """

    # Minimal two-column table (one header row, one data row) shared by
    # the tests that do not need a special table shape.
    BASIC_TABLE = (
        '<table><tr><th>A</th><th>B</th></tr>'
        '<tr><td>1</td><td>2</td></tr></table>'
    )

    @staticmethod
    def _convert(html, pad_tables=None):
        """Convert *html* to markdown with line wrapping disabled.

        Args:
            html: HTML snippet to convert.
            pad_tables: When not ``None``, assigned to the converter's
                ``pad_tables`` attribute; ``None`` keeps the converter's
                own default.

        Returns:
            The markdown text returned by ``HTML2Text.handle``.
        """
        converter = HTML2Text()
        converter.body_width = 0  # Disable wrapping
        if pad_tables is not None:
            converter.pad_tables = pad_tables
        return converter.handle(html)

    @staticmethod
    def _rows_containing(markdown, needle='|'):
        """Return the stripped lines of *markdown* that contain *needle*."""
        return [line.strip() for line in markdown.split('\n') if needle in line]

    def test_table_has_leading_pipes(self):
        """Test that all table rows start with |"""
        rows = self._rows_containing(self._convert(self.BASIC_TABLE))
        assert len(rows) > 0, "No table rows found in output"

        for number, row in enumerate(rows, start=1):
            assert row.startswith('|'), \
                f"Line {number} missing leading pipe: {row!r}"

    def test_table_has_trailing_pipes(self):
        """Test that all table rows end with |"""
        rows = self._rows_containing(self._convert(self.BASIC_TABLE))
        assert len(rows) > 0, "No table rows found in output"

        for number, row in enumerate(rows, start=1):
            assert row.endswith('|'), \
                f"Line {number} missing trailing pipe: {row!r}"

    def test_separator_row_has_pipes(self):
        """Test that separator row has proper format | --- | --- |"""
        # The separator row is the first line containing a run of dashes.
        candidates = self._rows_containing(
            self._convert(self.BASIC_TABLE), needle='---'
        )
        assert len(candidates) > 0, "No separator row found"

        separator = candidates[0]
        assert separator.startswith('|'), \
            f"Separator missing leading pipe: {separator!r}"
        assert separator.endswith('|'), \
            f"Separator missing trailing pipe: {separator!r}"

        # Besides the outer pipes there must be one between the columns,
        # i.e. the row looks like | --- | --- |
        assert '|' in separator[1:-1], \
            "Separator should have pipes between dashes"

    def test_works_with_pad_tables_false(self):
        """Test GFM compliance with pad_tables=False (default)"""
        rows = self._rows_containing(
            self._convert(self.BASIC_TABLE, pad_tables=False)
        )
        assert len(rows) > 0, "No table rows found"

        for row in rows:
            assert row.startswith('|') and row.endswith('|'), \
                f"Row not GFM compliant with pad_tables=False: {row!r}"

    def test_works_with_pad_tables_true(self):
        """Test GFM compliance with pad_tables=True"""
        markdown = self._convert(self.BASIC_TABLE, pad_tables=True)

        # Filter out table markers used by pad_tables
        rows = [
            row for row in self._rows_containing(markdown)
            if 'special_marker' not in row
        ]
        assert len(rows) > 0, "No table rows found"

        for row in rows:
            assert row.startswith('|') and row.endswith('|'), \
                f"Row not GFM compliant with pad_tables=True: {row!r}"

    def test_multirow_table(self):
        """Test that large tables maintain GFM compliance"""
        html = '''
        <table>
        <tr><th>Parameter</th><th>Guideline</th><th>Sources</th></tr>
        <tr><td>Arsenic (2006)</td><td>0.010 ALARA</td><td>Naturally occurring</td></tr>
        <tr><td>Lead (2019)</td><td>0.005 ALARA</td><td>Plumbing</td></tr>
        <tr><td>Mercury (2019)</td><td>0.001</td><td>Industrial</td></tr>
        </table>
        '''
        rows = self._rows_containing(self._convert(html))

        # Should have: 1 header + 1 separator + 3 data rows = 5 rows
        assert len(rows) >= 5, \
            f"Expected at least 5 table rows, got {len(rows)}"

        # All rows should be GFM compliant
        for number, row in enumerate(rows, start=1):
            assert row.startswith('|'), f"Line {number} missing leading pipe"
            assert row.endswith('|'), f"Line {number} missing trailing pipe"

    def test_single_column_table(self):
        """Test GFM compliance with single column table"""
        html = '<table><tr><th>Header</th></tr><tr><td>Data</td></tr></table>'
        rows = self._rows_containing(self._convert(html))
        assert len(rows) > 0, "No table rows found"

        for row in rows:
            assert row.startswith('|') and row.endswith('|'), \
                f"Single column row not GFM compliant: {row!r}"

    def test_empty_cells(self):
        """Test GFM compliance with empty table cells"""
        html = (
            '<table><tr><th>A</th><th>B</th></tr>'
            '<tr><td></td><td>Data</td></tr></table>'
        )
        rows = self._rows_containing(self._convert(html))
        assert len(rows) > 0, "No table rows found"

        for row in rows:
            assert row.startswith('|') and row.endswith('|'), \
                f"Row with empty cell not GFM compliant: {row!r}"


if __name__ == '__main__':
    # Run this file's tests verbosely and hand pytest's exit code to the
    # shell, so a failing run is visible to callers (CI, scripts) instead
    # of always exiting with status 0.
    sys.exit(pytest.main([__file__, '-v']))