diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py
index ca15b4534..bd4a47e6f 100644
--- a/crawl4ai/html2text/__init__.py
+++ b/crawl4ai/html2text/__init__.py
@@ -715,18 +715,20 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None:
                             self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
                             self.o("  \n")
                 if tag in ["td", "th"] and start:
-                    if self.split_next_td:
-                        self.o("| ")
+                    # Always output pipe before every cell (GFM compliance)
+                    self.o("| ")
                     self.split_next_td = True
 
                 if tag == "tr" and start:
                     self.td_count = 0
                 if tag == "tr" and not start:
+                    # Add trailing pipe for GFM compliance
+                    self.o(" |")
                     self.split_next_td = False
                     self.soft_br()
                 if tag == "tr" and not start and self.table_start:
-                    # Underline table header
-                    self.o("|".join(["---"] * self.td_count))
+                    # Underline table header with proper GFM format
+                    self.o("| " + " | ".join(["---"] * self.td_count) + " |")
                     self.soft_br()
                     self.table_start = False
                 if tag in ["td", "th"] and start:
diff --git a/tests/test_table_gfm_compliance.py b/tests/test_table_gfm_compliance.py
new file mode 100644
index 000000000..267d722c0
--- /dev/null
+++ b/tests/test_table_gfm_compliance.py
@@ -0,0 +1,153 @@
+"""
+Unit tests for GFM-compliant markdown table generation.
+
+Tests that html2text generates tables with proper leading and trailing
+pipe delimiters as per GitHub Flavored Markdown specification.
+
+Fixes: https://github.com/unclecode/crawl4ai/issues/1731
+"""
+
+import pytest
+import sys
+import os
+
+# Add parent directory to path to import html2text
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawl4ai.html2text import HTML2Text
+
+
+class TestTableGFMCompliance:
+    """Test suite for GFM-compliant table generation"""
+
+    def test_table_has_leading_pipes(self):
+        """Test that all table rows start with |"""
+        html = '<table><tr><th>A</th><th>B</th></tr><tr><td>1</td><td>2</td></tr></table>'
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        result = h.handle(html)
+
+        lines = [l.strip() for l in result.split('\n') if '|' in l]
+        assert len(lines) > 0, "No table rows found in output"
+
+        for i, line in enumerate(lines):
+            assert line.startswith('|'), f"Line {i+1} missing leading pipe: {repr(line)}"
+
+    def test_table_has_trailing_pipes(self):
+        """Test that all table rows end with |"""
+        html = '<table><tr><th>A</th><th>B</th></tr><tr><td>1</td><td>2</td></tr></table>'
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        result = h.handle(html)
+
+        lines = [l.strip() for l in result.split('\n') if '|' in l]
+        assert len(lines) > 0, "No table rows found in output"
+
+        for i, line in enumerate(lines):
+            assert line.endswith('|'), f"Line {i+1} missing trailing pipe: {repr(line)}"
+
+    def test_separator_row_has_pipes(self):
+        """Test that separator row has proper format | --- | --- |"""
+        html = '<table><tr><th>A</th><th>B</th></tr><tr><td>1</td><td>2</td></tr></table>'
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        result = h.handle(html)
+
+        # Find separator row (contains ---)
+        lines = [l.strip() for l in result.split('\n') if '---' in l]
+        assert len(lines) > 0, "No separator row found"
+
+        separator = lines[0]
+        assert separator.startswith('|'), f"Separator missing leading pipe: {repr(separator)}"
+        assert separator.endswith('|'), f"Separator missing trailing pipe: {repr(separator)}"
+
+        # Check format is like | --- | --- |
+        assert '|' in separator[1:-1], "Separator should have pipes between dashes"
+
+    def test_works_with_pad_tables_false(self):
+        """Test GFM compliance with pad_tables=False (default)"""
+        html = '<table><tr><th>A</th><th>B</th></tr><tr><td>1</td><td>2</td></tr></table>'
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        h.pad_tables = False
+        result = h.handle(html)
+
+        lines = [l.strip() for l in result.split('\n') if '|' in l]
+        assert len(lines) > 0, "No table rows found"
+
+        for line in lines:
+            assert line.startswith('|') and line.endswith('|'), \
+                f"Row not GFM compliant with pad_tables=False: {repr(line)}"
+
+    def test_works_with_pad_tables_true(self):
+        """Test GFM compliance with pad_tables=True"""
+        html = '<table><tr><th>A</th><th>B</th></tr><tr><td>1</td><td>2</td></tr></table>'
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        h.pad_tables = True
+        result = h.handle(html)
+
+        # Filter out table markers used by pad_tables
+        lines = [l.strip() for l in result.split('\n')
+                 if '|' in l and 'special_marker' not in l]
+        assert len(lines) > 0, "No table rows found"
+
+        for line in lines:
+            assert line.startswith('|') and line.endswith('|'), \
+                f"Row not GFM compliant with pad_tables=True: {repr(line)}"
+
+    def test_multirow_table(self):
+        """Test that large tables maintain GFM compliance"""
+        html = '''
+        <table>
+            <tr><th>Parameter</th><th>Guideline</th><th>Sources</th></tr>
+            <tr><td>Arsenic (2006)</td><td>0.010 ALARA</td><td>Naturally occurring</td></tr>
+            <tr><td>Lead (2019)</td><td>0.005 ALARA</td><td>Plumbing</td></tr>
+            <tr><td>Mercury (2019)</td><td>0.001</td><td>Industrial</td></tr>
+        </table>
+        '''
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        result = h.handle(html)
+
+        lines = [l.strip() for l in result.split('\n') if '|' in l]
+        # Should have: 1 header + 1 separator + 3 data rows = 5 rows
+        assert len(lines) >= 5, f"Expected at least 5 table rows, got {len(lines)}"
+
+        # All rows should be GFM compliant
+        for i, line in enumerate(lines):
+            assert line.startswith('|'), f"Line {i+1} missing leading pipe"
+            assert line.endswith('|'), f"Line {i+1} missing trailing pipe"
+
+    def test_single_column_table(self):
+        """Test GFM compliance with single column table"""
+        html = '<table><tr><th>Header</th></tr><tr><td>Data</td></tr></table>'
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        result = h.handle(html)
+
+        lines = [l.strip() for l in result.split('\n') if '|' in l]
+        assert len(lines) > 0, "No table rows found"
+
+        for line in lines:
+            assert line.startswith('|') and line.endswith('|'), \
+                f"Single column row not GFM compliant: {repr(line)}"
+
+    def test_empty_cells(self):
+        """Test GFM compliance with empty table cells"""
+        html = '<table><tr><th>A</th><th>B</th></tr><tr><td></td><td>Data</td></tr></table>'
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        result = h.handle(html)
+
+        lines = [l.strip() for l in result.split('\n') if '|' in l]
+        assert len(lines) > 0, "No table rows found"
+
+        for line in lines:
+            assert line.startswith('|') and line.endswith('|'), \
+                f"Row with empty cell not GFM compliant: {repr(line)}"
+
+
+if __name__ == '__main__':
+    # Run tests
+    pytest.main([__file__, '-v'])