diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py
index ca15b4534..bd4a47e6f 100644
--- a/crawl4ai/html2text/__init__.py
+++ b/crawl4ai/html2text/__init__.py
@@ -715,18 +715,20 @@ def link_url(self: HTML2Text, link: str, title: str = "") -> None:
self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
self.o(" \n")
if tag in ["td", "th"] and start:
- if self.split_next_td:
- self.o("| ")
+ # Always output pipe before every cell (GFM compliance)
+ self.o("| ")
self.split_next_td = True
if tag == "tr" and start:
self.td_count = 0
if tag == "tr" and not start:
+ # Add trailing pipe for GFM compliance
+ self.o(" |")
self.split_next_td = False
self.soft_br()
if tag == "tr" and not start and self.table_start:
- # Underline table header
- self.o("|".join(["---"] * self.td_count))
+ # Underline table header with proper GFM format
+ self.o("| " + " | ".join(["---"] * self.td_count) + " |")
self.soft_br()
self.table_start = False
if tag in ["td", "th"] and start:
diff --git a/tests/test_table_gfm_compliance.py b/tests/test_table_gfm_compliance.py
new file mode 100644
index 000000000..267d722c0
--- /dev/null
+++ b/tests/test_table_gfm_compliance.py
@@ -0,0 +1,153 @@
+"""
+Unit tests for GFM-compliant markdown table generation.
+
+Tests that html2text generates tables with proper leading and trailing
+pipe delimiters as per GitHub Flavored Markdown specification.
+
+Fixes: https://github.com/unclecode/crawl4ai/issues/1731
+"""
+
+import pytest
+import sys
+import os
+
+# Add parent directory to path to import html2text
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawl4ai.html2text import HTML2Text
+
+
+class TestTableGFMCompliance:
+    """Check that HTML tables convert to GFM rows with leading and trailing pipes."""
+
+    def test_table_has_leading_pipes(self):
+        """Test that all table rows start with |"""
+        html = '<table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Cell 1</td><td>Cell 2</td></tr></table>'
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        result = h.handle(html)
+
+        lines = [row.strip() for row in result.split('\n') if '|' in row]
+        assert len(lines) > 0, "No table rows found in output"
+
+        for i, line in enumerate(lines):
+            assert line.startswith('|'), f"Line {i+1} missing leading pipe: {repr(line)}"
+
+    def test_table_has_trailing_pipes(self):
+        """Test that all table rows end with |"""
+        html = '<table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Cell 1</td><td>Cell 2</td></tr></table>'
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        result = h.handle(html)
+
+        lines = [row.strip() for row in result.split('\n') if '|' in row]
+        assert len(lines) > 0, "No table rows found in output"
+
+        for i, line in enumerate(lines):
+            assert line.endswith('|'), f"Line {i+1} missing trailing pipe: {repr(line)}"
+
+    def test_separator_row_has_pipes(self):
+        """Test that separator row has proper format | --- | --- |"""
+        html = '<table><tr><th>A</th><th>B</th></tr><tr><td>1</td><td>2</td></tr></table>'
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        result = h.handle(html)
+
+        # Find separator row (contains ---)
+        lines = [row.strip() for row in result.split('\n') if '---' in row]
+        assert len(lines) > 0, "No separator row found"
+
+        separator = lines[0]
+        assert separator.startswith('|'), f"Separator missing leading pipe: {repr(separator)}"
+        assert separator.endswith('|'), f"Separator missing trailing pipe: {repr(separator)}"
+
+        # Two header cells must yield exactly one '---' group per column
+        assert separator == '| --- | --- |', f"Unexpected separator format: {repr(separator)}"
+
+    def test_works_with_pad_tables_false(self):
+        """Test GFM compliance with pad_tables=False (default)"""
+        html = '<table><tr><th>A</th><th>B</th></tr><tr><td>1</td><td>2</td></tr></table>'
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        h.pad_tables = False
+        result = h.handle(html)
+
+        lines = [row.strip() for row in result.split('\n') if '|' in row]
+        assert len(lines) > 0, "No table rows found"
+
+        for line in lines:
+            assert line.startswith('|') and line.endswith('|'), \
+                f"Row not GFM compliant with pad_tables=False: {repr(line)}"
+
+    def test_works_with_pad_tables_true(self):
+        """Test GFM compliance with pad_tables=True"""
+        html = '<table><tr><th>A</th><th>B</th></tr><tr><td>1</td><td>2</td></tr></table>'
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        h.pad_tables = True
+        result = h.handle(html)
+
+        # Filter out table markers used by pad_tables
+        lines = [row.strip() for row in result.split('\n')
+                 if '|' in row and 'special_marker' not in row]
+        assert len(lines) > 0, "No table rows found"
+
+        for line in lines:
+            assert line.startswith('|') and line.endswith('|'), \
+                f"Row not GFM compliant with pad_tables=True: {repr(line)}"
+
+    def test_multirow_table(self):
+        """Test that large tables maintain GFM compliance"""
+        html = '''
+        <table>
+            <tr><th>Parameter</th><th>Guideline</th><th>Sources</th></tr>
+            <tr><td>Arsenic (2006)</td><td>0.010 ALARA</td><td>Naturally occurring</td></tr>
+            <tr><td>Lead (2019)</td><td>0.005 ALARA</td><td>Plumbing</td></tr>
+            <tr><td>Mercury (2019)</td><td>0.001</td><td>Industrial</td></tr>
+        </table>
+        '''
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        result = h.handle(html)
+
+        lines = [row.strip() for row in result.split('\n') if '|' in row]
+        # Should have: 1 header + 1 separator + 3 data rows = 5 rows
+        assert len(lines) >= 5, f"Expected at least 5 table rows, got {len(lines)}"
+
+        # All rows should be GFM compliant
+        for i, line in enumerate(lines):
+            assert line.startswith('|'), f"Line {i+1} missing leading pipe"
+            assert line.endswith('|'), f"Line {i+1} missing trailing pipe"
+
+    def test_single_column_table(self):
+        """Test GFM compliance with single column table"""
+        html = '<table><tr><th>Header</th></tr><tr><td>Cell</td></tr></table>'
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        result = h.handle(html)
+
+        lines = [row.strip() for row in result.split('\n') if '|' in row]
+        assert len(lines) > 0, "No table rows found"
+
+        for line in lines:
+            assert line.startswith('|') and line.endswith('|'), \
+                f"Single column row not GFM compliant: {repr(line)}"
+
+    def test_empty_cells(self):
+        """Test GFM compliance with empty table cells"""
+        html = '<table><tr><th>A</th><th>B</th></tr><tr><td></td><td>Data</td></tr></table>'
+        h = HTML2Text()
+        h.body_width = 0  # Disable wrapping
+        result = h.handle(html)
+
+        lines = [row.strip() for row in result.split('\n') if '|' in row]
+        assert len(lines) > 0, "No table rows found"
+
+        for line in lines:
+            assert line.startswith('|') and line.endswith('|'), \
+                f"Row with empty cell not GFM compliant: {repr(line)}"
+
+
+if __name__ == '__main__':
+ # Run tests
+ pytest.main([__file__, '-v'])