diff --git a/pydoll/browser/tab.py b/pydoll/browser/tab.py index 0a4efbcd..3f2a41e7 100644 --- a/pydoll/browser/tab.py +++ b/pydoll/browser/tab.py @@ -51,6 +51,7 @@ TopLevelTargetRequired, WaitElementTimeout, ) +from pydoll.exporters.markdown import HTMLtoMarkdown from pydoll.interactions import KeyboardAPI, ScrollAPI from pydoll.protocol.browser.types import DownloadBehavior, DownloadProgressState from pydoll.protocol.page.events import PageEvent @@ -568,6 +569,32 @@ async def go_to(self, url: str, timeout: int = 300): logger.error(f'Page load timeout after {timeout}s for URL: {url}') raise PageLoadTimeout() + async def to_markdown(self, skip_nav: bool = True, skip_images: bool = False, **options) -> str: + """ + Export current page HTML to Markdown. + + Args: + skip_nav: Skip navigation elements (nav, aside, header, footer) + skip_images: Skip image elements + **options: Additional options for the converter + + Returns: + Markdown representation of the page + """ + + result = await self.execute_script('return document.documentElement.outerHTML') + + try: + html = result['result']['result']['value'] + except (KeyError, TypeError) as e: + raise ValueError( + f'Failed to extract HTML from page. Unexpected result structure: {type(result)}' + ) from e + + convertor = HTMLtoMarkdown(skip_nav=skip_nav, skip_images=skip_images, **options) + + return convertor.convert(html) + async def refresh( self, ignore_cache: bool = False, diff --git a/pydoll/exporters/__init__.py b/pydoll/exporters/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pydoll/exporters/markdown.py b/pydoll/exporters/markdown.py new file mode 100644 index 00000000..f075eeec --- /dev/null +++ b/pydoll/exporters/markdown.py @@ -0,0 +1,487 @@ +from __future__ import annotations + +import re +from html.parser import HTMLParser +from typing import Dict, List, Optional, Set + + +class HTMLtoMarkdown(HTMLParser): + """ + HTML to Markdown converter using Python's built-in HTMLParser. 
+
+ This class parses HTML content and converts it to Markdown format,
+ handling common HTML elements like headings, paragraphs, links,
+ images, lists, tables, code blocks, and text formatting.
+
+ The converter is designed to produce clean, readable Markdown output
+ suitable for documentation, content extraction, or archival purposes.
+ It automatically handles nested elements, maintains proper spacing,
+ and can optionally skip navigation elements and images.
+
+ Attributes:
+     SKIP_TAGS: HTML tags that are always skipped during conversion
+         (script, style, noscript, svg).
+     OPTIONAL_SKIP_TAGS: HTML tags that can be optionally skipped
+         (nav, aside, header, footer).
+     BLOCK_TAGS: HTML tags that represent block-level elements
+         and require proper spacing.
+
+ Example:
+     Convert HTML to Markdown::
+
+         converter = HTMLtoMarkdown(skip_nav=True)
+         markdown = converter.convert('<h1>Title</h1><p>Content</p>')
+         # Output: '# Title\n\nContent'
+
+     Skip images in conversion::
+
+         converter = HTMLtoMarkdown(skip_images=True)
+         markdown = converter.convert('<p>Text</p><img src="image.png">')
+         # Output: 'Text'
+ """
+
+ # Tags that are always skipped during conversion (per the class docstring).
+ SKIP_TAGS: Set[str] = {'script', 'style', 'noscript', 'svg'}
+
+ # Tags that can be optionally skipped (per the class docstring); the
+ # constructor documents skip_nav as skipping nav, aside, header and footer.
+ OPTIONAL_SKIP_TAGS: Set[str] = {'nav', 'aside', 'header', 'footer'}
+
+ # Block-level tags that require proper spacing (per the class docstring).
+ BLOCK_TAGS: Set[str] = {
+ 'p',
+ 'div',
+ 'h1',
+ 'h2',
+ 'h3',
+ 'h4',
+ 'h5',
+ 'h6',
+ 'ul',
+ 'ol',
+ 'li',
+ 'blockquote',
+ 'pre',
+ 'table',
+ 'tr',
+ 'article',
+ 'section',
+ }
+
+ # Tag-name groupings. Their consumers are not visible in this part of the
+ # class. NOTE(review): presumably used by the start/end tag handlers to pick
+ # the Markdown syntax for each element - confirm against the handlers.
+ BOLD_TAGS: Set[str] = {'strong', 'b'}
+ ITALIC_TAGS: Set[str] = {'em', 'i'}
+ TABLE_CELL_TAGS: Set[str] = {'td', 'th'}
+ CODE_TAGS: Set[str] = {'code', 'pre'}
+ LIST_TAGS: Set[str] = {'ul', 'ol', 'li'}
+ # Only the list containers; 'li' from LIST_TAGS is deliberately absent here.
+ LIST_END_TAGS: Set[str] = {'ul', 'ol'}
+ TABLE_STRUCTURE_TAGS: Set[str] = {'table', 'tr'}
+ SPECIAL_INLINE_TAGS: Set[str] = {'a', 'img', 'blockquote'}
+ # NOTE(review): presumably the length of a heading tag name such as 'h1'
+ # (one letter plus one digit) - confirm where this constant is compared.
+ HEADING_TAG_LENGTH: int = 2
+
+ def __init__(
+     self,
+     skip_nav: bool = True,
+     skip_images: bool = False,
+     code_fence: str = '```',
+     **kwargs,
+ ):
+     """
+     Build a converter with the given options and empty parsing state.
+
+     Args:
+         skip_nav: When True (the default), navigation-related elements
+             (nav, aside, header and footer tags) are skipped so that
+             only the main content is extracted.
+         skip_images: When True, img tags are skipped during conversion.
+             Defaults to False.
+         code_fence: Delimiter used for fenced code blocks. Defaults to
+             triple backticks (```).
+         **kwargs: Accepted for forward compatibility; not read by this
+             method (reserved for future use).
+
+     Example:
+         Keep navigation elements in the output::
+
+             converter = HTMLtoMarkdown(skip_nav=False)
+
+         Use tildes instead of backticks for code fences::
+
+             converter = HTMLtoMarkdown(code_fence='~~~')
+     """
+     # The base parser is initialised first, before any state is attached.
+     super().__init__()
+
+     # Conversion options, stored exactly as the caller passed them.
+     self.code_fence = code_fence
+     self.skip_images = skip_images
+     self.skip_nav = skip_nav
+
+     # Buffers that are unset (None) until something populates them.
+     self.table_buffer: Optional[Dict] = None
+     self.link_buffer: Optional[Dict] = None
+
+     # Depth counters, all starting at zero.
+     self.italic_depth: int = 0
+     self.bold_depth: int = 0
+     self.skip_depth: int = 0
+
+     # List bookkeeping, initially empty.
+     self.list_counters: List[int] = []
+     self.list_stack: List[str] = []
+
+     # Code-context flags, initially off.
+     self.in_code: bool = False
+     self.in_pre: bool = False
+
+     # Tag stack and output accumulator, initially empty.
+     self.current_tag_stack: List[str] = []
+     self.markdown: List[str] = []
+
+ def convert(self, html: str) -> str:
+ """
+ Convert HTML string to Markdown format.
+
+ This is the main entry point for conversion. It parses the provided
+ HTML content and returns a clean Markdown string with normalized
+ whitespace (no more than two consecutive newlines).
+
+ Args:
+ html: The HTML string to convert. Can be a complete HTML document
+ or a fragment.
+
+ Returns:
+ The converted Markdown string, stripped of leading and trailing
+ whitespace with normalized line breaks.
+
+ Example:
+ Convert a simple HTML fragment::
+
+ converter = HTMLtoMarkdown()
+ result = converter.convert('Hello world
') + # Returns: 'Hello **world**' + + Convert HTML with multiple elements:: + + html = ''' +First paragraph.
+Content
' + tab._connection_handler.execute_command.return_value = { + 'result': {'result': {'value': html_content}} + } + + result = await tab.to_markdown() + + assert '# Title' in result + assert 'Content' in result + assert_mock_called_at_least_once(tab._connection_handler) + + @pytest.mark.asyncio + async def test_to_markdown_skip_nav_default(self, tab): + """Test that navigation elements are skipped by default.""" + html_content = 'Main content
' + tab._connection_handler.execute_command.return_value = { + 'result': {'result': {'value': html_content}} + } + + result = await tab.to_markdown() + + assert 'Navigation' not in result + assert 'Main content' in result + + @pytest.mark.asyncio + async def test_to_markdown_include_nav(self, tab): + """Test including navigation elements when skip_nav=False.""" + html_content = 'Main content
' + tab._connection_handler.execute_command.return_value = { + 'result': {'result': {'value': html_content}} + } + + result = await tab.to_markdown(skip_nav=False) + + assert 'Navigation' in result + assert 'Main content' in result + + @pytest.mark.asyncio + async def test_to_markdown_skip_images(self, tab): + """Test skipping images when skip_images=True.""" + html_content = 'Text
'
+ tab._connection_handler.execute_command.return_value = {
+ 'result': {'result': {'value': html_content}}
+ }
+
+ result = await tab.to_markdown(skip_images=True)
+
+ assert 'Text' in result
+ assert '' in result
+
+ @pytest.mark.asyncio
+ async def test_to_markdown_custom_options(self, tab):
+ """Test passing custom options to converter."""
+ html_content = 'code' + tab._connection_handler.execute_command.return_value = { + 'result': {'result': {'value': html_content}} + } + + result = await tab.to_markdown(code_fence='~~~') + + assert '~~~' in result + assert '```' not in result + + @pytest.mark.asyncio + async def test_to_markdown_invalid_result_structure_keyerror(self, tab): + """Test ValueError raised when result has unexpected structure (KeyError).""" + # Missing 'result' key in response + tab._connection_handler.execute_command.return_value = {} + + with pytest.raises(ValueError, match='Failed to extract HTML from page'): + await tab.to_markdown() + + @pytest.mark.asyncio + async def test_to_markdown_invalid_result_structure_typeerror(self, tab): + """Test ValueError raised when result has None values (TypeError).""" + # Value is None instead of expected dict + tab._connection_handler.execute_command.return_value = { + 'result': {'result': None} + } + + with pytest.raises(ValueError, match='Failed to extract HTML from page'): + await tab.to_markdown() + + @pytest.mark.asyncio + async def test_to_markdown_complex_html(self, tab): + """Test conversion of complex HTML with multiple elements.""" + html_content = ''' + + +
Introduction with bold and italic.
+This is a paragraph.
') + assert result == 'This is a paragraph.' + + def test_convert_multiple_paragraphs(self): + """Test conversion of multiple paragraphs with proper spacing.""" + converter = HTMLtoMarkdown() + html = 'First paragraph.
Second paragraph.
' + result = converter.convert(html) + assert 'First paragraph.' in result + assert 'Second paragraph.' in result + # Should have proper separation + assert '\n\n' in result + + def test_convert_line_break(self): + """Test conversion of br element.""" + converter = HTMLtoMarkdown() + result = converter.convert('Line oneprint() function')
+ assert result == 'Use the `print()` function'
+
+ def test_convert_nested_formatting(self):
+     """Test conversion of nested formatting elements.
+
+     The input nests an emphasis element inside a strong element so that
+     both markers are expected around one contiguous run of text.
+     """
+     converter = HTMLtoMarkdown()
+     # The markup is required: plain text would give the converter nothing
+     # to turn into '**' / '*' markers.
+     result = converter.convert('<strong><em>bold and italic</em></strong>')
+     assert '**' in result
+     assert '*' in result
+     assert 'bold and italic' in result
+
+class TestHTMLtoMarkdownLinks:
+    """Tests for link conversion."""
+
+    def test_convert_link(self):
+        """Test conversion of anchor tag to markdown link."""
+        converter = HTMLtoMarkdown()
+        # The anchor element supplies the URL the assertion expects.
+        result = converter.convert('<a href="https://example.com">Example</a>')
+        assert result == '[Example](https://example.com)'
+
+    def test_convert_link_without_href(self):
+        """Test conversion of anchor tag without href."""
+        converter = HTMLtoMarkdown()
+        # No href attribute: the expected link target is empty.
+        result = converter.convert('<a>No Link</a>')
+        assert result == '[No Link]()'
+
+    def test_convert_link_with_formatted_text(self):
+        """Test conversion of link with formatted text."""
+        converter = HTMLtoMarkdown()
+        # The whole label sits inside one formatting element so that the
+        # text 'Bold Link' stays contiguous in the output.
+        result = converter.convert(
+            '<a href="https://example.com"><strong>Bold Link</strong></a>'
+        )
+        assert 'Bold Link' in result
+        assert 'https://example.com' in result
+
+
+class TestHTMLtoMarkdownImages:
+ """Tests for image conversion."""
+
+ def test_convert_image(self):
+ """Test conversion of img tag to markdown image."""
+ converter = HTMLtoMarkdown()
+ result = converter.convert('
')
+ assert result == ''
+
+ def test_convert_image_without_alt(self):
+ """Test conversion of img tag without alt text."""
+ converter = HTMLtoMarkdown()
+ result = converter.convert('
')
+ assert result == ''
+
+ def test_skip_images_option(self):
+ """Test that images are skipped when skip_images is True."""
+ converter = HTMLtoMarkdown(skip_images=True)
+ result = converter.convert('Text
')
+ assert '
Text

'
+ result = converter.convert(html)
+ assert '![' not in result
+ assert '1.png' not in result
+ assert '2.png' not in result
+ assert '3.png' not in result
+ assert 'Text' in result
+
+
+class TestHTMLtoMarkdownLists:
+ """Tests for list conversion."""
+
+ def test_convert_unordered_list(self):
+ """Test conversion of unordered list."""
+ converter = HTMLtoMarkdown()
+ html = 'def hello():\n print("Hello")'
+ result = converter.convert(html)
+ assert '```' in result
+ assert 'def hello():' in result
+ assert 'print("Hello")' in result
+
+ def test_convert_pre_block_preserves_whitespace(self):
+ """Test that pre blocks preserve whitespace."""
+ converter = HTMLtoMarkdown()
+ html = 'Line 1\n Line 2 indented' + result = converter.convert(html) + assert 'Line 1\n Line 2 indented' in result + + def test_custom_code_fence(self): + """Test conversion with custom code fence.""" + converter = HTMLtoMarkdown(code_fence='~~~') + html = '
code here' + result = converter.convert(html) + assert '~~~' in result + assert '```' not in result + + +class TestHTMLtoMarkdownBlockquotes: + """Tests for blockquote conversion.""" + + def test_convert_blockquote(self): + """Test conversion of blockquote element.""" + converter = HTMLtoMarkdown() + result = converter.convert('
Quoted text') + assert '> Quoted text' in result + + +class TestHTMLtoMarkdownTables: + """Tests for table conversion.""" + + def test_convert_simple_table(self): + """Test conversion of simple table.""" + converter = HTMLtoMarkdown() + html = ''' +
| Name | Age |
|---|---|
| Alice | 25 |
| Bob | 30 |
| A | B | C |
|---|---|---|
| 1 | 2 |
Text
' + result = converter.convert(html) + assert 'alert' not in result + assert 'Text' in result + + def test_skip_style_tags(self): + """Test that style tags are skipped.""" + converter = HTMLtoMarkdown() + html = 'Text
' + result = converter.convert(html) + assert 'color' not in result + assert '.class' not in result + assert 'Text' in result + + def test_skip_noscript_tags(self): + """Test that noscript tags are skipped.""" + converter = HTMLtoMarkdown() + html = 'Text
' + result = converter.convert(html) + assert 'Enable JavaScript' not in result + assert 'Text' in result + + def test_skip_svg_tags(self): + """Test that svg tags are skipped.""" + converter = HTMLtoMarkdown() + html = 'Text
' + result = converter.convert(html) + assert 'circle' not in result + assert 'Text' in result + + +class TestHTMLtoMarkdownNavSkipping: + """Tests for navigation element skipping.""" + + def test_skip_nav_by_default(self): + """Test that nav elements are skipped by default.""" + converter = HTMLtoMarkdown() + html = 'Main content
' + result = converter.convert(html) + assert 'Navigation content' not in result + assert 'Main content' in result + + def test_skip_header_by_default(self): + """Test that header elements are skipped by default.""" + converter = HTMLtoMarkdown() + html = 'Main content
' + result = converter.convert(html) + assert 'Header content' not in result + assert 'Main content' in result + + def test_skip_footer_by_default(self): + """Test that footer elements are skipped by default.""" + converter = HTMLtoMarkdown() + html = 'Main content
' + result = converter.convert(html) + assert 'Footer content' not in result + assert 'Main content' in result + + def test_skip_aside_by_default(self): + """Test that aside elements are skipped by default.""" + converter = HTMLtoMarkdown() + html = 'Main content
' + result = converter.convert(html) + assert 'Sidebar content' not in result + assert 'Main content' in result + + def test_include_nav_when_skip_nav_false(self): + """Test that nav elements are included when skip_nav is False.""" + converter = HTMLtoMarkdown(skip_nav=False) + html = 'Main content
' + result = converter.convert(html) + assert 'Navigation content' in result + assert 'Main content' in result + + def test_include_header_when_skip_nav_false(self): + """Test that header elements are included when skip_nav is False.""" + converter = HTMLtoMarkdown(skip_nav=False) + html = 'Main content
' + result = converter.convert(html) + assert 'Header content' in result + assert 'Main content' in result + + +class TestHTMLtoMarkdownComplexHtml: + """Tests for complex HTML structures.""" + + def test_convert_full_page_structure(self): + """Test conversion of a full page structure.""" + converter = HTMLtoMarkdown() + html = ''' + + +Introduction paragraph.
+Content with bold and italic.
+Nested content
Start bold and italic end.
+code block
+ '''
+ result = converter.convert(html)
+ assert 'Start' in result
+ assert '**' in result
+ assert '*' in result
+ assert '```' in result
+ assert 'code block' in result
+
+
+class TestHTMLtoMarkdownEdgeCases:
+ """Tests for edge cases and special scenarios."""
+
+ def test_empty_tags(self):
+ """Test conversion of empty tags."""
+ converter = HTMLtoMarkdown()
+ result = converter.convert('')
+ # Should not crash and return empty or minimal content
+ assert isinstance(result, str)
+
+ def test_self_closing_tags(self):
+ """Test conversion of self-closing tags."""
+ converter = HTMLtoMarkdown()
+ result = converter.convert('
')
+ assert isinstance(result, str)
+
+ def test_excessive_newlines_normalized(self):
+ """Test that excessive newlines are normalized to double newlines."""
+ converter = HTMLtoMarkdown()
+ html = 'First
Second
' + result = converter.convert(html) + # Should not have more than 2 consecutive newlines + assert '\n\n\n' not in result + + def test_special_characters_in_text(self): + """Test that special characters in text are preserved.""" + converter = HTMLtoMarkdown() + html = 'Special chars: & < > "
' + result = converter.convert(html) + # HTML entities should be preserved or decoded + assert 'Special chars:' in result + + def test_unicode_content(self): + """Test conversion of unicode content.""" + converter = HTMLtoMarkdown() + html = 'Unicode: 你好 مرحبا שלום 🚀
' + result = converter.convert(html) + assert '你好' in result + assert '🚀' in result + + def test_deeply_nested_skipped_tags(self): + """Test that deeply nested content in skipped tags is ignored.""" + converter = HTMLtoMarkdown() + html = ''' + +Visible
+ ''' + result = converter.convert(html) + assert 'Should not appear' not in result + assert 'Visible' in result + + def test_unclosed_tags(self): + """Test that unclosed tags are handled gracefully.""" + converter = HTMLtoMarkdown() + # HTMLParser handles unclosed tags, test it doesn't crash + html = 'Unclosed paragraphBold without close'
+ result = converter.convert(html)
+ assert 'Unclosed paragraph' in result
+
+ def test_multiple_conversions_same_instance(self):
+ """Test that same converter instance can be reused for multiple conversions."""
+ converter = HTMLtoMarkdown()
+ result1 = converter.convert(' First conversion