class HTMLtoMarkdown(HTMLParser):
    """
    HTML to Markdown converter using Python's built-in HTMLParser.

    This class parses HTML content and converts it to Markdown format,
    handling common HTML elements like headings, paragraphs, links,
    images, lists, tables, code blocks, and text formatting.

    Output is written to one of three sinks, checked in this order: the
    text buffer of the currently open link, the content buffer of the
    currently open table cell, or the main ``markdown`` chunk list. Inline
    markup therefore stays attached to the link or cell it appears in.

    Attributes:
        SKIP_TAGS: HTML tags that are always skipped during conversion
            (script, style, noscript, svg).
        OPTIONAL_SKIP_TAGS: HTML tags that are skipped when ``skip_nav``
            is True (nav, aside, header, footer).
        BLOCK_TAGS: HTML tags that represent block-level elements.

    Example:
        Convert HTML to Markdown::

            converter = HTMLtoMarkdown(skip_nav=True)
            markdown = converter.convert('<h1>Title</h1><p>Content</p>')
            # Output: '# Title\\n\\nContent'

        Skip images in conversion::

            converter = HTMLtoMarkdown(skip_images=True)
            markdown = converter.convert('<p>Text</p><img src="a.png">')
            # Output: 'Text'
    """

    SKIP_TAGS: Set[str] = {'script', 'style', 'noscript', 'svg'}

    OPTIONAL_SKIP_TAGS: Set[str] = {'nav', 'aside', 'header', 'footer'}

    BLOCK_TAGS: Set[str] = {
        'p',
        'div',
        'h1',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'ul',
        'ol',
        'li',
        'blockquote',
        'pre',
        'table',
        'tr',
        'article',
        'section',
    }

    BOLD_TAGS: Set[str] = {'strong', 'b'}
    ITALIC_TAGS: Set[str] = {'em', 'i'}
    TABLE_CELL_TAGS: Set[str] = {'td', 'th'}
    CODE_TAGS: Set[str] = {'code', 'pre'}
    LIST_TAGS: Set[str] = {'ul', 'ol', 'li'}
    LIST_END_TAGS: Set[str] = {'ul', 'ol'}
    TABLE_STRUCTURE_TAGS: Set[str] = {'table', 'tr'}
    SPECIAL_INLINE_TAGS: Set[str] = {'a', 'img', 'blockquote'}
    HEADING_TAG_LENGTH: int = 2

    # Generic containers: no Markdown syntax of their own, but their content
    # must not run into the neighbouring block's text.
    _CONTAINER_TAGS: Set[str] = {'div', 'article', 'section'}
    # Closing any of these ends the current block of text.
    _SOFT_BREAK_END_TAGS: Set[str] = {'p', 'blockquote', 'div', 'article', 'section'}
    # Indentation added per list nesting level.
    _LIST_INDENT: str = '  '

    def __init__(
        self, skip_nav: bool = True, skip_images: bool = False, code_fence: str = '```', **kwargs
    ):
        """
        Initialize the HTML to Markdown converter.

        Args:
            skip_nav: If True, skips nav, aside, header, and footer
                elements together with their content. Defaults to True.
            skip_images: If True, img tags produce no output.
                Defaults to False.
            code_fence: The string used to open and close fenced code
                blocks. Defaults to triple backticks.
            **kwargs: Accepted and ignored (reserved for future use).
        """
        super().__init__()
        self.skip_nav = skip_nav
        self.skip_images = skip_images
        self.code_fence = code_fence
        self._reset_state()

    def _reset_state(self) -> None:
        """Put every piece of per-document conversion state back to its initial value."""
        self.markdown: List[str] = []
        self.skip_depth: int = 0
        self.current_tag_stack: List[str] = []

        self.in_pre: bool = False
        self.in_code: bool = False
        self.list_stack: List[str] = []
        self.list_counters: List[int] = []

        self.bold_depth: int = 0
        self.italic_depth: int = 0

        self.link_buffer: Optional[Dict] = None
        self.table_buffer: Optional[Dict] = None

    def convert(self, html: str) -> str:
        """
        Convert an HTML string to Markdown.

        Each call starts from a clean state, so one instance can convert
        several documents in a row.

        Args:
            html: The HTML string to convert (document or fragment).

        Returns:
            The Markdown text, stripped of leading and trailing whitespace,
            with runs of three or more newlines collapsed to two.
        """
        self.reset()
        self._reset_state()

        self.feed(html)
        # feed() may hold back trailing text (for example text ending in an
        # unterminated '&...' reference); close() forces it through.
        self.close()
        self._flush_open_buffers()

        result = ''.join(self.markdown)
        result = re.sub(r'\n{3,}', '\n\n', result)
        return result.strip()

    def _flush_open_buffers(self) -> None:
        """Emit a link or table that was still open when the input ended."""
        if self.link_buffer is not None:
            self._handle_link_end()
        if self.table_buffer is not None:
            self._handle_table_end('table')

    # ------------------------------------------------------------------
    # Output sinks
    # ------------------------------------------------------------------

    def _active_sink(self) -> List[str]:
        """Return the chunk list that output must currently be appended to."""
        if self.link_buffer is not None:
            return self.link_buffer['text']
        if self.table_buffer is not None and self.table_buffer.get('in_cell'):
            return self.table_buffer['cell_content']
        return self.markdown

    def _is_buffering(self) -> bool:
        """Return True while output is captured by an open link or table cell."""
        return self._active_sink() is not self.markdown

    def _emit(self, text: str) -> None:
        """Append ``text`` to the active sink."""
        self._active_sink().append(text)

    def _soft_space(self) -> None:
        """Append one space unless the active sink is empty or already ends in whitespace."""
        sink = self._active_sink()
        if sink and not sink[-1][-1:].isspace():
            sink.append(' ')

    def _soft_break(self) -> None:
        """
        Separate paragraph-like content from what precedes it.

        In the main flow this is a blank line. Inside a link, a table cell
        or a list item a blank line would break the surrounding construct,
        so a single space is used instead.
        """
        if self._is_buffering() or self.list_stack:
            self._soft_space()
            return
        # Directly after a '> ' marker the text belongs on the marker's line.
        if self.markdown and self.markdown[-1] == '> ':
            return
        self._add_block_separator()

    def _add_block_separator(self) -> None:
        """
        Add a blank line before a block-level element.

        Nothing is added at the very start of the output or when the last
        chunk already ends with a blank line. While output is captured by
        a link or table cell, a single space is used instead because those
        constructs cannot contain line breaks.
        """
        if self._is_buffering():
            self._soft_space()
            return
        if self.markdown and not self.markdown[-1].endswith('\n\n'):
            if self.markdown[-1].endswith('\n'):
                self.markdown.append('\n')
            else:
                self.markdown.append('\n\n')

    # ------------------------------------------------------------------
    # Start-tag handlers
    # ------------------------------------------------------------------

    def _is_heading_tag(self, tag: str) -> bool:
        """Check if a tag is a valid heading tag (h1-h6)."""
        return tag.startswith('h') and len(tag) == self.HEADING_TAG_LENGTH and tag[1].isdigit()

    def _handle_heading_start(self, tag: str) -> None:
        """Handle opening heading tags (h1-h6)."""
        if self._is_buffering():
            # '#' markers are meaningless inside link text or a table cell.
            self._soft_space()
            return
        level = int(tag[1])
        self._add_block_separator()
        self._emit('#' * level + ' ')

    def _handle_formatting_start(self, tag: str) -> None:
        """Handle opening formatting tags (bold, italic)."""
        if tag in self.BOLD_TAGS:
            self._emit('**')
            self.bold_depth += 1
        elif tag in self.ITALIC_TAGS:
            self._emit('*')
            self.italic_depth += 1

    def _handle_code_start(self, tag: str) -> None:
        """Handle opening code/pre tags."""
        if tag == 'code' and not self.in_pre:
            self._emit('`')
            self.in_code = True
        elif tag == 'pre':
            self._add_block_separator()
            self._emit(f'{self.code_fence}\n')
            self.in_pre = True

    def _handle_simple_block_start(self, tag: str) -> None:
        """Handle simple block tags (p, br)."""
        if tag == 'p':
            self._soft_break()
        else:  # br
            self._emit('\n')

    def _handle_special_start(self, tag: str, attrs_dict: Dict[str, Optional[str]]) -> None:
        """Handle special inline tags (a, img, blockquote)."""
        if tag == 'a':
            # Anchors cannot nest; a new one implicitly ends the previous one.
            if self.link_buffer is not None:
                self._handle_link_end()
            # Valueless attributes are reported as None by HTMLParser.
            self.link_buffer = {'href': attrs_dict.get('href') or '', 'text': []}
        elif tag == 'img':
            if not self.skip_images:
                src = attrs_dict.get('src') or ''
                alt = attrs_dict.get('alt') or ''
                self._add_block_separator()
                self._emit(f'![{alt}]({src})')
        else:  # blockquote
            if self._is_buffering():
                self._soft_space()
                return
            self._add_block_separator()
            self._emit('> ')

    def _handle_list_start(self, tag: str) -> None:
        """Handle opening list tags (ul, ol, li)."""
        if tag in self.LIST_END_TAGS:
            # Only a top-level list needs a blank line before it; a nested
            # list continues the parent item.
            if not self.list_stack:
                self._add_block_separator()
            self.list_stack.append(tag)
            if tag == 'ol':
                self.list_counters.append(1)
            return

        # tag == 'li'; an item outside any list produces no marker.
        if not self.list_stack:
            return
        indent = self._LIST_INDENT * (len(self.list_stack) - 1)
        if self.list_stack[-1] == 'ul':
            self._emit(f'\n{indent}- ')
        elif self.list_counters:
            counter = self.list_counters[-1]
            self._emit(f'\n{indent}{counter}. ')
            self.list_counters[-1] += 1

    def _handle_table_start(self, tag: str) -> None:
        """Handle opening table tags (table, tr, td, th)."""
        if tag == 'table':
            self.table_buffer = {
                'rows': [],
                'current_row': [],
                'in_cell': False,
                'cell_content': [],
            }
            return
        if self.table_buffer is None:
            return
        if tag == 'tr':
            # Closing tags are optional in HTML: a new row ends the previous one.
            self._close_row()
        elif tag in self.TABLE_CELL_TAGS:
            self._close_cell()
            self.table_buffer['in_cell'] = True
            self.table_buffer['cell_content'] = []

    def handle_starttag(self, tag: str, attrs: List[tuple]):
        """
        Process an HTML opening tag.

        Called by HTMLParser for every opening tag. Tags in SKIP_TAGS, and
        tags in OPTIONAL_SKIP_TAGS when ``skip_nav`` is set, increase the
        skip depth; while the skip depth is positive all tags and text are
        ignored.

        Args:
            tag: The HTML tag name (lowercase).
            attrs: List of (name, value) tuples for tag attributes.
        """
        if tag in self.SKIP_TAGS:
            self.skip_depth += 1
            return

        if self.skip_nav and tag in self.OPTIONAL_SKIP_TAGS:
            self.skip_depth += 1
            return

        if self.skip_depth > 0:
            return

        self.current_tag_stack.append(tag)
        attrs_dict = dict(attrs)

        if self._is_heading_tag(tag):
            self._handle_heading_start(tag)
        elif tag in {'p', 'br'}:
            self._handle_simple_block_start(tag)
        elif tag in self.BOLD_TAGS or tag in self.ITALIC_TAGS:
            self._handle_formatting_start(tag)
        elif tag in self.CODE_TAGS:
            self._handle_code_start(tag)
        elif tag in self.SPECIAL_INLINE_TAGS:
            self._handle_special_start(tag, attrs_dict)
        elif tag in self.LIST_TAGS:
            self._handle_list_start(tag)
        elif tag in self.TABLE_STRUCTURE_TAGS or tag in self.TABLE_CELL_TAGS:
            self._handle_table_start(tag)
        elif tag in self._CONTAINER_TAGS:
            self._soft_break()

    # ------------------------------------------------------------------
    # End-tag handlers
    # ------------------------------------------------------------------

    def _handle_formatting_end(self, tag: str) -> None:
        """Handle closing formatting tags; unmatched closers are ignored."""
        if tag in self.BOLD_TAGS:
            if self.bold_depth > 0:
                self._emit('**')
                self.bold_depth -= 1
        elif tag in self.ITALIC_TAGS:
            if self.italic_depth > 0:
                self._emit('*')
                self.italic_depth -= 1

    def _handle_code_end(self, tag: str) -> None:
        """Handle closing code/pre tags."""
        if tag == 'code' and not self.in_pre:
            self._emit('`')
            self.in_code = False
        elif tag == 'pre':
            # The closing fence must start on its own line.
            sink = self._active_sink()
            if sink and not sink[-1].endswith('\n'):
                sink.append('\n')
            self._emit(f'{self.code_fence}\n')
            self.in_pre = False

    def _handle_link_end(self) -> None:
        """Emit the buffered link as ``[text](href)`` into the enclosing sink."""
        buffer = self.link_buffer
        if buffer is None:
            return
        # Detach the buffer first so _emit targets the enclosing sink.
        self.link_buffer = None
        text = re.sub(r'\s+', ' ', ''.join(buffer['text'])).strip()
        href = buffer['href']
        # An anchor with neither text nor target carries no content.
        if text or href:
            self._emit(f'[{text}]({href})')

    def _handle_list_end(self, tag: str) -> None:
        """Handle closing list tags (ul, ol)."""
        if not self.list_stack or self.list_stack[-1] != tag:
            return
        self.list_stack.pop()
        if tag == 'ol' and self.list_counters:
            self.list_counters.pop()
        if not self.list_stack:
            self._emit('\n')

    def _close_cell(self) -> None:
        """Move the open cell's text into the current row, if a cell is open."""
        buffer = self.table_buffer
        if buffer is None or not buffer.get('in_cell'):
            return
        # A table row is a single line, so collapse all whitespace runs.
        content = re.sub(r'\s+', ' ', ''.join(buffer['cell_content'])).strip()
        buffer['current_row'].append(content)
        buffer['in_cell'] = False
        buffer['cell_content'] = []

    def _close_row(self) -> None:
        """Finish the open cell and store the current row if it has cells."""
        buffer = self.table_buffer
        if buffer is None:
            return
        self._close_cell()
        if buffer['current_row']:
            buffer['rows'].append(buffer['current_row'])
        buffer['current_row'] = []

    def _handle_table_end(self, tag: str) -> None:
        """Handle closing table tags (table, tr, td, th)."""
        if self.table_buffer is None:
            return
        if tag == 'table':
            self._close_row()
            self._render_table()
            self.table_buffer = None
        elif tag == 'tr':
            self._close_row()
        elif tag in self.TABLE_CELL_TAGS:
            self._close_cell()

    def handle_endtag(self, tag: str) -> None:
        """
        Process an HTML closing tag.

        Called by HTMLParser for every closing tag. Mirrors the skip logic
        of ``handle_starttag`` and emits the closing syntax of elements
        that need one.

        Args:
            tag: The HTML tag name (lowercase).
        """
        if tag in self.SKIP_TAGS:
            self.skip_depth = max(0, self.skip_depth - 1)
            return

        if self.skip_nav and tag in self.OPTIONAL_SKIP_TAGS:
            self.skip_depth = max(0, self.skip_depth - 1)
            return

        if self.skip_depth > 0:
            return

        if self.current_tag_stack and self.current_tag_stack[-1] == tag:
            self.current_tag_stack.pop()

        if self._is_heading_tag(tag):
            if self._is_buffering():
                self._soft_space()
            else:
                self._emit('\n')
        elif tag in self.BOLD_TAGS or tag in self.ITALIC_TAGS:
            self._handle_formatting_end(tag)
        elif tag in self.CODE_TAGS:
            self._handle_code_end(tag)
        elif tag == 'a':
            self._handle_link_end()
        elif tag in self.LIST_END_TAGS:
            self._handle_list_end(tag)
        elif tag in self.TABLE_STRUCTURE_TAGS or tag in self.TABLE_CELL_TAGS:
            self._handle_table_end(tag)
        elif tag in self._SOFT_BREAK_END_TAGS:
            self._soft_break()

    def handle_data(self, data: str) -> None:
        """
        Process text content.

        Called by HTMLParser for text between tags. Text inside skipped
        elements is dropped. Text inside ``pre`` is kept verbatim. All other
        text has whitespace runs collapsed to one space, and leading
        whitespace is dropped when the active sink is empty or already ends
        in whitespace.

        Args:
            data: The text content to process.
        """
        if self.skip_depth > 0:
            return

        if self.in_pre:
            self._emit(data)
            return

        cleaned = re.sub(r'\s+', ' ', data)
        sink = self._active_sink()
        if not sink or sink[-1][-1:].isspace():
            cleaned = cleaned.lstrip()
        if cleaned:
            sink.append(cleaned)

    def _render_table(self) -> None:
        """
        Render the buffered rows as a pipe-delimited Markdown table.

        The first row is used as the header and is followed by a ``---``
        separator row. Every row, header included, is padded with empty
        cells to the width of the widest row, and literal ``|`` characters
        in cell text are backslash-escaped. Output format::

            | Header 1 | Header 2 |
            |---|---|
            | Cell 1 | Cell 2 |

        Does nothing if there is no table buffer or it has no rows.
        """
        if not self.table_buffer or not self.table_buffer['rows']:
            return

        rows = self.table_buffer['rows']
        width = max(len(row) for row in rows)

        self._add_block_separator()

        for index, row in enumerate(rows):
            cells = [cell.replace('|', '\\|') for cell in row]
            cells += [''] * (width - len(cells))
            self._emit('| ' + ' | '.join(cells) + ' |\n')
            if index == 0:
                self._emit('|' + '|'.join(['---'] * width) + '|\n')

        self._emit('\n')

Title

Content

' + tab._connection_handler.execute_command.return_value = { + 'result': {'result': {'value': html_content}} + } + + result = await tab.to_markdown() + + assert '# Title' in result + assert 'Content' in result + assert_mock_called_at_least_once(tab._connection_handler) + + @pytest.mark.asyncio + async def test_to_markdown_skip_nav_default(self, tab): + """Test that navigation elements are skipped by default.""" + html_content = '

Main content

' + tab._connection_handler.execute_command.return_value = { + 'result': {'result': {'value': html_content}} + } + + result = await tab.to_markdown() + + assert 'Navigation' not in result + assert 'Main content' in result + + @pytest.mark.asyncio + async def test_to_markdown_include_nav(self, tab): + """Test including navigation elements when skip_nav=False.""" + html_content = '

Main content

' + tab._connection_handler.execute_command.return_value = { + 'result': {'result': {'value': html_content}} + } + + result = await tab.to_markdown(skip_nav=False) + + assert 'Navigation' in result + assert 'Main content' in result + + @pytest.mark.asyncio + async def test_to_markdown_skip_images(self, tab): + """Test skipping images when skip_images=True.""" + html_content = '

Text

Image' + tab._connection_handler.execute_command.return_value = { + 'result': {'result': {'value': html_content}} + } + + result = await tab.to_markdown(skip_images=True) + + assert 'Text' in result + assert '![' not in result + assert 'test.png' not in result + + @pytest.mark.asyncio + async def test_to_markdown_include_images(self, tab): + """Test including images when skip_images=False (default).""" + html_content = 'Test Image' + tab._connection_handler.execute_command.return_value = { + 'result': {'result': {'value': html_content}} + } + + result = await tab.to_markdown(skip_images=False) + + assert '![Test Image](test.png)' in result + + @pytest.mark.asyncio + async def test_to_markdown_custom_options(self, tab): + """Test passing custom options to converter.""" + html_content = '
code
' + tab._connection_handler.execute_command.return_value = { + 'result': {'result': {'value': html_content}} + } + + result = await tab.to_markdown(code_fence='~~~') + + assert '~~~' in result + assert '```' not in result + + @pytest.mark.asyncio + async def test_to_markdown_invalid_result_structure_keyerror(self, tab): + """Test ValueError raised when result has unexpected structure (KeyError).""" + # Missing 'result' key in response + tab._connection_handler.execute_command.return_value = {} + + with pytest.raises(ValueError, match='Failed to extract HTML from page'): + await tab.to_markdown() + + @pytest.mark.asyncio + async def test_to_markdown_invalid_result_structure_typeerror(self, tab): + """Test ValueError raised when result has None values (TypeError).""" + # Value is None instead of expected dict + tab._connection_handler.execute_command.return_value = { + 'result': {'result': None} + } + + with pytest.raises(ValueError, match='Failed to extract HTML from page'): + await tab.to_markdown() + + @pytest.mark.asyncio + async def test_to_markdown_complex_html(self, tab): + """Test conversion of complex HTML with multiple elements.""" + html_content = ''' + + +

Main Title

+

Introduction with bold and italic.

+ + Link + + + ''' + tab._connection_handler.execute_command.return_value = { + 'result': {'result': {'value': html_content}} + } + + result = await tab.to_markdown() + + assert '# Main Title' in result + assert '**bold**' in result + assert '*italic*' in result + assert '- Item 1' in result + assert '- Item 2' in result + assert '[Link](https://example.com)' in result + + class TestTabScreenshotAndPDF: """Test Tab screenshot and PDF methods.""" diff --git a/tests/test_markdown.py b/tests/test_markdown.py new file mode 100644 index 00000000..3370be08 --- /dev/null +++ b/tests/test_markdown.py @@ -0,0 +1,564 @@ +""" +Tests for the HTMLtoMarkdown converter in pydoll.exporters.markdown. + +""" + +import pytest + +from pydoll.exporters.markdown import HTMLtoMarkdown + + +class TestHTMLtoMarkdownBasics: + """Basic HTML to Markdown conversion tests.""" + + def test_convert_empty_string(self): + """Test conversion of empty HTML string.""" + converter = HTMLtoMarkdown() + result = converter.convert('') + assert result == '' + + def test_convert_plain_text(self): + """Test conversion of plain text without HTML tags.""" + converter = HTMLtoMarkdown() + result = converter.convert('Hello World') + assert result == 'Hello World' + + def test_convert_whitespace_normalization(self): + """Test that excessive whitespace is normalized.""" + converter = HTMLtoMarkdown() + result = converter.convert('Hello World') + assert result == 'Hello World' + + +class TestHTMLtoMarkdownHeadings: + """Tests for heading conversion.""" + + def test_convert_h1(self): + """Test conversion of h1 heading.""" + converter = HTMLtoMarkdown() + result = converter.convert('

Title

') + assert result == '# Title' + + def test_convert_h2(self): + """Test conversion of h2 heading.""" + converter = HTMLtoMarkdown() + result = converter.convert('

Subtitle

') + assert result == '## Subtitle' + + def test_convert_h3(self): + """Test conversion of h3 heading.""" + converter = HTMLtoMarkdown() + result = converter.convert('

Section

') + assert result == '### Section' + + def test_convert_h4(self): + """Test conversion of h4 heading.""" + converter = HTMLtoMarkdown() + result = converter.convert('

Subsection

') + assert result == '#### Subsection' + + def test_convert_h5(self): + """Test conversion of h5 heading.""" + converter = HTMLtoMarkdown() + result = converter.convert('
Minor Section
') + assert result == '##### Minor Section' + + def test_convert_h6(self): + """Test conversion of h6 heading.""" + converter = HTMLtoMarkdown() + result = converter.convert('
Small Section
') + assert result == '###### Small Section' + + def test_convert_multiple_headings(self): + """Test conversion of multiple headings.""" + converter = HTMLtoMarkdown() + html = '

Main

Sub

Section

' + result = converter.convert(html) + assert '# Main' in result + assert '## Sub' in result + assert '### Section' in result + + +class TestHTMLtoMarkdownParagraphs: + """Tests for paragraph and text conversion.""" + + def test_convert_paragraph(self): + """Test conversion of paragraph element.""" + converter = HTMLtoMarkdown() + result = converter.convert('

This is a paragraph.

') + assert result == 'This is a paragraph.' + + def test_convert_multiple_paragraphs(self): + """Test conversion of multiple paragraphs with proper spacing.""" + converter = HTMLtoMarkdown() + html = '

First paragraph.

Second paragraph.

' + result = converter.convert(html) + assert 'First paragraph.' in result + assert 'Second paragraph.' in result + # Should have proper separation + assert '\n\n' in result + + def test_convert_line_break(self): + """Test conversion of br element.""" + converter = HTMLtoMarkdown() + result = converter.convert('Line one
Line two') + assert 'Line one\nLine two' == result + + +class TestHTMLtoMarkdownFormatting: + """Tests for text formatting conversion.""" + + def test_convert_bold_strong(self): + """Test conversion of strong tag to bold.""" + converter = HTMLtoMarkdown() + result = converter.convert('bold text') + assert result == '**bold text**' + + def test_convert_bold_b(self): + """Test conversion of b tag to bold.""" + converter = HTMLtoMarkdown() + result = converter.convert('bold text') + assert result == '**bold text**' + + def test_convert_italic_em(self): + """Test conversion of em tag to italic.""" + converter = HTMLtoMarkdown() + result = converter.convert('italic text') + assert result == '*italic text*' + + def test_convert_italic_i(self): + """Test conversion of i tag to italic.""" + converter = HTMLtoMarkdown() + result = converter.convert('italic text') + assert result == '*italic text*' + + def test_convert_inline_code(self): + """Test conversion of inline code element.""" + converter = HTMLtoMarkdown() + result = converter.convert('Use the print() function') + assert result == 'Use the `print()` function' + + def test_convert_nested_formatting(self): + """Test conversion of nested formatting elements.""" + converter = HTMLtoMarkdown() + result = converter.convert('bold and italic') + assert '**' in result + assert '*' in result + assert 'bold and italic' in result + + +class TestHTMLtoMarkdownLinks: + """Tests for link conversion.""" + + def test_convert_link(self): + """Test conversion of anchor tag to markdown link.""" + converter = HTMLtoMarkdown() + result = converter.convert('Example') + assert result == '[Example](https://example.com)' + + def test_convert_link_without_href(self): + """Test conversion of anchor tag without href.""" + converter = HTMLtoMarkdown() + result = converter.convert('No Link') + assert result == '[No Link]()' + + def test_convert_link_with_formatted_text(self): + """Test conversion of link with formatted text.""" + converter = 
HTMLtoMarkdown() + result = converter.convert('Bold Link') + assert 'Bold Link' in result + assert 'https://example.com' in result + + +class TestHTMLtoMarkdownImages: + """Tests for image conversion.""" + + def test_convert_image(self): + """Test conversion of img tag to markdown image.""" + converter = HTMLtoMarkdown() + result = converter.convert('Test Image') + assert result == '![Test Image](image.png)' + + def test_convert_image_without_alt(self): + """Test conversion of img tag without alt text.""" + converter = HTMLtoMarkdown() + result = converter.convert('') + assert result == '![](image.png)' + + def test_skip_images_option(self): + """Test that images are skipped when skip_images is True.""" + converter = HTMLtoMarkdown(skip_images=True) + result = converter.convert('

Text

Image') + assert '![' not in result + assert 'image.png' not in result + assert 'Text' in result + + def test_skip_images_only_image(self): + """Test skip_images when only an image is present.""" + converter = HTMLtoMarkdown(skip_images=True) + result = converter.convert('Test') + assert result == '' + assert '![' not in result + + def test_skip_images_multiple_images(self): + """Test that all images are skipped when skip_images is True.""" + converter = HTMLtoMarkdown(skip_images=True) + html = '

Text

' + result = converter.convert(html) + assert '![' not in result + assert '1.png' not in result + assert '2.png' not in result + assert '3.png' not in result + assert 'Text' in result + + +class TestHTMLtoMarkdownLists: + """Tests for list conversion.""" + + def test_convert_unordered_list(self): + """Test conversion of unordered list.""" + converter = HTMLtoMarkdown() + html = '' + result = converter.convert(html) + assert '- Item 1' in result + assert '- Item 2' in result + assert '- Item 3' in result + + def test_convert_ordered_list(self): + """Test conversion of ordered list.""" + converter = HTMLtoMarkdown() + html = '
  1. First
  2. Second
  3. Third
' + result = converter.convert(html) + assert '1. First' in result + assert '2. Second' in result + assert '3. Third' in result + + def test_convert_nested_lists(self): + """Test conversion of nested lists.""" + converter = HTMLtoMarkdown() + html = ''' + + ''' + result = converter.convert(html) + assert '- Parent' in result + assert 'Child 1' in result + assert 'Child 2' in result + + +class TestHTMLtoMarkdownCodeBlocks: + """Tests for code block conversion.""" + + def test_convert_preformatted_code(self): + """Test conversion of pre/code block.""" + converter = HTMLtoMarkdown() + html = '
def hello():\n    print("Hello")
' + result = converter.convert(html) + assert '```' in result + assert 'def hello():' in result + assert 'print("Hello")' in result + + def test_convert_pre_block_preserves_whitespace(self): + """Test that pre blocks preserve whitespace.""" + converter = HTMLtoMarkdown() + html = '
Line 1\n    Line 2 indented
' + result = converter.convert(html) + assert 'Line 1\n Line 2 indented' in result + + def test_custom_code_fence(self): + """Test conversion with custom code fence.""" + converter = HTMLtoMarkdown(code_fence='~~~') + html = '
code here
' + result = converter.convert(html) + assert '~~~' in result + assert '```' not in result + + +class TestHTMLtoMarkdownBlockquotes: + """Tests for blockquote conversion.""" + + def test_convert_blockquote(self): + """Test conversion of blockquote element.""" + converter = HTMLtoMarkdown() + result = converter.convert('
Quoted text
') + assert '> Quoted text' in result + + +class TestHTMLtoMarkdownTables: + """Tests for table conversion.""" + + def test_convert_simple_table(self): + """Test conversion of simple table.""" + converter = HTMLtoMarkdown() + html = ''' + + + + +
NameAge
Alice25
Bob30
+ ''' + result = converter.convert(html) + assert '| Name | Age |' in result + assert '|---|---|' in result + assert '| Alice | 25 |' in result + assert '| Bob | 30 |' in result + + def test_convert_table_uneven_columns(self): + """Test conversion of table with uneven columns.""" + converter = HTMLtoMarkdown() + html = ''' + + + +
ABC
12
+ ''' + result = converter.convert(html) + # Should pad shorter rows + assert '| A | B | C |' in result + assert '| 1 | 2 |' in result + + def test_convert_empty_table(self): + """Test conversion of empty table element.""" + converter = HTMLtoMarkdown() + html = '
' + result = converter.convert(html) + # Empty table should not produce table output + assert '|' not in result + + def test_convert_table_no_rows(self): + """Test conversion of table with no tr elements.""" + converter = HTMLtoMarkdown() + html = '
Empty
' + result = converter.convert(html) + # Should not crash and not produce table syntax + assert '|---' not in result + + def test_convert_table_empty_rows(self): + """Test conversion of table with empty rows.""" + converter = HTMLtoMarkdown() + html = '
' + result = converter.convert(html) + # Empty row should not be added + assert '|---' not in result + + +class TestHTMLtoMarkdownSkipTags: + """Tests for skipped tags functionality.""" + + def test_skip_script_tags(self): + """Test that script tags are skipped.""" + converter = HTMLtoMarkdown() + html = '

Text

' + result = converter.convert(html) + assert 'alert' not in result + assert 'Text' in result + + def test_skip_style_tags(self): + """Test that style tags are skipped.""" + converter = HTMLtoMarkdown() + html = '

Text

' + result = converter.convert(html) + assert 'color' not in result + assert '.class' not in result + assert 'Text' in result + + def test_skip_noscript_tags(self): + """Test that noscript tags are skipped.""" + converter = HTMLtoMarkdown() + html = '

Text

' + result = converter.convert(html) + assert 'Enable JavaScript' not in result + assert 'Text' in result + + def test_skip_svg_tags(self): + """Test that svg tags are skipped.""" + converter = HTMLtoMarkdown() + html = '

Text

' + result = converter.convert(html) + assert 'circle' not in result + assert 'Text' in result + + +class TestHTMLtoMarkdownNavSkipping: + """Tests for navigation element skipping.""" + + def test_skip_nav_by_default(self): + """Test that nav elements are skipped by default.""" + converter = HTMLtoMarkdown() + html = '

Main content

' + result = converter.convert(html) + assert 'Navigation content' not in result + assert 'Main content' in result + + def test_skip_header_by_default(self): + """Test that header elements are skipped by default.""" + converter = HTMLtoMarkdown() + html = '
Header content

Main content

' + result = converter.convert(html) + assert 'Header content' not in result + assert 'Main content' in result + + def test_skip_footer_by_default(self): + """Test that footer elements are skipped by default.""" + converter = HTMLtoMarkdown() + html = '

Main content

' + result = converter.convert(html) + assert 'Footer content' not in result + assert 'Main content' in result + + def test_skip_aside_by_default(self): + """Test that aside elements are skipped by default.""" + converter = HTMLtoMarkdown() + html = '

Main content

' + result = converter.convert(html) + assert 'Sidebar content' not in result + assert 'Main content' in result + + def test_include_nav_when_skip_nav_false(self): + """Test that nav elements are included when skip_nav is False.""" + converter = HTMLtoMarkdown(skip_nav=False) + html = '

Main content

' + result = converter.convert(html) + assert 'Navigation content' in result + assert 'Main content' in result + + def test_include_header_when_skip_nav_false(self): + """Test that header elements are included when skip_nav is False.""" + converter = HTMLtoMarkdown(skip_nav=False) + html = '
Header content

Main content

' + result = converter.convert(html) + assert 'Header content' in result + assert 'Main content' in result + + +class TestHTMLtoMarkdownComplexHtml: + """Tests for complex HTML structures.""" + + def test_convert_full_page_structure(self): + """Test conversion of a full page structure.""" + converter = HTMLtoMarkdown() + html = ''' + + + Test Page + + +

Main Title

+

Introduction paragraph.

+

Section

+

Content with bold and italic.

+ + + + + ''' + result = converter.convert(html) + assert '# Main Title' in result + assert '## Section' in result + assert '**bold**' in result + assert '*italic*' in result + assert '- Item 1' in result + assert 'Menu' not in result # nav skipped + assert 'Copyright' not in result # footer skipped + + def test_convert_nested_divs(self): + """Test conversion of nested div elements.""" + converter = HTMLtoMarkdown() + html = '

Nested content

' + result = converter.convert(html) + assert 'Nested content' in result + + def test_convert_mixed_content(self): + """Test conversion of mixed inline and block content.""" + converter = HTMLtoMarkdown() + html = ''' +

Start bold and italic end.

+
code block
+ ''' + result = converter.convert(html) + assert 'Start' in result + assert '**' in result + assert '*' in result + assert '```' in result + assert 'code block' in result + + +class TestHTMLtoMarkdownEdgeCases: + """Tests for edge cases and special scenarios.""" + + def test_empty_tags(self): + """Test conversion of empty tags.""" + converter = HTMLtoMarkdown() + result = converter.convert('

') + # Should not crash and return empty or minimal content + assert isinstance(result, str) + + def test_self_closing_tags(self): + """Test conversion of self-closing tags.""" + converter = HTMLtoMarkdown() + result = converter.convert('
') + assert isinstance(result, str) + + def test_excessive_newlines_normalized(self): + """Test that excessive newlines are normalized to double newlines.""" + converter = HTMLtoMarkdown() + html = '

First

Second

' + result = converter.convert(html) + # Should not have more than 2 consecutive newlines + assert '\n\n\n' not in result + + def test_special_characters_in_text(self): + """Test that special characters in text are preserved.""" + converter = HTMLtoMarkdown() + html = '

Special chars: & < > "

' + result = converter.convert(html) + # HTML entities should be preserved or decoded + assert 'Special chars:' in result + + def test_unicode_content(self): + """Test conversion of unicode content.""" + converter = HTMLtoMarkdown() + html = '

Unicode: 你好 مرحبا שלום 🚀

' + result = converter.convert(html) + assert '你好' in result + assert '🚀' in result + + def test_deeply_nested_skipped_tags(self): + """Test that deeply nested content in skipped tags is ignored.""" + converter = HTMLtoMarkdown() + html = ''' + +

Visible

+ ''' + result = converter.convert(html) + assert 'Should not appear' not in result + assert 'Visible' in result + + def test_unclosed_tags(self): + """Test that unclosed tags are handled gracefully.""" + converter = HTMLtoMarkdown() + # HTMLParser handles unclosed tags, test it doesn't crash + html = '

Unclosed paragraphBold without close' + result = converter.convert(html) + assert 'Unclosed paragraph' in result + + def test_multiple_conversions_same_instance(self): + """Test that same converter instance can be reused for multiple conversions.""" + converter = HTMLtoMarkdown() + result1 = converter.convert('

First conversion

') + # Note: Due to how the converter works, a new instance is recommended + # for each conversion, but this tests the current behavior + assert 'First conversion' in result1