Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions pydoll/browser/tab.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
TopLevelTargetRequired,
WaitElementTimeout,
)
from pydoll.exporters.markdown import HTMLtoMarkdown
from pydoll.interactions import KeyboardAPI, ScrollAPI
from pydoll.protocol.browser.types import DownloadBehavior, DownloadProgressState
from pydoll.protocol.page.events import PageEvent
Expand Down Expand Up @@ -568,6 +569,32 @@ async def go_to(self, url: str, timeout: int = 300):
logger.error(f'Page load timeout after {timeout}s for URL: {url}')
raise PageLoadTimeout()

async def to_markdown(
    self,
    skip_nav: bool = True,
    skip_images: bool = False,
    **options,
) -> str:
    """
    Render the HTML of the current page as Markdown.

    The page markup is read by running
    `return document.documentElement.outerHTML` through `execute_script`,
    then handed to an `HTMLtoMarkdown` converter.

    Args:
        skip_nav: Skip navigation elements (nav, aside, header, footer)
        skip_images: Skip image elements
        **options: Additional keyword options forwarded to the converter

    Returns:
        Markdown representation of the page

    Raises:
        ValueError: If the script response does not contain the nested
            `result -> result -> value` entry the HTML is read from.
    """
    result = await self.execute_script('return document.documentElement.outerHTML')

    # The HTML string is nested three levels deep in the script response;
    # a missing key or a non-subscriptable level is reported as ValueError.
    try:
        page_html = result['result']['result']['value']
    except (KeyError, TypeError) as lookup_error:
        message = f'Failed to extract HTML from page. Unexpected result structure: {type(result)}'
        raise ValueError(message) from lookup_error

    markdown_exporter = HTMLtoMarkdown(skip_nav=skip_nav, skip_images=skip_images, **options)
    return markdown_exporter.convert(page_html)

async def refresh(
self,
ignore_cache: bool = False,
Expand Down
Empty file added pydoll/exporters/__init__.py
Empty file.
Loading
Loading