Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ jobs:
matrix:
tox_env:
- py36
- py37
- py38
- py39
- py310
Expand Down
10 changes: 7 additions & 3 deletions lxml_html_clean/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,15 +90,19 @@ def _has_javascript_scheme(s):
# - 0A - Line Feed
# - 0B - Vertical tab
# - 0D - Carriage Return
_ascii_control_characters = re.compile(r"[\x00-\x08\x0C\x0E-\x1F\x7F]")
_ascii_control_characters_str = re.compile("[\x00-\x08\x0C\x0E-\x1F\x7F]")
_ascii_control_characters_bytes = re.compile(b"[\x00-\x08\x0C\x0E-\x1F\x7F]")


def fromstring(string):
def fromstring(data):
"""
Enhanced fromstring function that removes ASCII control chars
before passing the input to the original lxml.html.fromstring.
"""
return lxml_fromstring(_ascii_control_characters.sub("", string))
if isinstance(data, bytes):
return lxml_fromstring(_ascii_control_characters_bytes.sub(b"", data))
else:
return lxml_fromstring(_ascii_control_characters_str.sub("", data))


# This regular expression is inspired by the one in urllib3.
Expand Down
6 changes: 6 additions & 0 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,12 @@ def test_ascii_control_chars_removed(self):
cleaner = Cleaner()
self.assertEqual(expected, cleaner.clean_html(html))

def test_ascii_control_chars_removed_from_bytes(self):
html = b"""<a href="java\x1bscript:alert()">Link</a>"""
expected = b"""<a href="">Link</a>"""
cleaner = Cleaner()
self.assertEqual(expected, cleaner.clean_html(html))

def test_memory_usage_many_elements_with_long_tails(self):
comment = "<!-- foo bar baz -->\n"
empty_line = "\t" * 10 + "\n"
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[tox]
envlist = py36,py37,py38,py39,py310,py311,py312,py313,mypy
envlist = py36,py38,py39,py310,py311,py312,py313,mypy
skipsdist = True

[testenv]
Expand Down