diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 64d4a2f..ec790fe 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -24,7 +24,6 @@ jobs:
       matrix:
         tox_env:
           - py36
-          - py37
           - py38
           - py39
           - py310
diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py
index 37cf3e0..3eeda47 100644
--- a/lxml_html_clean/clean.py
+++ b/lxml_html_clean/clean.py
@@ -90,15 +90,19 @@ def _has_javascript_scheme(s):
 # - 0A - Line Feed
 # - 0B - Vertical tab
 # - 0D - Carriage Return
-_ascii_control_characters = re.compile(r"[\x00-\x08\x0C\x0E-\x1F\x7F]")
+_ascii_control_characters_str = re.compile("[\x00-\x08\x0C\x0E-\x1F\x7F]")
+_ascii_control_characters_bytes = re.compile(b"[\x00-\x08\x0C\x0E-\x1F\x7F]")
 
 
-def fromstring(string):
+def fromstring(data):
     """
     Enhanced fromstring function that removes ASCII control chars
     before passing the input to the original lxml.html.fromstring.
     """
-    return lxml_fromstring(_ascii_control_characters.sub("", string))
+    if isinstance(data, bytes):
+        return lxml_fromstring(_ascii_control_characters_bytes.sub(b"", data))
+    else:
+        return lxml_fromstring(_ascii_control_characters_str.sub("", data))
 
 
 # This regular expression is inspired by the one in urllib3.
diff --git a/tests/test_clean.py b/tests/test_clean.py
index b22548b..11cc102 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -355,6 +355,12 @@ def test_ascii_control_chars_removed(self):
         cleaner = Cleaner()
         self.assertEqual(expected, cleaner.clean_html(html))
 
+    def test_ascii_control_chars_removed_from_bytes(self):
+        html = b"""Link"""
+        expected = b"""Link"""
+        cleaner = Cleaner()
+        self.assertEqual(expected, cleaner.clean_html(html))
+
     def test_memory_usage_many_elements_with_long_tails(self):
         comment = "\n"
         empty_line = "\t" * 10 + "\n"
diff --git a/tox.ini b/tox.ini
index ce95032..8706ef5 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py36,py37,py38,py39,py310,py311,py312,py313,mypy
+envlist = py36,py38,py39,py310,py311,py312,py313,mypy
 skipsdist = True
 
 [testenv]