Skip to content

Commit 067baef

Browse files
committed
Support $WITH_EXTENSION to control building the C tokenizer
1 parent 68aebb0 commit 067baef

File tree

7 files changed

+141
-31
lines changed

7 files changed

+141
-31
lines changed

CHANGELOG

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
v0.7.1 (unreleased):
22

3-
- Modernized packaging.
3+
- Modernize packaging.
4+
- Explicitly support $WITH_EXTENSION/$WITHOUT_EXTENSION to control whether the
5+
C tokenizer should be built.
46

57
v0.7.0 (released June 28, 2025):
68

README.rst

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,13 @@ Installation
1818
The easiest way to install the parser is from `PyPI`_; you can install the
1919
latest release with ``pip install mwparserfromhell``.
2020

21-
Alternatively, get the latest development version:
21+
Prebuilt wheels are available on PyPI with a fast, compiled C tokenizer
22+
extension for most environments (Linux x86_64 and arm64, macOS x86_64 and
23+
arm64, Windows x86 and x86_64). If building from source and the C tokenizer
24+
cannot be built, you can fall back to the slower pure-Python implementation by
25+
setting the environment variable ``WITH_EXTENSION=0`` when installing.
26+
27+
To get the latest development version (with `uv`_):
2228

2329
.. code-block:: sh
2430
@@ -223,6 +229,7 @@ Python 3 code (using the API_ and the requests_ library):
223229
.. _Legoktm: https://en.wikipedia.org/wiki/User:Legoktm
224230
.. _GitHub: https://github.com/earwig/mwparserfromhell
225231
.. _PyPI: https://pypi.org/project/mwparserfromhell/
232+
.. _uv: https://docs.astral.sh/uv/
226233
.. _pytest: https://docs.pytest.org/
227234
.. _Word-ending links: https://www.mediawiki.org/wiki/Help:Links#linktrail
228235
.. _EarwigBot: https://github.com/earwig/earwigbot

docs/changelog.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ v0.7.1
77
Unreleased
88
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.7.0...main>`__):
99

10-
- Modernized packaging.
10+
- Modernize packaging.
11+
- Explicitly support ``$WITH_EXTENSION`` / ``$WITHOUT_EXTENSION`` to control
12+
whether the C tokenizer should be built.
1113

1214
v0.7.0
1315
------

docs/index.rst

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,13 @@ Installation
2020
The easiest way to install the parser is from `PyPI`_; you can install the
2121
latest release with ``pip install mwparserfromhell``.
2222

23-
Alternatively, get the latest development version::
23+
Prebuilt wheels are available on PyPI with a fast, compiled C tokenizer
24+
extension for most environments (Linux x86_64 and arm64, macOS x86_64 and
25+
arm64, Windows x86 and x86_64). If building from source and the C tokenizer
26+
cannot be built, you can fall back to the slower pure-Python implementation by
27+
setting the environment variable ``WITH_EXTENSION=0`` when installing.
28+
29+
To get the latest development version (with `uv`_)::
2430

2531
git clone https://github.com/earwig/mwparserfromhell.git
2632
cd mwparserfromhell
@@ -32,7 +38,14 @@ The comprehensive test suite can be run with ``pytest``. If using ``uv``, pass
3238

3339
uv run --reinstall-package mwparserfromhell pytest
3440

41+
.. note::
42+
43+
To see if the fast C tokenizer is being used, check the value of
44+
``mwparserfromhell.parser.use_c``. If ``True``, it's being used; if
45+
``False``, the Python fallback is being used.
46+
3547
.. _PyPI: https://pypi.org/project/mwparserfromhell/
48+
.. _uv: https://docs.astral.sh/uv/
3649
.. _pytest: https://docs.pytest.org/
3750

3851
Contents

pyproject.toml

Lines changed: 2 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -38,43 +38,18 @@ dev = [
3838
"psutil",
3939
"pytest",
4040
"pytest-cov",
41+
"setuptools",
4142
"sphinx",
4243
]
4344

4445
[build-system]
4546
requires = ["setuptools >= 77.0.3", "setuptools-scm>=8"]
4647
build-backend = "setuptools.build_meta"
4748

48-
[[tool.setuptools.ext-modules]]
49-
name = "mwparserfromhell.parser._tokenizer"
50-
sources = [
51-
"src/mwparserfromhell/parser/ctokenizer/avl_tree.c",
52-
"src/mwparserfromhell/parser/ctokenizer/definitions.c",
53-
"src/mwparserfromhell/parser/ctokenizer/tag_data.c",
54-
"src/mwparserfromhell/parser/ctokenizer/textbuffer.c",
55-
"src/mwparserfromhell/parser/ctokenizer/tok_parse.c",
56-
"src/mwparserfromhell/parser/ctokenizer/tok_support.c",
57-
"src/mwparserfromhell/parser/ctokenizer/tokenizer.c",
58-
"src/mwparserfromhell/parser/ctokenizer/tokens.c",
59-
]
60-
depends = [
61-
"src/mwparserfromhell/parser/ctokenizer/avl_tree.h",
62-
"src/mwparserfromhell/parser/ctokenizer/common.h",
63-
"src/mwparserfromhell/parser/ctokenizer/contexts.h",
64-
"src/mwparserfromhell/parser/ctokenizer/definitions.h",
65-
"src/mwparserfromhell/parser/ctokenizer/tag_data.h",
66-
"src/mwparserfromhell/parser/ctokenizer/textbuffer.h",
67-
"src/mwparserfromhell/parser/ctokenizer/tok_parse.h",
68-
"src/mwparserfromhell/parser/ctokenizer/tok_support.h",
69-
"src/mwparserfromhell/parser/ctokenizer/tokenizer.h",
70-
"src/mwparserfromhell/parser/ctokenizer/tokens.h",
71-
]
72-
include-dirs = ["lib"]
73-
optional = true
74-
7549
[tool.setuptools_scm]
7650

7751
[tool.cibuildwheel]
52+
environment = { WITH_EXTENSION = "1" }
7853
environment-pass = ["RUNNER_OS"]
7954
test-groups = ["dev"]
8055
test-command = [

setup.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#! /usr/bin/env python
2+
#
3+
# Copyright (C) 2012-2025 Ben Kurtovic <[email protected]>
4+
#
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
#
12+
# The above copyright notice and this permission notice shall be included in
13+
# all copies or substantial portions of the Software.
14+
#
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
from __future__ import annotations
24+
25+
import glob
26+
import os
27+
import sys
28+
from enum import Enum
29+
30+
from setuptools import Extension, setup
31+
from setuptools.command.build_ext import build_ext
32+
33+
34+
class UseExtension(Enum):
    """How the C tokenizer extension is treated when building the package."""

    # Declared as a normal (non-optional) extension module.
    REQUIRED = 1
    # Declared with ``optional=True`` on the Extension.
    OPTIONAL = 2
    # Not declared at all; only the pure-Python tokenizer is installed.
    IGNORED = 3


def detect_use_extension(environ) -> UseExtension:
    """Determine the extension build mode from environment variables.

    ``$WITH_EXTENSION`` accepts ``1``/``true``/``yes``/``required`` (build the
    extension), ``optional``, or ``0``/``false``/``no`` (skip it).
    ``$WITHOUT_EXTENSION`` is the inverse switch and accepts only the boolean
    spellings. Values are compared case-insensitively and surrounding
    whitespace is ignored. If neither variable is set, the extension is
    required.

    :param environ: mapping of environment variable names to string values,
        normally ``os.environ``.
    :returns: the selected :class:`UseExtension` member.
    :raises RuntimeError: if both variables are set, or if a value is not one
        of the recognized spellings.
    """
    if "WITH_EXTENSION" in environ:
        if "WITHOUT_EXTENSION" in environ:
            raise RuntimeError(
                "Cannot set both $WITH_EXTENSION and $WITHOUT_EXTENSION"
            )
        value = environ["WITH_EXTENSION"].strip().lower()
        if value in ("1", "true", "yes", "required"):
            return UseExtension.REQUIRED
        if value == "optional":
            return UseExtension.OPTIONAL
        if value in ("0", "false", "no"):
            return UseExtension.IGNORED
        raise RuntimeError(
            "Unknown value for $WITH_EXTENSION; should be '1', '0', or 'optional', "
            f"but found {value!r}"
        )

    if "WITHOUT_EXTENSION" in environ:
        value = environ["WITHOUT_EXTENSION"].strip().lower()
        if value in ("1", "true", "yes"):
            return UseExtension.IGNORED
        if value in ("0", "false", "no"):
            return UseExtension.REQUIRED
        raise RuntimeError(
            "Unknown value for $WITHOUT_EXTENSION; should be '1' or '0', "
            f"but found {value!r}"
        )

    # Neither variable is set: default to building the extension.
    return UseExtension.REQUIRED


use_extension = detect_use_extension(os.environ)
68+
69+
if use_extension == UseExtension.IGNORED:
    # Pure-Python install: declare no extension modules at all.
    ext_modules = []
else:
    # glob.glob() returns matches in arbitrary order; sorting keeps the
    # source and header lists deterministic between builds.
    tokenizer = Extension(
        "mwparserfromhell.parser._tokenizer",
        sources=sorted(glob.glob("src/mwparserfromhell/parser/ctokenizer/*.c")),
        depends=sorted(glob.glob("src/mwparserfromhell/parser/ctokenizer/*.h")),
        # Only the "optional" mode marks the extension as optional.
        optional=use_extension == UseExtension.OPTIONAL,
    )
    ext_modules = [tokenizer]
79+
80+
81+
def build_ext_patched(self):
    """Run the original ``build_ext.run`` and print a hint if it fails.

    Any exception raised by the wrapped call is re-raised unchanged after a
    note about ``WITH_EXTENSION=0`` has been written to standard error.

    :param self: the ``build_ext`` command instance this is bound to.
    """
    try:
        build_ext_original(self)
    except Exception:
        # The message lines stay at column 0 so the printed note carries no
        # leading indentation.
        print(
            """
**********
Note: To avoid building the C tokenizer extension, set the environment variable \
`WITH_EXTENSION=0`.
This will fall back to a pure-Python tokenizer.
**********
""",
            file=sys.stderr,
        )
        raise
96+
97+
98+
# Keep a reference to the stock implementation before replacing it, so that
# build_ext_patched can delegate to it.
build_ext_original = build_ext.run
build_ext.run = build_ext_patched

setup(ext_modules=ext_modules)

uv.lock

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)