From b9ce5c88071825605d8e599d0e6d43c0e02f8fd4 Mon Sep 17 00:00:00 2001 From: Miel Vander Sande Date: Thu, 13 Nov 2025 09:17:23 +0100 Subject: [PATCH 1/4] Move unresolve function --- lib/pyld/iri_resolver.py | 84 +++++++++++++++++++++++++++++++++++++- tests/test_iri_resolver.py | 46 ++++++++++++++++++++- 2 files changed, 127 insertions(+), 3 deletions(-) diff --git a/lib/pyld/iri_resolver.py b/lib/pyld/iri_resolver.py index a20d0f3..38fbd4a 100644 --- a/lib/pyld/iri_resolver.py +++ b/lib/pyld/iri_resolver.py @@ -1,7 +1,12 @@ """ -The functions 'remove_dot_segments()', 'resolve()' and 'is_character_allowed_after_relative_path_segment()' are direct ports from [relative-to-absolute-iri.js](https://github.com/rubensworks/relative-to-absolute-iri.js) +- The functions 'remove_dot_segments()', 'resolve()' and 'is_character_allowed_after_relative_path_segment()' are direct ports from [relative-to-absolute-iri.js](https://github.com/rubensworks/relative-to-absolute-iri.js) (c) Ruben Taelman +- The 'unresolve()' function is a move and rename of the 'remove_base()' function from 'jsonld.py' """ +from collections import namedtuple +import re + + def is_character_allowed_after_relative_path_segment(ch: str) -> bool: """Return True if a character is valid after '.' or '..' in a path segment.""" return not ch or ch in ('#', '?', '/') @@ -204,4 +209,79 @@ def resolve(relative_iri: str, base_iri: str = None) -> str: relative_iri = base_path + relative_iri relative_iri = remove_dot_segments(relative_iri) - return base_iri[:base_slash_after_colon_pos] + relative_iri \ No newline at end of file + return base_iri[:base_slash_after_colon_pos] + relative_iri + +def unresolve(absolute_iri: str, base_iri: str = ""): + """ + Unresolves a given absolute IRI to an IRI relative to the given base IRI. + + :param base: the base IRI. + :param iri: the absolute IRI. + + :return: the relative IRI if relative to base, otherwise the absolute IRI. 
+ """ + # TODO: better sync with jsonld.js version + # skip IRI processing + if base_iri is None: + return absolute_iri + + base = parse_url(base_iri) + rel = parse_url(absolute_iri) + + # schemes and network locations (authorities) don't match, don't alter IRI + if not (base.scheme == rel.scheme and base.authority == rel.authority): + return absolute_iri + + # remove path segments that match (do not remove last segment unless there + # is a hash or query + base_segments = remove_dot_segments(base.path).split('/') + iri_segments = remove_dot_segments(rel.path).split('/') + last = 0 if (rel.fragment or rel.query) else 1 + while (len(base_segments) and len(iri_segments) > last and + base_segments[0] == iri_segments[0]): + base_segments.pop(0) + iri_segments.pop(0) + + # use '../' for each non-matching base segment + rval = '' + if len(base_segments): + # don't count the last segment (if it ends with '/' last path doesn't + # count and if it doesn't end with '/' it isn't a path) + base_segments.pop() + rval += '../' * len(base_segments) + + # prepend remaining segments + rval += '/'.join(iri_segments) + + return unparse_url((None, None, rval, rel.query, rel.fragment)) or './' + +ParsedUrl = namedtuple( + 'ParsedUrl', ['scheme', 'authority', 'path', 'query', 'fragment']) + +def parse_url(url): + # regex from RFC 3986 + p = r'^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?' 
+ m = re.match(p, url) + # remove default http and https ports + g = list(m.groups()) + if ((g[0] == 'https' and g[1].endswith(':443')) or + (g[0] == 'http' and g[1].endswith(':80'))): + g[1] = g[1][:g[1].rfind(':')] + return ParsedUrl(*g) + +def unparse_url(parsed): + if isinstance(parsed, dict): + parsed = ParsedUrl(**parsed) + elif isinstance(parsed, list) or isinstance(parsed, tuple): + parsed = ParsedUrl(*parsed) + rval = '' + if parsed.scheme: + rval += parsed.scheme + ':' + if parsed.authority is not None: + rval += '//' + parsed.authority + rval += parsed.path + if parsed.query is not None: + rval += '?' + parsed.query + if parsed.fragment is not None: + rval += '#' + parsed.fragment + return rval \ No newline at end of file diff --git a/tests/test_iri_resolver.py b/tests/test_iri_resolver.py index 2d3e28f..7db6f6e 100644 --- a/tests/test_iri_resolver.py +++ b/tests/test_iri_resolver.py @@ -1,7 +1,8 @@ import pytest -from pyld.iri_resolver import resolve, remove_dot_segments +from pyld.iri_resolver import resolve, unresolve, remove_dot_segments # Tests ported from relative-to-absolute-iri.js: https://github.com/rubensworks/relative-to-absolute-iri.js/blob/master/test/Resolve-test.ts +# (c) Ruben Taelman # ---------- Tests for resolve() ---------- class TestResolve: @@ -275,6 +276,49 @@ def test_relative_with_triple_dot_segment_and_2x_double_dot_and_base(self): def test_questionmark_prefix_relative_with_complex_base_with_dot(self): assert resolve('?y','http://a/bb/ccc/./d;p?q') == 'http://a/bb/ccc/./d;p?y' +# ---------- Tests for unresolve() ---------- +class TestUnresolve: + def test_absolute_iri_no_base(self): + assert unresolve('http://example.org/') == 'http://example.org/' + + def test_absolute_iri_empty_base(self): + assert unresolve('http://example.org/', '') == 'http://example.org/' + + def test_absolute_iri_with_base(self): + assert unresolve('http://example.org/', 'http://base.org/') == 'http://example.org/' + + def 
test_empty_value_uses_base(self): + assert unresolve('', 'http://base.org/') == '' + + def test_absolute_with_base(self): + assert unresolve('http://base.org/abc', 'http://base.org/') == 'abc' + + def test_absolute_with_fragment_base(self): + assert unresolve('http://base.org/abc', 'http://base.org/#frag') == 'abc' + + def test_hash_absolute(self): + assert unresolve('http://base.org/#abc', 'http://base.org/') == '#abc' + + def test_colon_in_value_ignores_base(self): + assert unresolve('http:abc', 'http://base.org/') == 'http:abc' + + def test_colon_in_value_removes_dots(self): + assert unresolve('http://abc/../../', 'http://base.org/') == 'http://abc/' + + # def test_non_absolute_base_error(self): + # with pytest.raises(ValueError, match=r"Found invalid baseIRI 'def' for value 'abc'"): + # unresolve('abc', 'def') + + # def test_non_absolute_base_empty_value_error(self): + # with pytest.raises(ValueError, match=r"Found invalid baseIRI 'def' for value ''"): + # unresolve('', 'def') + + def test_base_without_path_slash(self): + assert unresolve('http://base.org/abc', 'http://base.org') == 'abc' + + def test_base_with_path_slash(self): + assert unresolve('http://base.org/abc/', 'http://base.org') == 'abc/' + # ---------- Tests for remove_dot_segments() ---------- class TestRemoveDotSegments: def test_no_slash(self): From e54c9855ee3ac3c633f5c2b37d50e758e7ce7b81 Mon Sep 17 00:00:00 2001 From: Miel Vander Sande Date: Mon, 24 Nov 2025 14:28:58 +0100 Subject: [PATCH 2/4] Remove incorrect test, throw error when base IRI is invalid and improve parsing code --- lib/pyld/iri_resolver.py | 9 +++++++-- tests/test_iri_resolver.py | 15 ++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/lib/pyld/iri_resolver.py b/lib/pyld/iri_resolver.py index 38fbd4a..3fe93a7 100644 --- a/lib/pyld/iri_resolver.py +++ b/lib/pyld/iri_resolver.py @@ -222,10 +222,14 @@ def unresolve(absolute_iri: str, base_iri: str = ""): """ # TODO: better sync with jsonld.js version # 
skip IRI processing - if base_iri is None: + if not base_iri: return absolute_iri base = parse_url(base_iri) + + if not base.scheme: + raise ValueError(f"Found invalid baseIRI '{base_iri}' for value '{absolute_iri}'") + rel = parse_url(absolute_iri) # schemes and network locations (authorities) don't match, don't alter IRI @@ -264,7 +268,8 @@ def parse_url(url): m = re.match(p, url) # remove default http and https ports g = list(m.groups()) - if ((g[0] == 'https' and g[1].endswith(':443')) or + + if g[1] is not None and ((g[0] == 'https' and g[1].endswith(':443')) or (g[0] == 'http' and g[1].endswith(':80'))): g[1] = g[1][:g[1].rfind(':')] return ParsedUrl(*g) diff --git a/tests/test_iri_resolver.py b/tests/test_iri_resolver.py index 7db6f6e..411d5c2 100644 --- a/tests/test_iri_resolver.py +++ b/tests/test_iri_resolver.py @@ -302,16 +302,13 @@ def test_hash_absolute(self): def test_colon_in_value_ignores_base(self): assert unresolve('http:abc', 'http://base.org/') == 'http:abc' - def test_colon_in_value_removes_dots(self): - assert unresolve('http://abc/../../', 'http://base.org/') == 'http://abc/' - - # def test_non_absolute_base_error(self): - # with pytest.raises(ValueError, match=r"Found invalid baseIRI 'def' for value 'abc'"): - # unresolve('abc', 'def') + def test_non_absolute_base_error(self): + with pytest.raises(ValueError, match=r"Found invalid baseIRI 'def' for value 'http://base.org/abc'"): + unresolve('http://base.org/abc', 'def') - # def test_non_absolute_base_empty_value_error(self): - # with pytest.raises(ValueError, match=r"Found invalid baseIRI 'def' for value ''"): - # unresolve('', 'def') + def test_non_absolute_base_empty_value_error(self): + with pytest.raises(ValueError, match=r"Found invalid baseIRI 'def' for value ''"): + unresolve('', 'def') def test_base_without_path_slash(self): assert unresolve('http://base.org/abc', 'http://base.org') == 'abc' From fb397dcf2a6c3acbbcad26f7c6ba369e20be9b86 Mon Sep 17 00:00:00 2001 From: Miel Vander 
Sande Date: Tue, 25 Nov 2025 15:11:04 +0100 Subject: [PATCH 3/4] Adjust jsonld.py to use unresolve --- lib/pyld/jsonld.py | 134 +-------------------------------------------- 1 file changed, 3 insertions(+), 131 deletions(-) diff --git a/lib/pyld/jsonld.py b/lib/pyld/jsonld.py index 5abeceb..94b62b3 100644 --- a/lib/pyld/jsonld.py +++ b/lib/pyld/jsonld.py @@ -31,7 +31,7 @@ from numbers import Integral, Real from frozendict import frozendict from pyld.__about__ import (__copyright__, __license__, __version__) -from .iri_resolver import resolve +from .iri_resolver import parse_url, resolve, unresolve def cmp(a, b): return (a > b) - (a < b) @@ -444,134 +444,6 @@ def unregister_rdf_parser(content_type): del _rdf_parsers[content_type] -def remove_base(base, iri): - """ - Removes a base IRI from the given absolute IRI. - - :param base: the base IRI. - :param iri: the absolute IRI. - - :return: the relative IRI if relative to base, otherwise the absolute IRI. - """ - # TODO: better sync with jsonld.js version - # skip IRI processing - if base is None: - return iri - - base = parse_url(base) - rel = parse_url(iri) - - # schemes and network locations (authorities) don't match, don't alter IRI - if not (base.scheme == rel.scheme and base.authority == rel.authority): - return iri - - # remove path segments that match (do not remove last segment unless there - # is a hash or query - base_segments = remove_dot_segments(base.path).split('/') - iri_segments = remove_dot_segments(rel.path).split('/') - last = 0 if (rel.fragment or rel.query) else 1 - while (len(base_segments) and len(iri_segments) > last and - base_segments[0] == iri_segments[0]): - base_segments.pop(0) - iri_segments.pop(0) - - # use '../' for each non-matching base segment - rval = '' - if len(base_segments): - # don't count the last segment (if it ends with '/' last path doesn't - # count and if it doesn't end with '/' it isn't a path) - base_segments.pop() - rval += '../' * len(base_segments) - - # prepend 
remaining segments - rval += '/'.join(iri_segments) - - return unparse_url((None, None, rval, rel.query, rel.fragment)) or './' - - -def remove_dot_segments(path): - """ - Removes dot segments from a URL path. - - :param path: the path to remove dot segments from. - - :return: a path with normalized dot segments. - """ - - # RFC 3986 5.2.4 (reworked) - - # empty path shortcut - if len(path) == 0: - return '' - - input = path.split('/') - output = [] - - while len(input) > 0: - next = input.pop(0) - done = len(input) == 0 - - if next == '.': - if done: - # ensure output has trailing / - output.append('') - continue - - if next == '..': - if len(output) > 0: - output.pop() - if done: - # ensure output has trailing / - output.append('') - continue - - output.append(next) - - # ensure output has leading / - # merge path segments from section 5.2.3 - # note that if the path includes no segments, the entire path is removed - if len(output) > 0 and path.startswith('/') and output[0] != '': - output.insert(0, '') - if len(output) == 1 and output[0] == '': - return '/' - - return '/'.join(output) - - -ParsedUrl = namedtuple( - 'ParsedUrl', ['scheme', 'authority', 'path', 'query', 'fragment']) - - -def parse_url(url): - # regex from RFC 3986 - p = r'^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?' - m = re.match(p, url) - # remove default http and https ports - g = list(m.groups()) - if ((g[0] == 'https' and g[1].endswith(':443')) or - (g[0] == 'http' and g[1].endswith(':80'))): - g[1] = g[1][:g[1].rfind(':')] - return ParsedUrl(*g) - - -def unparse_url(parsed): - if isinstance(parsed, dict): - parsed = ParsedUrl(**parsed) - elif isinstance(parsed, list) or isinstance(parsed, tuple): - parsed = ParsedUrl(*parsed) - rval = '' - if parsed.scheme: - rval += parsed.scheme + ':' - if parsed.authority is not None: - rval += '//' + parsed.authority - rval += parsed.path - if parsed.query is not None: - rval += '?' 
+ parsed.query - if parsed.fragment is not None: - rval += '#' + parsed.fragment - return rval - - class JsonLdProcessor(object): """ A JSON-LD processor. @@ -4814,9 +4686,9 @@ def _compact_iri( if active_ctx['@base'] is None: return iri else: - return remove_base(resolve(active_ctx['@base'], base), iri) + return unresolve(iri, resolve(active_ctx['@base'], base)) else: - return remove_base(base, iri) + return unresolve(iri, base) # return IRI as is return iri From f480961c13bed43abb8ea3b0d4983ad0d4b22ce5 Mon Sep 17 00:00:00 2001 From: Miel Vander Sande Date: Tue, 25 Nov 2025 22:23:28 +0100 Subject: [PATCH 4/4] Simplify unresolve code by using stdlib url(un)parse --- lib/pyld/iri_resolver.py | 68 ++++++++++++++++++---------------------- lib/pyld/jsonld.py | 5 +-- 2 files changed, 33 insertions(+), 40 deletions(-) diff --git a/lib/pyld/iri_resolver.py b/lib/pyld/iri_resolver.py index 3fe93a7..ab38bb0 100644 --- a/lib/pyld/iri_resolver.py +++ b/lib/pyld/iri_resolver.py @@ -3,8 +3,7 @@ - The 'unresolve()' function is a move and rename of the 'remove_base()' function from 'jsonld.py' """ -from collections import namedtuple -import re +from urllib.parse import urlparse, urlunparse def is_character_allowed_after_relative_path_segment(ch: str) -> bool: @@ -220,20 +219,24 @@ def unresolve(absolute_iri: str, base_iri: str = ""): :return: the relative IRI if relative to base, otherwise the absolute IRI. 
""" - # TODO: better sync with jsonld.js version # skip IRI processing if not base_iri: return absolute_iri - base = parse_url(base_iri) + base = urlparse(base_iri) if not base.scheme: raise ValueError(f"Found invalid baseIRI '{base_iri}' for value '{absolute_iri}'") - - rel = parse_url(absolute_iri) + + # compute authority (netloc) and strip default ports + base_authority = parse_authority(base) + + rel = urlparse(absolute_iri) + # compute authority (netloc) and strip default ports + rel_authority = parse_authority(rel) # schemes and network locations (authorities) don't match, don't alter IRI - if not (base.scheme == rel.scheme and base.authority == rel.authority): + if not (base.scheme == rel.scheme and base_authority == rel_authority): return absolute_iri # remove path segments that match (do not remove last segment unless there @@ -257,36 +260,25 @@ def unresolve(absolute_iri: str, base_iri: str = ""): # prepend remaining segments rval += '/'.join(iri_segments) - return unparse_url((None, None, rval, rel.query, rel.fragment)) or './' - -ParsedUrl = namedtuple( - 'ParsedUrl', ['scheme', 'authority', 'path', 'query', 'fragment']) + # build relative IRI using urlunparse with empty scheme/netloc + return urlunparse(('', '', rval, '', rel.query or '', rel.fragment or '')) or './' -def parse_url(url): - # regex from RFC 3986 - p = r'^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?' 
- m = re.match(p, url) - # remove default http and https ports - g = list(m.groups()) +def parse_authority(parsed_iri) -> str: + """ + Compute authority (netloc) and strip default ports - if g[1] is not None and ((g[0] == 'https' and g[1].endswith(':443')) or - (g[0] == 'http' and g[1].endswith(':80'))): - g[1] = g[1][:g[1].rfind(':')] - return ParsedUrl(*g) - -def unparse_url(parsed): - if isinstance(parsed, dict): - parsed = ParsedUrl(**parsed) - elif isinstance(parsed, list) or isinstance(parsed, tuple): - parsed = ParsedUrl(*parsed) - rval = '' - if parsed.scheme: - rval += parsed.scheme + ':' - if parsed.authority is not None: - rval += '//' + parsed.authority - rval += parsed.path - if parsed.query is not None: - rval += '?' + parsed.query - if parsed.fragment is not None: - rval += '#' + parsed.fragment - return rval \ No newline at end of file + :param parsed_iri: the parsed IRI, as returned by urllib.parse.urlparse(). + :return: the authority (netloc) with a default http (:80) or https (:443) port stripped; None when the IRI has no authority. + :rtype: str + """ + base_authority = parsed_iri.netloc or None + + try: + base_port = parsed_iri.port + except Exception: + base_port = None + + if base_authority is not None and base_port is not None: + if (parsed_iri.scheme == 'https' and base_port == 443) or (parsed_iri.scheme == 'http' and base_port == 80): + base_authority = base_authority.rsplit(':', 1)[0] + return base_authority \ No newline at end of file diff --git a/lib/pyld/jsonld.py b/lib/pyld/jsonld.py index 94b62b3..40201c2 100644 --- a/lib/pyld/jsonld.py +++ b/lib/pyld/jsonld.py @@ -20,6 +20,7 @@ import re import sys import traceback +from urllib.parse import urlparse import warnings import uuid from .context_resolver import ContextResolver @@ -31,7 +32,7 @@ from numbers import Integral, Real from frozendict import frozendict from pyld.__about__ import (__copyright__, __license__, __version__) -from .iri_resolver import parse_url, resolve, unresolve +from .iri_resolver import resolve, unresolve def cmp(a, b): return (a > b) - (a < b) @@ -6445,7 +6446,7 @@ def load_html(input, url, profile, 
options): html_base = resolve(html_base[0], effective_base) options['base'] = html_base - url_elements = parse_url(url) + url_elements = urlparse(url) if url_elements.fragment: # FIXME: CGI decode id = url_elements.fragment