From b9ce5c88071825605d8e599d0e6d43c0e02f8fd4 Mon Sep 17 00:00:00 2001
From: Miel Vander Sande <miel.vandersande@meemoo.be>
Date: Thu, 13 Nov 2025 09:17:23 +0100
Subject: [PATCH 1/4] Move unresolve function

---
 lib/pyld/iri_resolver.py   | 84 +++++++++++++++++++++++++++++++++++++-
 tests/test_iri_resolver.py | 46 ++++++++++++++++++++-
 2 files changed, 127 insertions(+), 3 deletions(-)

diff --git a/lib/pyld/iri_resolver.py b/lib/pyld/iri_resolver.py
index a20d0f3..38fbd4a 100644
--- a/lib/pyld/iri_resolver.py
+++ b/lib/pyld/iri_resolver.py
@@ -1,7 +1,12 @@
 """
-The functions 'remove_dot_segments()', 'resolve()' and 'is_character_allowed_after_relative_path_segment()' are direct ports from [relative-to-absolute-iri.js](https://github.com/rubensworks/relative-to-absolute-iri.js)
+- The functions 'remove_dot_segments()', 'resolve()' and 'is_character_allowed_after_relative_path_segment()' are direct ports from [relative-to-absolute-iri.js](https://github.com/rubensworks/relative-to-absolute-iri.js) (c) Ruben Taelman <ruben.taelman@ugent.be>
+- The 'unresolve()' function is a move and rename of the 'remove_base()' function from 'jsonld.py'
 """
 
+from collections import namedtuple
+import re
+
+
 def is_character_allowed_after_relative_path_segment(ch: str) -> bool:
     """Return True if a character is valid after '.' or '..' in a path segment."""
     return not ch or ch in ('#', '?', '/')
@@ -204,4 +209,79 @@ def resolve(relative_iri: str, base_iri: str = None) -> str:
     relative_iri = base_path + relative_iri
     relative_iri = remove_dot_segments(relative_iri)
 
-    return base_iri[:base_slash_after_colon_pos] + relative_iri
\ No newline at end of file
+    return base_iri[:base_slash_after_colon_pos] + relative_iri
+
+def unresolve(absolute_iri: str, base_iri: str = ""):
+    """
+    Unresolves a given absolute IRI to an IRI relative to the given base IRI.
+
+    :param absolute_iri: the absolute IRI.
+    :param base_iri: the base IRI.
+
+    :return: the relative IRI if relative to base, otherwise the absolute IRI.
+    """
+    # TODO: better sync with jsonld.js version
+    # skip IRI processing
+    if base_iri is None:
+        return absolute_iri
+
+    base = parse_url(base_iri)
+    rel = parse_url(absolute_iri)
+
+    # schemes and network locations (authorities) don't match, don't alter IRI
+    if not (base.scheme == rel.scheme and base.authority == rel.authority):
+        return absolute_iri
+
+    # remove path segments that match (do not remove last segment unless there
+    # is a hash or query)
+    base_segments = remove_dot_segments(base.path).split('/')
+    iri_segments = remove_dot_segments(rel.path).split('/')
+    last = 0 if (rel.fragment or rel.query) else 1
+    while (len(base_segments) and len(iri_segments) > last and
+            base_segments[0] == iri_segments[0]):
+        base_segments.pop(0)
+        iri_segments.pop(0)
+
+    # use '../' for each non-matching base segment
+    rval = ''
+    if len(base_segments):
+        # don't count the last segment (if it ends with '/' last path doesn't
+        # count and if it doesn't end with '/' it isn't a path)
+        base_segments.pop()
+        rval += '../' * len(base_segments)
+
+    # prepend remaining segments
+    rval += '/'.join(iri_segments)
+
+    return unparse_url((None, None, rval, rel.query, rel.fragment)) or './'
+
+ParsedUrl = namedtuple(
+    'ParsedUrl', ['scheme', 'authority', 'path', 'query', 'fragment'])
+
+def parse_url(url):
+    # regex from RFC 3986
+    p = r'^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?'
+    m = re.match(p, url)
+    # remove default http and https ports
+    g = list(m.groups())
+    if ((g[0] == 'https' and g[1].endswith(':443')) or
+            (g[0] == 'http' and g[1].endswith(':80'))):
+        g[1] = g[1][:g[1].rfind(':')]
+    return ParsedUrl(*g)
+
+def unparse_url(parsed):
+    if isinstance(parsed, dict):
+        parsed = ParsedUrl(**parsed)
+    elif isinstance(parsed, list) or isinstance(parsed, tuple):
+        parsed = ParsedUrl(*parsed)
+    rval = ''
+    if parsed.scheme:
+        rval += parsed.scheme + ':'
+    if parsed.authority is not None:
+        rval += '//' + parsed.authority
+    rval += parsed.path
+    if parsed.query is not None:
+        rval += '?' + parsed.query
+    if parsed.fragment is not None:
+        rval += '#' + parsed.fragment
+    return rval
\ No newline at end of file
diff --git a/tests/test_iri_resolver.py b/tests/test_iri_resolver.py
index 2d3e28f..7db6f6e 100644
--- a/tests/test_iri_resolver.py
+++ b/tests/test_iri_resolver.py
@@ -1,7 +1,8 @@
 import pytest
-from pyld.iri_resolver import resolve, remove_dot_segments 
+from pyld.iri_resolver import resolve, unresolve, remove_dot_segments 
 
 # Tests ported from relative-to-absolute-iri.js: https://github.com/rubensworks/relative-to-absolute-iri.js/blob/master/test/Resolve-test.ts
+# (c) Ruben Taelman <ruben.taelman@ugent.be>
 
 # ---------- Tests for resolve() ----------
 class TestResolve:
@@ -275,6 +276,49 @@ def test_relative_with_triple_dot_segment_and_2x_double_dot_and_base(self):
     def test_questionmark_prefix_relative_with_complex_base_with_dot(self):
         assert resolve('?y','http://a/bb/ccc/./d;p?q') == 'http://a/bb/ccc/./d;p?y'
 
+# ---------- Tests for unresolve() ----------
+class TestUnresolve:
+    def test_absolute_iri_no_base(self):
+        assert unresolve('http://example.org/') == 'http://example.org/'
+
+    def test_absolute_iri_empty_base(self):
+        assert unresolve('http://example.org/', '') == 'http://example.org/'
+
+    def test_absolute_iri_with_base(self):
+        assert unresolve('http://example.org/', 'http://base.org/') == 'http://example.org/'
+
+    def test_empty_value_uses_base(self):
+        assert unresolve('', 'http://base.org/') == ''
+
+    def test_absolute_with_base(self):
+        assert unresolve('http://base.org/abc', 'http://base.org/') == 'abc'
+
+    def test_absolute_with_fragment_base(self):
+        assert unresolve('http://base.org/abc', 'http://base.org/#frag') == 'abc'
+
+    def test_hash_absolute(self):
+        assert unresolve('http://base.org/#abc', 'http://base.org/') == '#abc'
+
+    def test_colon_in_value_ignores_base(self):
+        assert unresolve('http:abc', 'http://base.org/') == 'http:abc'
+
+    def test_colon_in_value_removes_dots(self):
+        assert unresolve('http://abc/../../', 'http://base.org/') == 'http://abc/'
+
+    # def test_non_absolute_base_error(self):
+    #     with pytest.raises(ValueError, match=r"Found invalid baseIRI 'def' for value 'abc'"):
+    #         unresolve('abc', 'def')
+
+    # def test_non_absolute_base_empty_value_error(self):
+    #     with pytest.raises(ValueError, match=r"Found invalid baseIRI 'def' for value ''"):
+    #         unresolve('', 'def')
+
+    def test_base_without_path_slash(self):
+        assert unresolve('http://base.org/abc', 'http://base.org') == 'abc'
+
+    def test_base_with_path_slash(self):
+        assert unresolve('http://base.org/abc/', 'http://base.org') == 'abc/'
+
 # ---------- Tests for remove_dot_segments() ----------
 class TestRemoveDotSegments:
     def test_no_slash(self):

From e54c9855ee3ac3c633f5c2b37d50e758e7ce7b81 Mon Sep 17 00:00:00 2001
From: Miel Vander Sande <miel.vandersande@meemoo.be>
Date: Mon, 24 Nov 2025 14:28:58 +0100
Subject: [PATCH 2/4] Remove incorrect test, throw error when base IRI is
 invalid and improve parsing code

---
 lib/pyld/iri_resolver.py   |  9 +++++++--
 tests/test_iri_resolver.py | 15 ++++++---------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/lib/pyld/iri_resolver.py b/lib/pyld/iri_resolver.py
index 38fbd4a..3fe93a7 100644
--- a/lib/pyld/iri_resolver.py
+++ b/lib/pyld/iri_resolver.py
@@ -222,10 +222,14 @@ def unresolve(absolute_iri: str, base_iri: str = ""):
     """
     # TODO: better sync with jsonld.js version
     # skip IRI processing
-    if base_iri is None:
+    if not base_iri:
         return absolute_iri
 
     base = parse_url(base_iri)
+
+    if not base.scheme:
+        raise ValueError(f"Found invalid baseIRI '{base_iri}' for value '{absolute_iri}'")
+    
     rel = parse_url(absolute_iri)
 
     # schemes and network locations (authorities) don't match, don't alter IRI
@@ -264,7 +268,8 @@ def parse_url(url):
     m = re.match(p, url)
     # remove default http and https ports
     g = list(m.groups())
-    if ((g[0] == 'https' and g[1].endswith(':443')) or
+    
+    if g[1] is not None and ((g[0] == 'https' and g[1].endswith(':443')) or
             (g[0] == 'http' and g[1].endswith(':80'))):
         g[1] = g[1][:g[1].rfind(':')]
     return ParsedUrl(*g)
diff --git a/tests/test_iri_resolver.py b/tests/test_iri_resolver.py
index 7db6f6e..411d5c2 100644
--- a/tests/test_iri_resolver.py
+++ b/tests/test_iri_resolver.py
@@ -302,16 +302,13 @@ def test_hash_absolute(self):
     def test_colon_in_value_ignores_base(self):
         assert unresolve('http:abc', 'http://base.org/') == 'http:abc'
 
-    def test_colon_in_value_removes_dots(self):
-        assert unresolve('http://abc/../../', 'http://base.org/') == 'http://abc/'
-
-    # def test_non_absolute_base_error(self):
-    #     with pytest.raises(ValueError, match=r"Found invalid baseIRI 'def' for value 'abc'"):
-    #         unresolve('abc', 'def')
+    def test_non_absolute_base_error(self):
+        with pytest.raises(ValueError, match=r"Found invalid baseIRI 'def' for value 'http://base.org/abc'"):
+            unresolve('http://base.org/abc', 'def')
 
-    # def test_non_absolute_base_empty_value_error(self):
-    #     with pytest.raises(ValueError, match=r"Found invalid baseIRI 'def' for value ''"):
-    #         unresolve('', 'def')
+    def test_non_absolute_base_empty_value_error(self):
+        with pytest.raises(ValueError, match=r"Found invalid baseIRI 'def' for value ''"):
+            unresolve('', 'def')
 
     def test_base_without_path_slash(self):
         assert unresolve('http://base.org/abc', 'http://base.org') == 'abc'

From fb397dcf2a6c3acbbcad26f7c6ba369e20be9b86 Mon Sep 17 00:00:00 2001
From: Miel Vander Sande <miel.vandersande@meemoo.be>
Date: Tue, 25 Nov 2025 15:11:04 +0100
Subject: [PATCH 3/4] Adjust jsonld.py to use unresolve

---
 lib/pyld/jsonld.py | 134 +--------------------------------------------
 1 file changed, 3 insertions(+), 131 deletions(-)

diff --git a/lib/pyld/jsonld.py b/lib/pyld/jsonld.py
index 5abeceb..94b62b3 100644
--- a/lib/pyld/jsonld.py
+++ b/lib/pyld/jsonld.py
@@ -31,7 +31,7 @@
 from numbers import Integral, Real
 from frozendict import frozendict
 from pyld.__about__ import (__copyright__, __license__, __version__)
-from .iri_resolver import resolve 
+from .iri_resolver import parse_url, resolve, unresolve
 
 def cmp(a, b):
     return (a > b) - (a < b)
@@ -444,134 +444,6 @@ def unregister_rdf_parser(content_type):
         del _rdf_parsers[content_type]
 
 
-def remove_base(base, iri):
-    """
-    Removes a base IRI from the given absolute IRI.
-
-    :param base: the base IRI.
-    :param iri: the absolute IRI.
-
-    :return: the relative IRI if relative to base, otherwise the absolute IRI.
-    """
-    # TODO: better sync with jsonld.js version
-    # skip IRI processing
-    if base is None:
-        return iri
-
-    base = parse_url(base)
-    rel = parse_url(iri)
-
-    # schemes and network locations (authorities) don't match, don't alter IRI
-    if not (base.scheme == rel.scheme and base.authority == rel.authority):
-        return iri
-
-    # remove path segments that match (do not remove last segment unless there
-    # is a hash or query
-    base_segments = remove_dot_segments(base.path).split('/')
-    iri_segments = remove_dot_segments(rel.path).split('/')
-    last = 0 if (rel.fragment or rel.query) else 1
-    while (len(base_segments) and len(iri_segments) > last and
-            base_segments[0] == iri_segments[0]):
-        base_segments.pop(0)
-        iri_segments.pop(0)
-
-    # use '../' for each non-matching base segment
-    rval = ''
-    if len(base_segments):
-        # don't count the last segment (if it ends with '/' last path doesn't
-        # count and if it doesn't end with '/' it isn't a path)
-        base_segments.pop()
-        rval += '../' * len(base_segments)
-
-    # prepend remaining segments
-    rval += '/'.join(iri_segments)
-
-    return unparse_url((None, None, rval, rel.query, rel.fragment)) or './'
-
-
-def remove_dot_segments(path):
-    """
-    Removes dot segments from a URL path.
-
-    :param path: the path to remove dot segments from.
-
-    :return: a path with normalized dot segments.
-    """
-
-    # RFC 3986 5.2.4 (reworked)
-
-    # empty path shortcut
-    if len(path) == 0:
-        return ''
-
-    input = path.split('/')
-    output = []
-
-    while len(input) > 0:
-        next = input.pop(0)
-        done = len(input) == 0
-
-        if next == '.':
-            if done:
-                # ensure output has trailing /
-                output.append('')
-            continue
-
-        if next == '..':
-            if len(output) > 0:
-                output.pop()
-            if done:
-                # ensure output has trailing /
-                output.append('')
-            continue
-
-        output.append(next)
-
-    # ensure output has leading /
-    # merge path segments from section 5.2.3
-    # note that if the path includes no segments, the entire path is removed
-    if len(output) > 0 and path.startswith('/') and output[0] != '':
-        output.insert(0, '')
-    if len(output) == 1 and output[0] == '':
-        return '/'
-
-    return '/'.join(output)
-
-
-ParsedUrl = namedtuple(
-    'ParsedUrl', ['scheme', 'authority', 'path', 'query', 'fragment'])
-
-
-def parse_url(url):
-    # regex from RFC 3986
-    p = r'^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?'
-    m = re.match(p, url)
-    # remove default http and https ports
-    g = list(m.groups())
-    if ((g[0] == 'https' and g[1].endswith(':443')) or
-            (g[0] == 'http' and g[1].endswith(':80'))):
-        g[1] = g[1][:g[1].rfind(':')]
-    return ParsedUrl(*g)
-
-
-def unparse_url(parsed):
-    if isinstance(parsed, dict):
-        parsed = ParsedUrl(**parsed)
-    elif isinstance(parsed, list) or isinstance(parsed, tuple):
-        parsed = ParsedUrl(*parsed)
-    rval = ''
-    if parsed.scheme:
-        rval += parsed.scheme + ':'
-    if parsed.authority is not None:
-        rval += '//' + parsed.authority
-    rval += parsed.path
-    if parsed.query is not None:
-        rval += '?' + parsed.query
-    if parsed.fragment is not None:
-        rval += '#' + parsed.fragment
-    return rval
-
-
 class JsonLdProcessor(object):
     """
     A JSON-LD processor.
@@ -4814,9 +4686,9 @@ def _compact_iri(
                 if active_ctx['@base'] is None:
                     return iri
                 else:
-                    return remove_base(resolve(active_ctx['@base'], base), iri)
+                    return unresolve(iri, resolve(active_ctx['@base'], base))
             else:
-                return remove_base(base, iri)
+                return unresolve(iri, base)
 
         # return IRI as is
         return iri

From f480961c13bed43abb8ea3b0d4983ad0d4b22ce5 Mon Sep 17 00:00:00 2001
From: Miel Vander Sande <miel.vandersande@meemoo.be>
Date: Tue, 25 Nov 2025 22:23:28 +0100
Subject: [PATCH 4/4] Simplify unresolve code by using stdlib url(un)parse

---
 lib/pyld/iri_resolver.py | 68 ++++++++++++++++++----------------------
 lib/pyld/jsonld.py       |  5 +--
 2 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/lib/pyld/iri_resolver.py b/lib/pyld/iri_resolver.py
index 3fe93a7..ab38bb0 100644
--- a/lib/pyld/iri_resolver.py
+++ b/lib/pyld/iri_resolver.py
@@ -3,8 +3,7 @@
 - The 'unresolve()' function is a move and rename of the 'remove_base()' function from 'jsonld.py'
 """
 
-from collections import namedtuple
-import re
+from urllib.parse import urlparse, urlunparse
 
 
 def is_character_allowed_after_relative_path_segment(ch: str) -> bool:
@@ -220,20 +219,24 @@ def unresolve(absolute_iri: str, base_iri: str = ""):
 
     :return: the relative IRI if relative to base, otherwise the absolute IRI.
     """
-    # TODO: better sync with jsonld.js version
     # skip IRI processing
     if not base_iri:
         return absolute_iri
 
-    base = parse_url(base_iri)
+    base = urlparse(base_iri)
 
     if not base.scheme:
         raise ValueError(f"Found invalid baseIRI '{base_iri}' for value '{absolute_iri}'")
-    
-    rel = parse_url(absolute_iri)
+
+    # compute authority (netloc) and strip default ports
+    base_authority = parse_authority(base)
+
+    rel = urlparse(absolute_iri)
+    # compute authority (netloc) and strip default ports
+    rel_authority = parse_authority(rel)
 
     # schemes and network locations (authorities) don't match, don't alter IRI
-    if not (base.scheme == rel.scheme and base.authority == rel.authority):
+    if not (base.scheme == rel.scheme and base_authority == rel_authority):
         return absolute_iri
 
     # remove path segments that match (do not remove last segment unless there
@@ -257,36 +260,25 @@ def unresolve(absolute_iri: str, base_iri: str = ""):
     # prepend remaining segments
     rval += '/'.join(iri_segments)
 
-    return unparse_url((None, None, rval, rel.query, rel.fragment)) or './'
-
-ParsedUrl = namedtuple(
-    'ParsedUrl', ['scheme', 'authority', 'path', 'query', 'fragment'])
+    # build relative IRI using urlunparse with empty scheme/netloc
+    return urlunparse(('', '', rval, '', rel.query or '', rel.fragment or '')) or './'
 
-def parse_url(url):
-    # regex from RFC 3986
-    p = r'^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?'
-    m = re.match(p, url)
-    # remove default http and https ports
-    g = list(m.groups())
+def parse_authority(parsed_iri) -> str:
+    """
+    Compute authority (netloc) and strip default ports
     
-    if g[1] is not None and ((g[0] == 'https' and g[1].endswith(':443')) or
-            (g[0] == 'http' and g[1].endswith(':80'))):
-        g[1] = g[1][:g[1].rfind(':')]
-    return ParsedUrl(*g)
-
-def unparse_url(parsed):
-    if isinstance(parsed, dict):
-        parsed = ParsedUrl(**parsed)
-    elif isinstance(parsed, list) or isinstance(parsed, tuple):
-        parsed = ParsedUrl(*parsed)
-    rval = ''
-    if parsed.scheme:
-        rval += parsed.scheme + ':'
-    if parsed.authority is not None:
-        rval += '//' + parsed.authority
-    rval += parsed.path
-    if parsed.query is not None:
-        rval += '?' + parsed.query
-    if parsed.fragment is not None:
-        rval += '#' + parsed.fragment
-    return rval
\ No newline at end of file
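+    :param parsed_iri: the parsed IRI, as returned by urllib.parse.urlparse.
+    :return: the authority (netloc) with a default http/https port stripped, or None if the netloc is empty.
+    :rtype: str or None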
+    """ 
+    base_authority = parsed_iri.netloc or None
+    
+    try:
+        base_port = parsed_iri.port
+    except Exception:
+        base_port = None
+    
+    if base_authority is not None and base_port is not None:
+        if (parsed_iri.scheme == 'https' and base_port == 443) or (parsed_iri.scheme == 'http' and base_port == 80):
+            base_authority = base_authority.rsplit(':', 1)[0]
+    return base_authority
\ No newline at end of file
diff --git a/lib/pyld/jsonld.py b/lib/pyld/jsonld.py
index 94b62b3..40201c2 100644
--- a/lib/pyld/jsonld.py
+++ b/lib/pyld/jsonld.py
@@ -20,6 +20,7 @@
 import re
 import sys
 import traceback
+from urllib.parse import urlparse
 import warnings
 import uuid
 from .context_resolver import ContextResolver
@@ -31,7 +32,7 @@
 from numbers import Integral, Real
 from frozendict import frozendict
 from pyld.__about__ import (__copyright__, __license__, __version__)
-from .iri_resolver import parse_url, resolve, unresolve
+from .iri_resolver import resolve, unresolve
 
 def cmp(a, b):
     return (a > b) - (a < b)
@@ -6445,7 +6446,7 @@ def load_html(input, url, profile, options):
             html_base = resolve(html_base[0], effective_base)
         options['base'] = html_base
 
-    url_elements = parse_url(url)
+    url_elements = urlparse(url)
     if url_elements.fragment:
         # FIXME: CGI decode
         id = url_elements.fragment