Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 79 additions & 2 deletions lib/pyld/iri_resolver.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
"""
The functions 'remove_dot_segments()', 'resolve()' and 'is_character_allowed_after_relative_path_segment()' are direct ports from [relative-to-absolute-iri.js](https://github.com/rubensworks/relative-to-absolute-iri.js)
- The functions 'remove_dot_segments()', 'resolve()' and 'is_character_allowed_after_relative_path_segment()' are direct ports from [relative-to-absolute-iri.js](https://github.com/rubensworks/relative-to-absolute-iri.js) (c) Ruben Taelman <[email protected]>
- The 'unresolve()' function is a move and rename of the 'remove_base()' function from 'jsonld.py'
"""

from urllib.parse import urlparse, urlunparse


def is_character_allowed_after_relative_path_segment(ch: str) -> bool:
    """
    Tell whether `ch` may directly follow a '.' or '..' path segment.

    An empty value (end of input) is accepted, as are the delimiters
    '#', '?' and '/'.
    """
    if not ch:
        # nothing follows the segment
        return True
    return ch in ('#', '?', '/')
Expand Down Expand Up @@ -204,4 +208,77 @@ def resolve(relative_iri: str, base_iri: str = None) -> str:
relative_iri = base_path + relative_iri
relative_iri = remove_dot_segments(relative_iri)

return base_iri[:base_slash_after_colon_pos] + relative_iri
return base_iri[:base_slash_after_colon_pos] + relative_iri

def unresolve(absolute_iri: str, base_iri: str = "") -> str:
    """
    Unresolves a given absolute IRI to an IRI relative to the given base IRI.

    :param absolute_iri: the absolute IRI to make relative.
    :param base_iri: the base IRI; when empty, no processing is performed.

    :return: the relative IRI if relative to base, otherwise the absolute IRI.
    :raises ValueError: if a non-empty base IRI has no scheme.
    """
    # urlsplit (unlike urlparse) leaves ';params' inside the path, so a
    # trailing ';...' on the last path segment is not dropped from the result
    from urllib.parse import urlsplit, urlunsplit

    # skip IRI processing
    if not base_iri:
        return absolute_iri

    base = urlsplit(base_iri)

    if not base.scheme:
        raise ValueError(f"Found invalid baseIRI '{base_iri}' for value '{absolute_iri}'")

    rel = urlsplit(absolute_iri)

    # compute authorities (netloc) with default ports stripped
    base_authority = parse_authority(base)
    rel_authority = parse_authority(rel)

    # schemes and network locations (authorities) don't match, don't alter IRI
    if not (base.scheme == rel.scheme and base_authority == rel_authority):
        return absolute_iri

    # remove path segments that match (do not remove last segment unless there
    # is a hash or query)
    base_segments = remove_dot_segments(base.path).split('/')
    iri_segments = remove_dot_segments(rel.path).split('/')
    last = 0 if (rel.fragment or rel.query) else 1
    while (base_segments and len(iri_segments) > last and
            base_segments[0] == iri_segments[0]):
        base_segments.pop(0)
        iri_segments.pop(0)

    # use '../' for each non-matching base segment
    rval = ''
    if base_segments:
        # don't count the last segment (if it ends with '/' last path doesn't
        # count and if it doesn't end with '/' it isn't a path)
        base_segments.pop()
        rval += '../' * len(base_segments)

    # append the remaining IRI segments
    rval += '/'.join(iri_segments)

    # build the relative IRI with empty scheme/netloc; an entirely empty
    # result means "same document as base" and is spelled './'
    return urlunsplit(('', '', rval, rel.query, rel.fragment)) or './'

def parse_authority(parsed_iri) -> 'str | None':
    """
    Compute the authority (netloc) of a parsed IRI and strip default ports.

    :param parsed_iri: a parse result exposing 'scheme', 'netloc' and 'port'
        (for example the value returned by urllib.parse.urlparse).

    :return: the netloc with a trailing default port removed (':80' for
        http, ':443' for https), or None when the IRI has no netloc.
    """
    authority = parsed_iri.netloc or None
    if authority is None:
        return None

    try:
        port = parsed_iri.port
    except ValueError:
        # the port is not a valid number; leave the authority untouched
        return authority

    default_port = {'http': 80, 'https': 443}.get(parsed_iri.scheme)
    if port is not None and port == default_port:
        # drop the ':<port>' suffix
        authority = authority.rsplit(':', 1)[0]
    return authority
137 changes: 5 additions & 132 deletions lib/pyld/jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import re
import sys
import traceback
from urllib.parse import urlparse
import warnings
import uuid
from .context_resolver import ContextResolver
Expand All @@ -31,7 +32,7 @@
from numbers import Integral, Real
from frozendict import frozendict
from pyld.__about__ import (__copyright__, __license__, __version__)
from .iri_resolver import resolve
from .iri_resolver import resolve, unresolve

def cmp(a, b):
    """
    Three-way comparison of two values.

    :return: 1 if a > b, -1 if a < b, otherwise 0.
    """
    is_greater = a > b
    is_less = a < b
    return is_greater - is_less
Expand Down Expand Up @@ -444,134 +445,6 @@ def unregister_rdf_parser(content_type):
del _rdf_parsers[content_type]


def remove_base(base, iri):
    """
    Makes an absolute IRI relative to a base IRI.

    :param base: the base IRI (None disables processing).
    :param iri: the absolute IRI.

    :return: the relative IRI if relative to base, otherwise the absolute IRI.
    """
    # TODO: better sync with jsonld.js version
    # nothing to do without a base
    if base is None:
        return iri

    base = parse_url(base)
    rel = parse_url(iri)

    # differing scheme or authority: the IRI cannot be expressed relative
    # to the base, so hand it back untouched
    same_origin = (
        base.scheme == rel.scheme and base.authority == rel.authority)
    if not same_origin:
        return iri

    base_parts = remove_dot_segments(base.path).split('/')
    iri_parts = remove_dot_segments(rel.path).split('/')

    # the final IRI segment is only allowed to be consumed when a query or
    # fragment remains to identify the target
    if rel.fragment or rel.query:
        keep = 0
    else:
        keep = 1

    # count the leading segments shared by both paths
    shared = 0
    limit = min(len(base_parts), len(iri_parts) - keep)
    while shared < limit and base_parts[shared] == iri_parts[shared]:
        shared += 1
    base_rest = base_parts[shared:]
    iri_rest = iri_parts[shared:]

    # one '../' per unmatched base segment, not counting the last one (it is
    # either empty because the base ends with '/', or it is not a directory)
    if base_rest:
        climb = '../' * (len(base_rest) - 1)
    else:
        climb = ''

    relative_path = climb + '/'.join(iri_rest)

    return unparse_url(
        (None, None, relative_path, rel.query, rel.fragment)) or './'


def remove_dot_segments(path):
    """
    Removes dot segments from a URL path.

    :param path: the path to remove dot segments from.

    :return: a path with normalized dot segments.
    """

    # RFC 3986 5.2.4 (reworked)

    # empty path shortcut
    if len(path) == 0:
        return ''

    segments = path.split('/')
    final_index = len(segments) - 1
    output = []

    # single pass over the segments; avoids repeated pop(0) on the input and
    # does not shadow the builtins 'input' and 'next'
    for index, segment in enumerate(segments):
        if segment == '.' or segment == '..':
            # '..' discards the previously kept segment, if any
            if segment == '..' and output:
                output.pop()
            if index == final_index:
                # ensure output has trailing /
                output.append('')
            continue

        output.append(segment)

    # ensure output has leading /
    # merge path segments from section 5.2.3
    # note that if the path includes no segments, the entire path is removed
    if output and path.startswith('/') and output[0] != '':
        output.insert(0, '')
    if output == ['']:
        return '/'

    return '/'.join(output)


# Five URL components as split by parse_url(); a component that is absent
# from the URL is None (the path is always a string, possibly empty).
ParsedUrl = namedtuple(
    'ParsedUrl', ['scheme', 'authority', 'path', 'query', 'fragment'])


def parse_url(url):
    """
    Splits a URL into scheme, authority, path, query and fragment.

    Default ports (':80' for http, ':443' for https) are removed from the
    authority.

    :param url: the URL string to parse.

    :return: a ParsedUrl; components missing from the URL are None.
    """
    # regex from RFC 3986
    p = r'^(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?'
    m = re.match(p, url)
    scheme, authority, path, query, fragment = m.groups()
    # remove default http and https ports; the authority is None when the URL
    # has no '//' part (e.g. 'http:abc'), so guard before inspecting it
    if authority is not None and (
            (scheme == 'https' and authority.endswith(':443')) or
            (scheme == 'http' and authority.endswith(':80'))):
        authority = authority[:authority.rfind(':')]
    return ParsedUrl(scheme, authority, path, query, fragment)


def unparse_url(parsed):
    """
    Reassembles a URL string from its components.

    :param parsed: a dict, list or tuple of the five components (converted
        to a ParsedUrl first), or any object with 'scheme', 'authority',
        'path', 'query' and 'fragment' attributes.

    :return: the URL string.
    """
    if isinstance(parsed, dict):
        parsed = ParsedUrl(**parsed)
    elif isinstance(parsed, (list, tuple)):
        parsed = ParsedUrl(*parsed)

    # an empty scheme is omitted, whereas authority, query and fragment are
    # emitted (with their delimiter) whenever they are not None
    scheme_part = parsed.scheme + ':' if parsed.scheme else ''
    if parsed.authority is not None:
        authority_part = '//' + parsed.authority
    else:
        authority_part = ''
    query_part = '?' + parsed.query if parsed.query is not None else ''
    if parsed.fragment is not None:
        fragment_part = '#' + parsed.fragment
    else:
        fragment_part = ''

    return (scheme_part + authority_part + parsed.path
            + query_part + fragment_part)


class JsonLdProcessor(object):
"""
A JSON-LD processor.
Expand Down Expand Up @@ -4814,9 +4687,9 @@ def _compact_iri(
if active_ctx['@base'] is None:
return iri
else:
return remove_base(resolve(active_ctx['@base'], base), iri)
return unresolve(iri, resolve(active_ctx['@base'], base))
else:
return remove_base(base, iri)
return unresolve(iri, base)

# return IRI as is
return iri
Expand Down Expand Up @@ -6573,7 +6446,7 @@ def load_html(input, url, profile, options):
html_base = resolve(html_base[0], effective_base)
options['base'] = html_base

url_elements = parse_url(url)
url_elements = urlparse(url)
if url_elements.fragment:
# FIXME: CGI decode
id = url_elements.fragment
Expand Down
43 changes: 42 additions & 1 deletion tests/test_iri_resolver.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import pytest
from pyld.iri_resolver import resolve, remove_dot_segments
from pyld.iri_resolver import resolve, unresolve, remove_dot_segments

# Tests ported from relative-to-absolute-iri.js: https://github.com/rubensworks/relative-to-absolute-iri.js/blob/master/test/Resolve-test.ts
# (c) Ruben Taelman <stevenlevithan.com>

# ---------- Tests for resolve() ----------
class TestResolve:
Expand Down Expand Up @@ -275,6 +276,46 @@ def test_relative_with_triple_dot_segment_and_2x_double_dot_and_base(self):
def test_questionmark_prefix_relative_with_complex_base_with_dot(self):
assert resolve('?y','http://a/bb/ccc/./d;p?q') == 'http://a/bb/ccc/./d;p?y'

# ---------- Tests for unresolve() ----------
class TestUnresolve:
    """Checks for unresolve(): turning absolute IRIs into base-relative ones."""

    def test_absolute_iri_no_base(self):
        # without a base the IRI is returned unchanged
        result = unresolve('http://example.org/')
        assert result == 'http://example.org/'

    def test_absolute_iri_empty_base(self):
        # an empty base behaves like no base at all
        result = unresolve('http://example.org/', '')
        assert result == 'http://example.org/'

    def test_absolute_iri_with_base(self):
        # a different authority leaves the IRI untouched
        result = unresolve('http://example.org/', 'http://base.org/')
        assert result == 'http://example.org/'

    def test_empty_value_uses_base(self):
        result = unresolve('', 'http://base.org/')
        assert result == ''

    def test_absolute_with_base(self):
        result = unresolve('http://base.org/abc', 'http://base.org/')
        assert result == 'abc'

    def test_absolute_with_fragment_base(self):
        # a fragment on the base does not affect the outcome
        result = unresolve('http://base.org/abc', 'http://base.org/#frag')
        assert result == 'abc'

    def test_hash_absolute(self):
        result = unresolve('http://base.org/#abc', 'http://base.org/')
        assert result == '#abc'

    def test_colon_in_value_ignores_base(self):
        result = unresolve('http:abc', 'http://base.org/')
        assert result == 'http:abc'

    def test_non_absolute_base_error(self):
        expected = r"Found invalid baseIRI 'def' for value 'http://base.org/abc'"
        with pytest.raises(ValueError, match=expected):
            unresolve('http://base.org/abc', 'def')

    def test_non_absolute_base_empty_value_error(self):
        expected = r"Found invalid baseIRI 'def' for value ''"
        with pytest.raises(ValueError, match=expected):
            unresolve('', 'def')

    def test_base_without_path_slash(self):
        result = unresolve('http://base.org/abc', 'http://base.org')
        assert result == 'abc'

    def test_base_with_path_slash(self):
        result = unresolve('http://base.org/abc/', 'http://base.org')
        assert result == 'abc/'

# ---------- Tests for remove_dot_segments() ----------
class TestRemoveDotSegments:
def test_no_slash(self):
Expand Down
Loading