Skip to content

Commit 32efe97

Browse files
committed
feat: fsspec for all non-object writing - %-encoded urls no longer decoded (#1034)
* writing goes through fsspec
* increase rc version
* type hints and docs
* add helper methods, create
* throw more specific error
* add additional test for `create` failure with scheme other than local
* simplify source selection
* remove windows specific code
* raise exception if invalid combination of handler / input (file-like object and fsspec)
* use softer check for file-like object
* cover problematic case with additional slash (file:///c:/file.root)
* test "file:" scheme (no slash)
* test backslash
1 parent 0777461 commit 32efe97

File tree

9 files changed

+225
-292
lines changed

9 files changed

+225
-292
lines changed

src/uproot/_util.py

Lines changed: 49 additions & 90 deletions
Original file line number | Diff line number | Diff line change
@@ -12,17 +12,20 @@
1212
import itertools
1313
import numbers
1414
import os
15-
import platform
1615
import re
1716
import warnings
1817
from collections.abc import Iterable
19-
from urllib.parse import unquote, urlparse
18+
from pathlib import Path
19+
from typing import IO
20+
from urllib.parse import urlparse
2021

2122
import fsspec
2223
import numpy
2324
import packaging.version
2425

25-
win = platform.system().lower().startswith("win")
26+
import uproot.source.chunk
27+
import uproot.source.fsspec
28+
import uproot.source.object
2629

2730

2831
def tobytes(array):
@@ -36,7 +39,7 @@ def tobytes(array):
3639
return array.tostring()
3740

3841

39-
def isint(x):
42+
def isint(x) -> bool:
4043
"""
4144
Returns True if and only if ``x`` is an integer (including NumPy, not
4245
including bool).
@@ -46,7 +49,7 @@ def isint(x):
4649
)
4750

4851

49-
def isnum(x):
52+
def isnum(x) -> bool:
5053
"""
5154
Returns True if and only if ``x`` is a number (including NumPy, not
5255
including bool).
@@ -56,7 +59,7 @@ def isnum(x):
5659
)
5760

5861

59-
def ensure_str(x):
62+
def ensure_str(x) -> str:
6063
"""
6164
Ensures that ``x`` is a string (decoding with 'surrogateescape' if necessary).
6265
"""
@@ -94,18 +97,17 @@ def is_file_like(
9497
obj, readable: bool = False, writable: bool = False, seekable: bool = False
9598
) -> bool:
9699
return (
97-
callable(getattr(obj, "read", None))
98-
and callable(getattr(obj, "write", None))
99-
and callable(getattr(obj, "seek", None))
100-
and callable(getattr(obj, "tell", None))
101-
and callable(getattr(obj, "flush", None))
100+
all(
101+
callable(getattr(obj, attr, None))
102+
for attr in ("read", "write", "seek", "tell", "flush")
103+
)
102104
and (not readable or not hasattr(obj, "readable") or obj.readable())
103105
and (not writable or not hasattr(obj, "writable") or obj.writable())
104106
and (not seekable or not hasattr(obj, "seekable") or obj.seekable())
105107
)
106108

107109

108-
def parse_version(version):
110+
def parse_version(version: str):
109111
"""
110112
Converts a semver string into a Version object that can be compared with
111113
``<``, ``>=``, etc.
@@ -116,7 +118,7 @@ def parse_version(version):
116118
return packaging.version.parse(version)
117119

118120

119-
def from_module(obj, module_name):
121+
def from_module(obj, module_name: str) -> bool:
120122
"""
121123
Returns True if ``obj`` is an instance of a class from a module
122124
given by name.
@@ -155,7 +157,7 @@ def _regularize_filter_regex_flags(flags):
155157
return flagsbyte
156158

157159

158-
def no_filter(x):
160+
def no_filter(x) -> bool:
159161
"""
160162
A filter that accepts anything (always returns True).
161163
"""
@@ -285,10 +287,6 @@ def regularize_path(path):
285287
return path
286288

287289

288-
_windows_drive_letter_ending = re.compile(r".*\b[A-Za-z]$")
289-
_windows_absolute_path_pattern = re.compile(r"^[A-Za-z]:[\\/]")
290-
_windows_absolute_path_pattern_slash = re.compile(r"^[\\/][A-Za-z]:[\\/]")
291-
292290
# These schemes may not appear in fsspec if the corresponding libraries are not installed (e.g. s3fs)
293291
_remote_schemes = ["root", "s3", "http", "https"]
294292
_schemes = list({*_remote_schemes, *fsspec.available_protocols()})
@@ -324,87 +322,48 @@ def file_object_path_split(urlpath: str) -> tuple[str, str | None]:
324322
return urlpath, obj
325323

326324

327-
def file_path_to_source_class(file_path, options):
325+
def file_path_to_source_class(
326+
file_path_or_object: str | Path | IO, options: dict
327+
) -> tuple[type[uproot.source.chunk.Source], str | IO]:
328328
"""
329329
Use a file path to get the :doc:`uproot.source.chunk.Source` class that would read it.
330330
331331
Returns a tuple of (class, file_path) where the class is a subclass of :doc:`uproot.source.chunk.Source`.
332332
"""
333333

334-
import uproot.source.chunk
335-
336-
file_path = regularize_path(file_path)
334+
file_path_or_object: str | IO = regularize_path(file_path_or_object)
337335

338336
source_cls = options["handler"]
339-
if source_cls is not None:
340-
if not (
341-
isinstance(source_cls, type)
342-
and issubclass(source_cls, uproot.source.chunk.Source)
337+
if source_cls is not None and not (
338+
isinstance(source_cls, type)
339+
and issubclass(source_cls, uproot.source.chunk.Source)
340+
):
341+
raise TypeError(
342+
f"'handler' is not a class object inheriting from Source: {source_cls!r}"
343+
)
344+
345+
# Infer the source class from the file path
346+
if all(
347+
callable(getattr(file_path_or_object, attr, None)) for attr in ("read", "seek")
348+
):
349+
# need a very soft object check for ubuntu python3.8 pyroot ci tests, cannot use uproot._util.is_file_like
350+
if (
351+
source_cls is not None
352+
and source_cls is not uproot.source.object.ObjectSource
343353
):
344354
raise TypeError(
345-
f"'handler' is not a class object inheriting from Source: {source_cls!r}"
355+
f"'handler' is not ObjectSource for a file-like object: {source_cls!r}"
346356
)
347-
return source_cls, file_path
348-
349-
if (
350-
not isinstance(file_path, str)
351-
and hasattr(file_path, "read")
352-
and hasattr(file_path, "seek")
353-
):
354-
source_cls = uproot.source.object.ObjectSource
355-
return source_cls, file_path
356-
357-
windows_absolute_path = None
358-
if win and _windows_absolute_path_pattern.match(file_path) is not None:
359-
windows_absolute_path = file_path
360-
361-
parsed_url = urlparse(file_path)
362-
if parsed_url.scheme.lower() == "file":
363-
parsed_url_path = unquote(parsed_url.path)
357+
return uproot.source.object.ObjectSource, file_path_or_object
358+
elif isinstance(file_path_or_object, str):
359+
source_cls = (
360+
uproot.source.fsspec.FSSpecSource if source_cls is None else source_cls
361+
)
362+
return source_cls, file_path_or_object
364363
else:
365-
parsed_url_path = parsed_url.path
366-
367-
if win and windows_absolute_path is None:
368-
if _windows_absolute_path_pattern.match(parsed_url_path) is not None:
369-
windows_absolute_path = parsed_url_path
370-
elif _windows_absolute_path_pattern_slash.match(parsed_url_path) is not None:
371-
windows_absolute_path = parsed_url_path[1:]
372-
373-
scheme = parsed_url.scheme.lower()
374-
if (
375-
scheme == "file"
376-
or len(parsed_url.scheme) == 0
377-
or windows_absolute_path is not None
378-
):
379-
if windows_absolute_path is None:
380-
if parsed_url.netloc.lower() == "localhost":
381-
file_path = parsed_url_path
382-
else:
383-
file_path = parsed_url.netloc + parsed_url_path
384-
else:
385-
file_path = windows_absolute_path
386-
387-
# uproot.source.file.MemmapSource
388-
source_cls = uproot.source.fsspec.FSSpecSource
389-
390-
return source_cls, os.path.expanduser(file_path)
391-
392-
elif scheme == "root":
393-
# uproot.source.xrootd.XRootDSource
394-
source_cls = uproot.source.fsspec.FSSpecSource
395-
return source_cls, file_path
396-
397-
elif scheme == "s3":
398-
# uproot.source.s3.S3Source
399-
source_cls = uproot.source.fsspec.FSSpecSource
400-
return source_cls, file_path
401-
402-
elif scheme in ("http", "https"):
403-
# uproot.source.http.HTTPSource
404-
source_cls = uproot.source.fsspec.FSSpecSource
405-
return source_cls, file_path
406-
407-
return uproot.source.fsspec.FSSpecSource, file_path
364+
raise TypeError(
365+
f"file_path is not a string or file-like object: {file_path_or_object!r}"
366+
)
408367

409368

410369
if isinstance(__builtins__, dict):
@@ -448,7 +407,7 @@ def _file_not_found(files, message=None):
448407
)
449408

450409

451-
def memory_size(data, error_message=None):
410+
def memory_size(data, error_message=None) -> int:
452411
"""
453412
Regularizes strings like '## kB' and plain integer number of bytes to
454413
an integer number of bytes.
@@ -739,7 +698,7 @@ def damerau_levenshtein(a, b, ratio=False):
739698
# Modified Damerau-Levenshtein distance. Adds a middling penalty
740699
# for capitalization.
741700
# https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
742-
M = [[0] * (len(b) + 1) for i in range(len(a) + 1)]
701+
M = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
743702

744703
for i in range(len(a) + 1):
745704
M[i][0] = i
@@ -771,7 +730,7 @@ def damerau_levenshtein(a, b, ratio=False):
771730
# Transpose only
772731
M[i][j] = min(M[i][j], M[i - 2][j - 2] + 1)
773732
else:
774-
# Traspose and capitalization
733+
# Transpose and capitalization
775734
M[i][j] = min(M[i][j], M[i - 2][j - 2] + 1.5)
776735

777736
if not ratio:

src/uproot/reading.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,10 +9,14 @@
99
"""
1010
from __future__ import annotations
1111

12+
from __future__ import annotations
13+
1214
import struct
1315
import sys
1416
import uuid
1517
from collections.abc import Mapping, MutableMapping
18+
from pathlib import Path
19+
from typing import IO
1620

1721
import uproot
1822
import uproot.behaviors.TBranch
@@ -525,7 +529,7 @@ class ReadOnlyFile(CommonFileMethods):
525529

526530
def __init__(
527531
self,
528-
file_path,
532+
file_path: str | Path | IO,
529533
*,
530534
object_cache=100,
531535
array_cache="100 MB",

0 commit comments

Comments
 (0)