diff --git a/README.md b/README.md index 747e4a55..dca743e7 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,7 @@ For more examples, see the [example notebook here][example-notebook]. - `az:`, `adl:`, `abfs:` and `abfss:` Azure Storage _(requires `adlfs`)_ - `data:` RFC 2397 style data URLs _(requires `fsspec>=2023.12.2`)_ - `github:` GitHub repository filesystem +- `hf:` Hugging Face filesystem _(requires `huggingface_hub`)_ - `http:` and `https:` HTTP(S)-based filesystem - `hdfs:` Hadoop distributed filesystem - `gs:` and `gcs:` Google Cloud Storage _(requires `gcsfs`)_ diff --git a/upath/core.py b/upath/core.py index 3aaa93af..0815f976 100644 --- a/upath/core.py +++ b/upath/core.py @@ -507,6 +507,13 @@ def __new__( **_: Any, ) -> _uimpl.cloud.AzurePath: ... @overload # noqa: E301 + def __new__( + cls, + *args: JoinablePathLike, + protocol: Literal["hf"], + **_: Any, + ) -> _uimpl.cloud.HfPath: ... + @overload # noqa: E301 def __new__( cls, *args: JoinablePathLike, diff --git a/upath/implementations/cloud.py b/upath/implementations/cloud.py index b62e7bc0..8001abd8 100644 --- a/upath/implementations/cloud.py +++ b/upath/implementations/cloud.py @@ -23,6 +23,7 @@ from upath._chain import FSSpecChainParser from upath.types.storage_options import AzureStorageOptions from upath.types.storage_options import GCSStorageOptions + from upath.types.storage_options import HfStorageOptions from upath.types.storage_options import S3StorageOptions __all__ = [ @@ -30,6 +31,7 @@ "GCSPath", "S3Path", "AzurePath", + "HfPath", ] @@ -157,3 +159,18 @@ def __init__( ) if not self.drive and len(self.parts) > 1: raise ValueError("non key-like path provided (bucket/container missing)") + + +class HfPath(CloudPath): + __slots__ = () + + def __init__( + self, + *args: JoinablePathLike, + protocol: Literal["hf"] | None = None, + chain_parser: FSSpecChainParser = DEFAULT_CHAIN_PARSER, + **storage_options: Unpack[HfStorageOptions], + ) -> None: + super().__init__( + *args, 
protocol=protocol, chain_parser=chain_parser, **storage_options + ) diff --git a/upath/registry.py b/upath/registry.py index ceff1b92..5f78e81d 100644 --- a/upath/registry.py +++ b/upath/registry.py @@ -95,6 +95,7 @@ class _Registry(MutableMapping[str, "type[upath.UPath]"]): "gcs": "upath.implementations.cloud.GCSPath", "gs": "upath.implementations.cloud.GCSPath", "hdfs": "upath.implementations.hdfs.HDFSPath", + "hf": "upath.implementations.cloud.HfPath", "http": "upath.implementations.http.HTTPPath", "https": "upath.implementations.http.HTTPPath", "memory": "upath.implementations.memory.MemoryPath", diff --git a/upath/tests/conftest.py b/upath/tests/conftest.py index 46eb1d21..08b10386 100644 --- a/upath/tests/conftest.py +++ b/upath/tests/conftest.py @@ -553,3 +553,99 @@ def ssh_fixture(ssh_container, local_testdir, monkeypatch): ) finally: fs.delete("/app/testdir", recursive=True) + + +@pytest.fixture +def hf_test_repo(): + # "__username__" is an invalid username so we can use it for tests + return "__username__/test_repo" + + +@pytest.fixture +def mock_hf_api(pathlib_base, monkeypatch, hf_test_repo): # noqa: C901 + huggingface_hub = pytest.importorskip( + "huggingface_hub", reason="hf tests require huggingface_hub" + ) + hf_file_system = pytest.importorskip( + "huggingface_hub.hf_file_system", reason="hf tests require huggingface_hub" + ) + + class MockedHfApi(huggingface_hub.HfApi): + + def repo_info(self, repo_id, *args, repo_type=None, **kwargs): + if repo_id != hf_test_repo: + raise huggingface_hub.errors.RepositoryNotFoundError(repo_id) + elif repo_type is None or repo_type == "model": + return huggingface_hub.hf_api.ModelInfo(id=repo_id) + elif repo_type == "dataset": + return huggingface_hub.hf_api.DatasetInfo(id=repo_id) + elif repo_type == "space": + return huggingface_hub.hf_api.SpaceInfo(id=repo_id) + else: + raise ValueError("Unsupported repo type.") + + def get_paths_info(self, repo_id, paths, *args, **kwargs): + if repo_id != hf_test_repo: + raise 
huggingface_hub.errors.RepositoryNotFoundError(repo_id) + paths_info = [] + for path in paths: + if path: + path = pathlib_base / path + if path.is_file(): + paths_info.append( + huggingface_hub.hf_api.RepoFile( + path=path.relative_to(pathlib_base).as_posix(), + blob_id="blob_id", + size=path.stat().st_size, + ) + ) + elif path.is_dir(): + paths_info.append( + huggingface_hub.hf_api.RepoFolder( + path=path.relative_to(pathlib_base).as_posix(), + tree_id="tree_id", + ) + ) + return paths_info + + def list_repo_tree( + self, repo_id, path_in_repo, *args, recursive=False, **kwargs + ): + if repo_id != hf_test_repo: + raise huggingface_hub.errors.RepositoryNotFoundError(repo_id) + pathlib_dir = pathlib_base / path_in_repo if path_in_repo else pathlib_base + for path in pathlib_dir.rglob("*") if recursive else pathlib_dir.glob("*"): + if path.is_file(): + yield huggingface_hub.hf_api.RepoFile( + path=path.relative_to(pathlib_base).as_posix(), + oid="oid", + size=path.stat().st_size, + ) + else: + yield huggingface_hub.hf_api.RepoFolder( + path=path.relative_to(pathlib_base).as_posix(), + oid="oid", + ) + + hf_file_system.HfFileSystem.clear_instance_cache() + monkeypatch.setattr(hf_file_system, "HfApi", MockedHfApi) + + +@pytest.fixture +def mock_hf_filesystem_open(pathlib_base, monkeypatch): + hf_file_system = pytest.importorskip( + "huggingface_hub.hf_file_system", reason="hf tests require huggingface_hub" + ) + + def mocked_open(fs, path, mode="rb", *args, **kwargs): + resolved_path = fs.resolve_path(path) + return (pathlib_base / resolved_path.path_in_repo).open(mode) + + monkeypatch.setattr(hf_file_system.HfFileSystem, "_open", mocked_open) + + +@pytest.fixture +def hf_fixture_with_readonly_mocked_hf_api( + hf_test_repo, mock_hf_api, mock_hf_filesystem_open +): + return "hf://" + hf_test_repo diff --git a/upath/tests/implementations/test_hf.py b/upath/tests/implementations/test_hf.py new file mode 100644 index 00000000..02dd2c6b --- /dev/null +++ 
b/upath/tests/implementations/test_hf.py @@ -0,0 +1,97 @@ +import pytest +from fsspec import get_filesystem_class + +from upath import UPath +from upath.implementations.cloud import HfPath + +from ..cases import BaseTests + +try: + get_filesystem_class("hf") +except ImportError: + pytestmark = pytest.mark.skip + + +def test_hfpath(): + path = UPath("hf://HuggingFaceTB/SmolLM2-135M") + assert isinstance(path, HfPath) + assert path.exists() + + +class TestUPathHttp(BaseTests): + @pytest.fixture(autouse=True, scope="function") + def path(self, hf_fixture_with_readonly_mocked_hf_api): + self.path = UPath(hf_fixture_with_readonly_mocked_hf_api) + + @pytest.mark.skip + def test_mkdir(self): + pass + + @pytest.mark.skip + def test_mkdir_exists_ok_false(self): + pass + + @pytest.mark.skip + def test_mkdir_exists_ok_true(self): + pass + + @pytest.mark.skip + def test_mkdir_parents_true_exists_ok_true(self): + pass + + @pytest.mark.skip + def test_mkdir_parents_true_exists_ok_false(self): + pass + + @pytest.mark.skip + def test_makedirs_exist_ok_true(self): + pass + + @pytest.mark.skip + def test_makedirs_exist_ok_false(self): + pass + + @pytest.mark.skip + def test_touch(self): + pass + + @pytest.mark.skip + def test_touch_unlink(self): + pass + + @pytest.mark.skip + def test_write_bytes(self, pathlib_base): + pass + + @pytest.mark.skip + def test_write_text(self, pathlib_base): + pass + + def test_fsspec_compat(self): + pass + + def test_rename(self): + pass + + def test_rename2(self): + pass + + def test_move_local(self, tmp_path): + pass + + def test_move_into_local(self, tmp_path): + pass + + def test_move_memory(self, clear_fsspec_memory_cache): + pass + + def test_move_into_memory(self, clear_fsspec_memory_cache): + pass + + @pytest.mark.skip(reason="HfPath does not support listing repositories") + def test_iterdir(self, local_testdir): + pass + + @pytest.mark.skip(reason="HfPath does not support listing repositories") + def test_iterdir2(self, local_testdir): + pass 
diff --git a/upath/types/storage_options.py b/upath/types/storage_options.py index 1dfd5aca..d784e9ab 100644 --- a/upath/types/storage_options.py +++ b/upath/types/storage_options.py @@ -19,6 +19,7 @@ "GCSStorageOptions", "S3StorageOptions", "AzureStorageOptions", + "HfStorageOptions", "DataStorageOptions", "GitHubStorageOptions", "HDFSStorageOptions", @@ -182,6 +183,21 @@ class AzureStorageOptions(_AbstractStorageOptions, total=False): assume_container_exists: bool | None # container existence assumptions +class HfStorageOptions(_AbstractStorageOptions, total=False): + """Storage options for Hugging Face filesystem""" + + # Authentication + token: str | None + + # Connection settings + endpoint: str | None + + # Performance settings + block_size: ( + int | None + ) # Block size for reading bytes; 0 = raw requests file-like objects + + class DataStorageOptions(_AbstractStorageOptions, total=False): """Storage options for Data URIs filesystem"""