diff --git a/s3fs/core.py b/s3fs/core.py
index d8848db3..6670c017 100644
--- a/s3fs/core.py
+++ b/s3fs/core.py
@@ -764,6 +764,7 @@ async def _lsdir(
                         dirs.append(c)
                     else:
                         files.append(c)
                 files += dirs
+                files.sort(key=lambda f: f["name"])
             except ClientError as e:
                 raise translate_boto_error(e)
@@ -887,38 +888,49 @@ async def _find(
         sdirs = set()
         thisdircache = {}
         for o in out:
-            par = self._parent(o["name"])
-            if par not in sdirs:
-                sdirs.add(par)
-                d = False
-                if len(path) <= len(par):
-                    d = {
-                        "Key": self.split_path(par)[1],
-                        "Size": 0,
-                        "name": par,
-                        "StorageClass": "DIRECTORY",
-                        "type": "directory",
-                        "size": 0,
-                    }
-                    dirs.append(d)
-                thisdircache[par] = []
-                ppar = self._parent(par)
-                if ppar in thisdircache:
-                    if d and d not in thisdircache[ppar]:
-                        thisdircache[ppar].append(d)
-            if par in sdirs:
-                thisdircache[par].append(o)
+            # not self._parent, because that strips "/" from placeholders
+            par = o["name"].rsplit("/", maxsplit=1)[0]
+            o["Key"] = o["name"]
+            name = o["name"]
+            while "/" in par:
+                if par not in sdirs:
+                    sdirs.add(par)
+                    d = False
+                    if len(path) <= len(par):
+                        d = {
+                            "Key": par,
+                            "Size": 0,
+                            "name": par,
+                            "StorageClass": "DIRECTORY",
+                            "type": "directory",
+                            "size": 0,
+                        }
+                        dirs.append(d)
+                    thisdircache[par] = []
+                    ppar = self._parent(par)
+                    if ppar in thisdircache:
+                        if d and d not in thisdircache[ppar]:
+                            thisdircache[ppar].append(d)
+                if par in sdirs and not name.endswith("/"):
+                    # exclude placeholders, they do not belong in the directory listing
+                    thisdircache[par].append(o)
+                par, name, o = par.rsplit("/", maxsplit=1)[0], par, d
+                if par in thisdircache or par in self.dircache:
+                    break
 
         # Explicitly add directories to their parents in the dircache
         for d in dirs:
             par = self._parent(d["name"])
-            if par in thisdircache:
+            # extra condition here (in any()) to deal with directory-marking files
+            if par in thisdircache and not any(
+                _["name"] == d["name"] for _ in thisdircache[par]
+            ):
                 thisdircache[par].append(d)
 
         if not prefix:
             for k, v in thisdircache.items():
                 if k not in self.dircache and len(k) >= len(path):
-                    self.dircache[k] = v
+                    self.dircache[k] = sorted(v, key=lambda x: x["name"])
         if withdirs:
             out = sorted(out + dirs, key=lambda x: x["name"])
         if detail:
diff --git a/s3fs/tests/test_s3fs.py b/s3fs/tests/test_s3fs.py
index 45609eb6..a59b4c48 100644
--- a/s3fs/tests/test_s3fs.py
+++ b/s3fs/tests/test_s3fs.py
@@ -2365,7 +2365,7 @@ def test_get_file_info_with_selector(s3):
             pass
 
         infos = fs.find(base_dir, maxdepth=None, withdirs=True, detail=True)
-        assert len(infos) == 5  # includes base_dir directory
+        assert len(infos) == 4  # includes base_dir directory
 
         for info in infos.values():
             if info["name"].endswith(file_a):
@@ -2993,3 +2993,56 @@ def test_bucket_info(s3):
     assert "VersionId" in info
     assert info["type"] == "directory"
     assert info["name"] == test_bucket_name
+
+
+def test_find_ls_fail(s3):
+    # because of https://github.com/fsspec/s3fs/pull/989
+    client = get_boto3_client()
+    files = {
+        f"{test_bucket_name}/find/a/a": b"data",
+        f"{test_bucket_name}/find/a/b": b"data",
+        f"{test_bucket_name}/find/a": b"",  # duplicate of dir, without "/"
+        f"{test_bucket_name}/find/b": b"",  # empty file without "/" and no children
+        f"{test_bucket_name}/find/c/c": b"data",  # directory with no placeholder
+        f"{test_bucket_name}/find/d/d": b"data",  # dir will acquire placeholder with "/"
+    }
+    client.put_object(Bucket=test_bucket_name, Key="find/d/", Body=b"")
+    client.put_object(
+        Bucket=test_bucket_name, Key="find/e/", Body=b""
+    )  # placeholder only
+    s3.pipe(files)
+
+    out0 = s3.ls(f"{test_bucket_name}/find", detail=True)
s3.ls(f"{test_bucket_name}/find", detail=True) + s3.find(test_bucket_name, detail=False) + out = s3.ls(f"{test_bucket_name}/find", detail=True) + assert out == out0 + + s3.invalidate_cache() + s3.find(f"{test_bucket_name}/find", detail=False) + out = s3.ls(f"{test_bucket_name}/find", detail=True) + assert out == out0 + + +def test_find_missing_ls(s3): + # https://github.com/fsspec/s3fs/issues/988#issuecomment-3436727753 + BUCKET = test_bucket_name + BASE_PREFIX = "disappearing-folders/" + BASE = f"s3://{BUCKET}/{BASE_PREFIX}" + + s3_with_cache = S3FileSystem( + anon=False, + use_listings_cache=True, + client_kwargs={"endpoint_url": endpoint_uri}, + ) + s3_no_cache = S3FileSystem( + anon=False, + use_listings_cache=False, + client_kwargs={"endpoint_url": endpoint_uri}, + ) + + s3_with_cache.pipe({f"{BASE}folder/foo/1.txt": b"", f"{BASE}bar.txt": b""}) + s3_with_cache.find(BASE) + listed_cached = s3_with_cache.ls(BASE, detail=False) + listed_no_cache = s3_no_cache.ls(BASE, detail=False) + + assert set(listed_cached) == set(listed_no_cache)