Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 35 additions & 23 deletions s3fs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,7 @@ async def _lsdir(
else:
files.append(c)
files += dirs
files.sort(key=lambda f: f["name"])
except ClientError as e:
raise translate_boto_error(e)

Expand Down Expand Up @@ -887,38 +888,49 @@ async def _find(
sdirs = set()
thisdircache = {}
for o in out:
par = self._parent(o["name"])
if par not in sdirs:
sdirs.add(par)
d = False
if len(path) <= len(par):
d = {
"Key": self.split_path(par)[1],
"Size": 0,
"name": par,
"StorageClass": "DIRECTORY",
"type": "directory",
"size": 0,
}
dirs.append(d)
thisdircache[par] = []
ppar = self._parent(par)
if ppar in thisdircache:
if d and d not in thisdircache[ppar]:
thisdircache[ppar].append(d)
if par in sdirs:
thisdircache[par].append(o)
# not self._parent, because that strips "/" from placeholders
par = o["name"].rsplit("/", maxsplit=1)[0]
o["Key"] = o["name"]
name = o["name"]
while "/" in par:
if par not in sdirs:
sdirs.add(par)
d = False
if len(path) <= len(par):
d = {
"Key": par,
"Size": 0,
"name": par,
"StorageClass": "DIRECTORY",
"type": "directory",
"size": 0,
}
dirs.append(d)
thisdircache[par] = []
ppar = self._parent(par)
if ppar in thisdircache:
if d and d not in thisdircache[ppar]:
thisdircache[ppar].append(d)
if par in sdirs and not name.endswith("/"):
                # exclude placeholders; they do not belong in the directory listing
thisdircache[par].append(o)
par, name, o = par.rsplit("/", maxsplit=1)[0], par, d
if par in thisdircache or par in self.dircache:
break

# Explicitly add directories to their parents in the dircache
for d in dirs:
par = self._parent(d["name"])
if par in thisdircache:
# extra condition here (in any()) to deal with directory-marking files
if par in thisdircache and not any(
_["name"] == d["name"] for _ in thisdircache[par]
):
thisdircache[par].append(d)

if not prefix:
for k, v in thisdircache.items():
if k not in self.dircache and len(k) >= len(path):
self.dircache[k] = v
self.dircache[k] = sorted(v, key=lambda x: x["name"])
if withdirs:
out = sorted(out + dirs, key=lambda x: x["name"])
if detail:
Expand Down
55 changes: 54 additions & 1 deletion s3fs/tests/test_s3fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2365,7 +2365,7 @@ def test_get_file_info_with_selector(s3):
pass

infos = fs.find(base_dir, maxdepth=None, withdirs=True, detail=True)
assert len(infos) == 5 # includes base_dir directory
assert len(infos) == 4 # includes base_dir directory

for info in infos.values():
if info["name"].endswith(file_a):
Expand Down Expand Up @@ -2993,3 +2993,56 @@ def test_bucket_info(s3):
assert "VersionId" in info
assert info["type"] == "directory"
assert info["name"] == test_bucket_name


def test_find_ls_fail(s3):
    """Regression test: ``find()`` must not corrupt the dircache used by ``ls()``.

    Because of https://github.com/fsspec/s3fs/pull/989
    """
    boto = get_boto3_client()
    payload = {
        f"{test_bucket_name}/find/a/a": b"data",
        f"{test_bucket_name}/find/a/b": b"data",
        # duplicate of dir, without "/"
        f"{test_bucket_name}/find/a": b"",
        # empty file without "/" and no children
        f"{test_bucket_name}/find/b": b"",
        # directory with no placeholder
        f"{test_bucket_name}/find/c/c": b"data",
        # dir will acquire placeholder with "/"
        f"{test_bucket_name}/find/d/d": b"data",
    }
    # directory-placeholder keys (trailing "/") written directly via boto3,
    # since s3.pipe would not create such keys
    boto.put_object(Bucket=test_bucket_name, Key="find/d/", Body=b"")
    boto.put_object(Bucket=test_bucket_name, Key="find/e/", Body=b"")  # placeholder only
    s3.pipe(payload)

    expected = s3.ls(f"{test_bucket_name}/find", detail=True)

    # find() over the whole bucket must leave the cached listing unchanged
    s3.find(test_bucket_name, detail=False)
    assert s3.ls(f"{test_bucket_name}/find", detail=True) == expected

    # same check after clearing the cache and find()-ing the prefix itself
    s3.invalidate_cache()
    s3.find(f"{test_bucket_name}/find", detail=False)
    assert s3.ls(f"{test_bucket_name}/find", detail=True) == expected


def test_find_missing_ls(s3):
    """A cache warmed by ``find()`` must yield the same ``ls()`` as no cache.

    See https://github.com/fsspec/s3fs/issues/988#issuecomment-3436727753
    """
    BUCKET = test_bucket_name
    BASE_PREFIX = "disappearing-folders/"
    BASE = f"s3://{BUCKET}/{BASE_PREFIX}"

    # two independent filesystem instances: one caching listings, one not
    fs_cached = S3FileSystem(
        anon=False,
        use_listings_cache=True,
        client_kwargs={"endpoint_url": endpoint_uri},
    )
    fs_uncached = S3FileSystem(
        anon=False,
        use_listings_cache=False,
        client_kwargs={"endpoint_url": endpoint_uri},
    )

    fs_cached.pipe({f"{BASE}folder/foo/1.txt": b"", f"{BASE}bar.txt": b""})
    fs_cached.find(BASE)  # warms the listings cache as a side effect

    from_cache = fs_cached.ls(BASE, detail=False)
    fresh = fs_uncached.ls(BASE, detail=False)

    assert set(from_cache) == set(fresh)
Loading