Skip to content
This repository has been archived by the owner on Apr 6, 2023. It is now read-only.

Add code and tests + supporting files #1

Merged
merged 15 commits into from
Oct 11, 2022
19 changes: 19 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
.PHONY: quality style test

# Check that source code meets quality standards

quality:
black --check --line-length 119 --target-version py37 src tests
isort --check-only src tests
flake8 src tests
mariosasko marked this conversation as resolved.
Show resolved Hide resolved

# Format source code automatically

style:
black --line-length 119 --target-version py37 src tests
isort src tests

# Run tests for the library

test:
python -m pytest -sv ./tests/
88 changes: 86 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,87 @@
# HFfs
# `hffs`

HFfs builds on huggingface_hub to provide a convenient Python filesystem interface for huggingface.co repositories.
`hffs` builds on [`huggingface_hub`](https://github.com/huggingface/huggingface_hub) and [`fsspec`](https://github.com/fsspec/filesystem_spec) to provide a convenient Python filesystem interface to 🤗 Hub.

## Examples

Locate and read a file from a 🤗 Hub repo:

```python
>>> import hffs
>>> fs = hffs.HfFileSystem("my-username/my-dataset-repo", repo_type="dataset")
mariosasko marked this conversation as resolved.
Show resolved Hide resolved
>>> fs.ls("")
['.gitattributes', 'my-file.txt']
>>> with fs.open("my-file.txt", "r") as f:
... f.read()
'Hello, world'
```

Write a file to the repo:

```python
>>> with fs.open("my-file-new.txt", "w") as f:
... f.write("Hello, world1")
... f.write("Hello, world2")
>>> fs.exists("my-file-new.txt")
True
>>> fs.du("my-file-new.txt")
26
```

Instantiation via `fsspec`:

```python
>>> import fsspec

# Instantiate a `hffs.HfFileSystem` object
>>> fs = fsspec.filesystem("hf://my-username/my-dataset-repo", repo_type="dataset")
>>> fs.ls("")
['.gitattributes', 'my-file.txt']

# Instantiate a `hffs.HfFileSystem` object and write a file to it
>>> with fsspec.open("hf://my-username/my-dataset-repo:/my-file-new.txt", "w", repo_type="dataset") as f:
... f.write("Hello, world1")
... f.write("Hello, world2")
```

> **Note**: To be recognized as a `hffs` URL, the URL path passed to [`fsspec.open`](https://filesystem-spec.readthedocs.io/en/latest/api.html?highlight=open#fsspec.open) must adhere to the following scheme:
> ```
> hf://<repo_id>[@<revision>]:/<path/in/repo>
> ```

## Installation

```bash
pip install hffs
```

## Integrations

* [`pandas`](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#reading-writing-remote-files)/[`dask`](https://docs.dask.org/en/stable/how-to/connect-to-remote-data.html)

```python
>>> import pandas as pd

>>> # Read a remote CSV file into a dataframe
>>> df = pd.read_csv("hf://my-username/my-dataset-repo:/train.csv", storage_options={"repo_type": "dataset"})

>>> # Write a dataframe to a remote CSV file
>>> df.to_csv("hf://my-username/my-dataset-repo:/test.csv", storage_options={"repo_type": "dataset"})
```

* [`datasets`](https://huggingface.co/docs/datasets/filesystems#load-and-save-your-datasets-using-your-cloud-storage-filesystem)

```python
>>> import datasets

>>> # Cache a (large) dataset inside a repo
>>> cache_dir = "hf://my-username/my-dataset-repo"
>>> builder = datasets.load_dataset_builder("path/to/local/loading_script/loading_script.py", cache_dir=cache_dir, storage_options={"repo_type": "dataset"})
>>> builder.download_and_prepare(file_format="parquet")

>>> # Stream the dataset from the repo
>>> dset = datasets.load_dataset("my-username/my-dataset-repo", split="train")
>>> # Process the examples
>>> for ex in dset:
... ...
```
15 changes: 15 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[metadata]
license_file = LICENSE

[isort]
ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = True

[flake8]
ignore = E203, E501, W503
max-line-length = 119
7 changes: 3 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,10 @@

TESTS_REQUIRE = [
"pytest",
"pytest-datadir",
"pytest-xdist",
]


QUALITY_REQUIRE = ["black~=22.0", "flake8>=3.8.3", "isort>=5.0.0", "pyyaml>=5.3.1"]
QUALITY_REQUIRE = ["black~=22.0", "flake8>=3.8.3", "isort>=5.0.0"]


EXTRAS_REQUIRE = {
Expand All @@ -97,6 +95,7 @@
license="Apache 2.0",
package_dir={"": "src"},
packages=find_packages("src"),
package_data={"hffs": ["py.typed"]},
python_requires=">=3.7.0",
install_requires=REQUIRED_PKGS,
extras_require=EXTRAS_REQUIRE,
Expand All @@ -114,6 +113,6 @@
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
keywords="models datasets machine learning huggingface",
keywords="models datasets machine learning huggingface filesystem",
zip_safe=False, # Required for mypy to find the py.typed file
)
5 changes: 2 additions & 3 deletions src/hffs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from fsspec import AbstractFileSystem
# flake8: noqa

__version__ = "0.0.1.dev0"

class HfFileSystem(AbstractFileSystem):
pass
from .spec import HfFileSystem
Empty file added src/hffs/py.typed
Empty file.
216 changes: 216 additions & 0 deletions src/hffs/spec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
import os
import tempfile
from functools import partial
from pathlib import PurePosixPath
from typing import Optional

import fsspec
from fsspec.utils import stringify_path
from huggingface_hub import HfFolder, delete_file, hf_hub_url, upload_file
from huggingface_hub.constants import REPO_TYPES
from huggingface_hub.hf_api import repo_info


class HfFileSystem(fsspec.AbstractFileSystem):
"""
Access a remote Hugging Face Hub repository as if were a local file system.

Args:
repo_id (`str`):
The remote repository to access as if were a local file system,
for example: `"username/custom_transformers"`
token (`str`, *optional*):
Authentication token, obtained with `HfApi.login` method. Will
default to the stored token.
repo_type (`str`, *optional*):
Set to `"dataset"` or `"space"` if the remote repositry is a dataset or
space repositroy, `None` or `"model"` if it is a model repository. Default is
`None`.
revision (`str`, *optional*):
An optional Git revision id which can be a branch name, a tag, or a
commit hash. Defaults to the head of the `"main"` branch.

Direct usage:

```python
>>> import hffs

>>> fs = hffs.HfFileSystem("username/my-dataset", repo_type="dataset")

>>> # Read a remote file
>>> with fs.open("remote/file/in/repo.bin") as f:
... data = f.read()

>>> # Write a remote file
>>> with fs.open("remote/file/in/repo.bin", "wb") as f:
... f.write(data)
```

Usage via [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/)):

```python
>>> import fsspec

>>> # Read a remote file
>>> with fsspec.open("hf://username/my-dataset:/remote/file/in/repo.bin", repo_type="dataset") as f:
... data = f.read()

>>> # Write a remote file
>>> with fsspec.open("hf://username/my-dataset:/remote/file/in/repo.bin", "wb", repo_type="dataset") as f:
... f.write(data)
```
"""

root_marker = ""
protocol = "hf"

def __init__(
self,
repo_id: str,
token: Optional[str] = None,
repo_type: Optional[str] = None,
revision: Optional[str] = None,
**kwargs,
):
super().__init__(self, **kwargs)

if repo_type not in REPO_TYPES:
raise ValueError(f"Invalid repo type, must be one of {REPO_TYPES}")

self.repo_id = repo_id
self.token = token if token is not None else HfFolder.get_token()
self.repo_type = repo_type
self.revision = revision

def _dircache_from_repo_info(self):
repo_info_obj = repo_info(self.repo_id, revision=self.revision, repo_type=self.repo_type, token=self.token)
for sibling in repo_info_obj.siblings:
child = {
"name": sibling.rfilename,
"size": None, # waiting for #951
"type": "file",
}
for parent in list(PurePosixPath(sibling.rfilename).parents)[:-1] + [self.root_marker]:
self.dircache.setdefault(str(parent), []).append(child)
child = {"name": str(parent), "size": None, "type": "directory"}

def invalidate_cache(self, path=None):
self.dircache.clear()

@classmethod
def _strip_protocol(cls, path):
if isinstance(path, list):
return [cls._strip_protocol(stringify_path(p)) for p in path]
if path.startswith(f"{cls.protocol}://"):
path = path[len(f"{cls.protocol}://") :]
if ":/" in path:
_, path = path.split(":/", 1)
path = path.lstrip("/")
else:
path = cls.root_marker
return path

def unstrip_protocol(self, path):
return super().unstrip_protocol(
f"{self.repo_id}{'@' + self.revision if self.revision is not None else ''}:/{path}"
)

@staticmethod
def _get_kwargs_from_urls(path):
protocol = HfFileSystem.protocol
if path.startswith(f"{protocol}://"):
path = path[len(f"{protocol}://") :]
out = {}
if ":/" in path:
out["repo_id"], _ = path.split(":/", 1)
else:
out["repo_id"] = path
if "@" in out["repo_id"]:
out["repo_id"], out["revision"] = out["repo_id"].split("@", 1)
return out

def _open(
self,
path: str,
mode: str = "rb",
**kwargs,
):
if mode == "rb":
url = hf_hub_url(
self.repo_id,
path,
repo_type=self.repo_type,
revision=self.revision,
)
return fsspec.open(
url,
mode=mode,
headers={"authorization": f"Bearer {self.token}"},
).open()
else:
return TempFileUploader(self, path, mode=mode)

def _rm(self, path):
path = self._strip_protocol(path)
delete_file(
path_in_repo=path,
repo_id=self.repo_id,
token=self.token,
repo_type=self.repo_type,
revision=self.revision,
commit_message=f"Delete {path} with hffs",
)
self.invalidate_cache()

def rm(self, path, recursive=False, maxdepth=None):
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
paths = [path for path in paths if not self.isdir(path)]
for path in paths:
self.rm_file(path)
mariosasko marked this conversation as resolved.
Show resolved Hide resolved

def ls(self, path, detail=False, **kwargs):
path = self._strip_protocol(path)
if not self.dircache:
self._dircache_from_repo_info()
out = self._ls_from_cache(path)
if out is None:
raise FileNotFoundError(path)
if detail:
return out
return [o["name"] for o in out]

def cp_file(self, path1, path2, **kwargs):
mariosasko marked this conversation as resolved.
Show resolved Hide resolved
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)

with self.open(path1, "rb") as f1:
with self.open(path2, "wb") as f2:
for block in iter(partial(f1.read, self.blocksize), b""):
f2.write(block)


class TempFileUploader(fsspec.spec.AbstractBufferedFile):
def _initiate_upload(self):
self.temp_file = tempfile.NamedTemporaryFile(delete=False)
if "a" in self.mode:
with self.fs.open(self.path, "rb") as f:
for block in iter(partial(f.read, self.blocksize), b""):
self.temp_file.write(block)

def _upload_chunk(self, final=False):
self.buffer.seek(0)
block = self.buffer.read()
self.temp_file.write(block)
if final:
self.temp_file.close()
upload_file(
path_or_fileobj=self.temp_file.name,
path_in_repo=self.path,
repo_id=self.fs.repo_id,
token=self.fs.token,
repo_type=self.fs.repo_type,
revision=self.fs.revision,
commit_message=f"Upload {self.path} with hffs",
)
os.remove(self.temp_file.name)
self.fs.invalidate_cache()