Skip to content
This repository has been archived by the owner on Apr 6, 2023. It is now read-only.

Add code and tests + supporting files #1

Merged
merged 15 commits into from
Oct 11, 2022
19 changes: 19 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
.PHONY: quality style test

# Check that source code meets quality standards

quality:
black --check --line-length 119 --target-version py37 src tests
isort --check-only src tests
flake8 src tests
mariosasko marked this conversation as resolved.
Show resolved Hide resolved

# Format source code automatically

style:
black --line-length 119 --target-version py37 src tests
isort src tests

# Run tests for the library

test:
python -m pytest -sv ./tests/
88 changes: 86 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,87 @@
# HFfs
# `hffs`

HFfs builds on huggingface_hub to provide a convenient Python filesystem interface for huggingface.co repositories.
`hffs` builds on [`huggingface_hub`](https://github.com/huggingface/huggingface_hub) and [`fsspec`](https://github.com/fsspec/filesystem_spec) to provide a convenient Python filesystem interface to 🤗 Hub.

## Examples

Locate and read a file from a 🤗 Hub repo:

```python
>>> import hffs
>>> fs = hffs.HfFileSystem("my-username/my-dataset-repo", repo_type="dataset")
mariosasko marked this conversation as resolved.
Show resolved Hide resolved
>>> fs.ls("")
['.gitattributes', 'my-file.txt']
>>> with fs.open("my-file.txt", "r") as f:
... f.read()
'Hello, world'
```

Write a file to the repo:

```python
>>> with fs.open("my-file-new.txt", "w") as f:
... f.write("Hello, world1")
... f.write("Hello, world2")
>>> fs.exists("my-file-new.txt")
True
>>> fs.du("my-file-new.txt")
26
```

Instantiation via `fsspec`:

```python
>>> import fsspec

# Instantiate a `hffs.HfFileSystem` object
>>> fs = fsspec.filesystem("hf://my-username/my-dataset-repo", repo_type="dataset")
>>> fs.ls("")
['.gitattributes', 'my-file.txt']

# Instantiate a `hffs.HfFileSystem` object and write a file to it
>>> with fsspec.open("hf://my-username/my-dataset-repo:/my-file-new.txt", "w", repo_type="dataset") as f:
... f.write("Hello, world1")
... f.write("Hello, world2")
```

> **Note**: To be recognized as a `hffs` URL, the URL path passed to [`fsspec.open`](https://filesystem-spec.readthedocs.io/en/latest/api.html?highlight=open#fsspec.open) must adhere to the following scheme:
> ```
> hf://<repo_id>[@<revision>]:/<path/in/repo>
> ```

## Installation

```bash
pip install hffs
```

## Integrations

* [`pandas`](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#reading-writing-remote-files)/[`dask`](https://docs.dask.org/en/stable/how-to/connect-to-remote-data.html)

```python
>>> import pandas as pd

>>> # Read a remote CSV file into a dataframe
>>> df = pd.read_csv("hf://my-username/my-dataset-repo:/train.csv", storage_options={"repo_type": "dataset"})

>>> # Write a dataframe to a remote CSV file
>>> df.to_csv("hf://my-username/my-dataset-repo:/test.csv", storage_options={"repo_type": "dataset"})
```

* [`datasets`](https://huggingface.co/docs/datasets/filesystems#load-and-save-your-datasets-using-your-cloud-storage-filesystem)

```python
>>> import datasets

>>> # Cache a (large) dataset inside a repo
>>> cache_dir = "hf://my-username/my-dataset-repo"
>>> builder = datasets.load_dataset_builder("path/to/local/loading_script/loading_script.py", cache_dir=cache_dir, storage_options={"repo_type": "dataset"})
>>> builder.download_and_prepare(file_format="parquet")

>>> # Stream the dataset from the repo
>>> dset = datasets.load_dataset("my-username/my-dataset-repo", split="train")
>>> # Process the examples
>>> for ex in dset:
... ...
```
15 changes: 15 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[metadata]
license_file = LICENSE

[isort]
ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = True

[flake8]
ignore = E203, E501, W503
max-line-length = 119
7 changes: 3 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,10 @@

TESTS_REQUIRE = [
"pytest",
"pytest-datadir",
"pytest-xdist",
]


QUALITY_REQUIRE = ["black~=22.0", "flake8>=3.8.3", "isort>=5.0.0", "pyyaml>=5.3.1"]
QUALITY_REQUIRE = ["black~=22.0", "flake8>=3.8.3", "isort>=5.0.0"]


EXTRAS_REQUIRE = {
Expand All @@ -97,6 +95,7 @@
license="Apache 2.0",
package_dir={"": "src"},
packages=find_packages("src"),
package_data={"hffs": ["py.typed"]},
python_requires=">=3.7.0",
install_requires=REQUIRED_PKGS,
extras_require=EXTRAS_REQUIRE,
Expand All @@ -114,6 +113,6 @@
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
keywords="models datasets machine learning huggingface",
keywords="models datasets machine learning huggingface filesystem",
zip_safe=False, # Required for mypy to find the py.typed file
)
5 changes: 2 additions & 3 deletions src/hffs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from fsspec import AbstractFileSystem
# flake8: noqa

__version__ = "0.0.1.dev0"

class HfFileSystem(AbstractFileSystem):
pass
from .spec import HfFileSystem
Empty file added src/hffs/py.typed
Empty file.
216 changes: 216 additions & 0 deletions src/hffs/spec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
import os
import tempfile
from functools import partial
from pathlib import PurePosixPath
from typing import Optional

import fsspec
from fsspec.utils import stringify_path
from huggingface_hub import HfFolder, delete_file, hf_hub_url, upload_file
from huggingface_hub.constants import REPO_TYPES
from huggingface_hub.hf_api import repo_info


class HfFileSystem(fsspec.AbstractFileSystem):
"""
Access a remote Hugging Face Hub repository as if were a local file system.

Args:
repo_id (`str`):
The remote repository to access as if were a local file system,
for example: `"username/custom_transformers"`
token (`str`, *optional*):
Authentication token, obtained with `HfApi.login` method. Will
default to the stored token.
repo_type (`str`, *optional*):
Set to `"dataset"` or `"space"` if the remote repositry is a dataset or
space repositroy, `None` or `"model"` if it is a model repository. Default is
`None`.
revision (`str`, *optional*):
An optional Git revision id which can be a branch name, a tag, or a
commit hash. Defaults to the head of the `"main"` branch.

Direct usage:

```python
>>> import hffs

>>> fs = hffs.HfFileSystem("username/my-dataset", repo_type="dataset")

>>> # Read a remote file
>>> with fs.open("remote/file/in/repo.bin") as f:
... data = f.read()

>>> # Write a remote file
>>> with fs.open("remote/file/in/repo.bin", "wb") as f:
... f.write(data)
```

Usage via [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/)):

```python
>>> import fsspec

>>> # Read a remote file
>>> with fsspec.open("hf://username/my-dataset:/remote/file/in/repo.bin", repo_type="dataset") as f:
... data = f.read()

>>> # Write a remote file
>>> with fsspec.open("hf://username/my-dataset:/remote/file/in/repo.bin", "wb", repo_type="dataset") as f:
... f.write(data)
```
"""

root_marker = ""
protocol = "hf"

def __init__(
self,
repo_id: str,
token: Optional[str] = None,
repo_type: Optional[str] = None,
revision: Optional[str] = None,
**kwargs,
):
super().__init__(self, **kwargs)

if repo_type not in REPO_TYPES:
raise ValueError(f"Invalid repo type, must be one of {REPO_TYPES}")

self.repo_id = repo_id
self.token = token if token is not None else HfFolder.get_token()
self.repo_type = repo_type
self.revision = revision

def _dircache_from_repo_info(self):
repo_info_obj = repo_info(self.repo_id, revision=self.revision, repo_type=self.repo_type, token=self.token)
for sibling in repo_info_obj.siblings:
child = {
"name": sibling.rfilename,
"size": None, # waiting for #951
"type": "file",
}
for parent in list(PurePosixPath(sibling.rfilename).parents)[:-1] + [self.root_marker]:
self.dircache.setdefault(str(parent), []).append(child)
child = {"name": str(parent), "size": None, "type": "directory"}

def invalidate_cache(self, path=None):
self.dircache.clear()

@classmethod
def _strip_protocol(cls, path):
if isinstance(path, list):
return [cls._strip_protocol(stringify_path(p)) for p in path]
if path.startswith(f"{cls.protocol}://"):
path = path[len(f"{cls.protocol}://") :]
if ":/" in path:
_, path = path.split(":/", 1)
path = path.lstrip("/")
else:
path = cls.root_marker
return path

def unstrip_protocol(self, path):
return super().unstrip_protocol(
f"{self.repo_id}{'@' + self.revision if self.revision is not None else ''}:/{path}"
)

@staticmethod
def _get_kwargs_from_urls(path):
protocol = HfFileSystem.protocol
if path.startswith(f"{protocol}://"):
path = path[len(f"{protocol}://") :]
out = {}
if ":/" in path:
out["repo_id"], _ = path.split(":/", 1)
else:
out["repo_id"] = path
if "@" in out["repo_id"]:
out["repo_id"], out["revision"] = out["repo_id"].split("@", 1)
return out

def _open(
self,
path: str,
mode: str = "rb",
**kwargs,
):
if mode == "rb":
url = hf_hub_url(
self.repo_id,
path,
repo_type=self.repo_type,
revision=self.revision,
)
return fsspec.open(
url,
mode=mode,
headers={"authorization": f"Bearer {self.token}"},
).open()
else:
return TempFileUploader(self, path, mode=mode)

def _rm(self, path):
path = self._strip_protocol(path)
delete_file(
path_in_repo=path,
repo_id=self.repo_id,
token=self.token,
repo_type=self.repo_type,
revision=self.revision,
commit_message=f"Delete {path} with hffs",
)
self.invalidate_cache()

def rm(self, path, recursive=False, maxdepth=None):
paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
paths = [path for path in paths if not self.isdir(path)]
for path in paths:
self.rm_file(path)
mariosasko marked this conversation as resolved.
Show resolved Hide resolved

def ls(self, path, detail=False, **kwargs):
path = self._strip_protocol(path)
if not self.dircache:
self._dircache_from_repo_info()
out = self._ls_from_cache(path)
if out is None:
raise FileNotFoundError(path)
if detail:
return out
return [o["name"] for o in out]

def cp_file(self, path1, path2, **kwargs):
mariosasko marked this conversation as resolved.
Show resolved Hide resolved
path1 = self._strip_protocol(path1)
path2 = self._strip_protocol(path2)

with self.open(path1, "rb") as f1:
with self.open(path2, "wb") as f2:
for block in iter(partial(f1.read, self.blocksize), b""):
f2.write(block)


class TempFileUploader(fsspec.spec.AbstractBufferedFile):
def _initiate_upload(self):
self.temp_file = tempfile.NamedTemporaryFile(delete=False)
if "a" in self.mode:
with self.fs.open(self.path, "rb") as f:
for block in iter(partial(f.read, self.blocksize), b""):
self.temp_file.write(block)

def _upload_chunk(self, final=False):
self.buffer.seek(0)
block = self.buffer.read()
self.temp_file.write(block)
if final:
self.temp_file.close()
upload_file(
path_or_fileobj=self.temp_file.name,
path_in_repo=self.path,
repo_id=self.fs.repo_id,
token=self.fs.token,
repo_type=self.fs.repo_type,
revision=self.fs.revision,
commit_message=f"Upload {self.path} with hffs",
)
os.remove(self.temp_file.name)
self.fs.invalidate_cache()