This repository has been archived by the owner on Apr 6, 2023. It is now read-only.

Commit
Add code and tests + supporting files (#1)
* Finish implementation (for now)

* Add basic readme

* Minor changes in setup.py

* Add setup.cfg

* Add tests

* Add Makefile

* Minor README fix

* Code improvements

* Better tests

* Comment out some code

* Add zarr example to README

* Lucain's comments

* Quentin's comments

* Update src/hffs/fs.py

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>

* Update pyproject.toml

Add newline to pyproject.toml

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
mariosasko and lhoestq committed Oct 11, 2022
1 parent d00546d commit 0c61ed2
Showing 9 changed files with 627 additions and 14 deletions.
21 changes: 21 additions & 0 deletions Makefile
@@ -0,0 +1,21 @@
.PHONY: quality style test

check_dirs := tests src setup.py

# Check that source code meets quality standards

quality:
	black --check $(check_dirs)
	isort --check-only $(check_dirs)
	flake8 $(check_dirs)

# Format source code automatically

style:
	black $(check_dirs)
	isort $(check_dirs)

# Run tests for the library

test:
	python -m pytest -sv ./tests/
107 changes: 105 additions & 2 deletions README.md
@@ -1,3 +1,106 @@
-# HFfs
+# `hffs`

-HFfs builds on huggingface_hub to provide a convenient Python filesystem interface for huggingface.co repositories.
+`hffs` builds on [`huggingface_hub`](https://github.com/huggingface/huggingface_hub) and [`fsspec`](https://github.com/fsspec/filesystem_spec) to provide a convenient Python filesystem interface to the 🤗 Hub.

## Examples

Locate and read a file from a 🤗 Hub repo:

```python
>>> import hffs
>>> fs = hffs.HfFileSystem("my-username/my-dataset-repo", repo_type="dataset")
>>> fs.ls("")
['.gitattributes', 'my-file.txt']
>>> with fs.open("my-file.txt", "r") as f:
... f.read()
'Hello, world'
```

Write a file to the repo:

```python
>>> with fs.open("my-file-new.txt", "w") as f:
... f.write("Hello, world1")
... f.write("Hello, world2")
>>> fs.exists("my-file-new.txt")
True
>>> fs.du("my-file-new.txt")
26
```
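The two writes above land in the same file, which is why `fs.du` reports 26 bytes. A quick stdlib sketch of the same arithmetic, using `io.StringIO` as a stand-in for the remote file:

```python
import io

# Stand-in for the remote file: sequential writes are concatenated
buf = io.StringIO()
buf.write("Hello, world1")  # 13 characters
buf.write("Hello, world2")  # 13 more characters

print(len(buf.getvalue()))  # 26, matching fs.du("my-file-new.txt")
```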

Instantiation via `fsspec`:

```python
>>> import fsspec

>>> # Instantiate a `hffs.HfFileSystem` object
>>> fs = fsspec.filesystem("hf://my-username/my-model-repo", repo_type="model")
>>> fs.ls("")
['.gitattributes', 'config.json', 'pytorch_model.bin']

>>> # Instantiate a `hffs.HfFileSystem` object and write a file to it
>>> with fsspec.open("hf://my-username/my-dataset-repo:/my-file-new.txt", "w", repo_type="dataset") as f:
... f.write("Hello, world1")
... f.write("Hello, world2")
```

> **Note**: To be recognized as a `hffs` URL, the URL path passed to [`fsspec.open`](https://filesystem-spec.readthedocs.io/en/latest/api.html?highlight=open#fsspec.open) must adhere to the following scheme:
> ```
> hf://<repo_id>[@<revision>]:/<path/in/repo>
> ```
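To illustrate the scheme, here is a minimal sketch that splits such a URL into its components. The regex and the `parse_hf_url` helper below are hypothetical, for illustration only, and not part of the `hffs` API:

```python
import re

# Hypothetical pattern for hf://<repo_id>[@<revision>]:/<path/in/repo>
HF_URL_PATTERN = re.compile(r"^hf://(?P<repo_id>[^@:]+)(?:@(?P<revision>[^:]+))?:/(?P<path>.*)$")

def parse_hf_url(url):
    """Split a hf:// URL into (repo_id, revision, path_in_repo); revision may be None."""
    match = HF_URL_PATTERN.match(url)
    if match is None:
        raise ValueError(f"Not a valid hf:// URL: {url!r}")
    return match.group("repo_id"), match.group("revision"), match.group("path")

print(parse_hf_url("hf://my-username/my-dataset-repo:/train.csv"))
# ('my-username/my-dataset-repo', None, 'train.csv')
print(parse_hf_url("hf://my-username/my-dataset-repo@main:/train.csv"))
# ('my-username/my-dataset-repo', 'main', 'train.csv')
```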
## Installation

```bash
pip install hffs
```

## Integrations

* [`pandas`](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#reading-writing-remote-files)/[`dask`](https://docs.dask.org/en/stable/how-to/connect-to-remote-data.html)

```python
>>> import pandas as pd

>>> # Read a remote CSV file into a dataframe
>>> df = pd.read_csv("hf://my-username/my-dataset-repo:/train.csv", storage_options={"repo_type": "dataset"})

>>> # Write a dataframe to a remote CSV file
>>> df.to_csv("hf://my-username/my-dataset-repo:/test.csv", storage_options={"repo_type": "dataset"})
```

* [`datasets`](https://huggingface.co/docs/datasets/filesystems#load-and-save-your-datasets-using-your-cloud-storage-filesystem)

```python
>>> import datasets

>>> # Export a (large) dataset to a repo
>>> cache_dir = "hf://my-username/my-dataset-repo"
>>> builder = datasets.load_dataset_builder("path/to/local/loading_script/loading_script.py", cache_dir=cache_dir, storage_options={"repo_type": "dataset"})
>>> builder.download_and_prepare(file_format="parquet")

>>> # Stream the dataset from the repo
>>> dset = datasets.load_dataset("my-username/my-dataset-repo", split="train")
>>> # Process the examples
>>> for ex in dset:
... ...
```

* [`zarr`](https://zarr.readthedocs.io/en/stable/tutorial.html#io-with-fsspec)

```python
>>> import numpy as np
>>> import zarr

>>> embeddings = np.random.randn(50000, 1000).astype("float32")

>>> # Write an array to a repo acting as a remote zarr store
>>> with zarr.open_group("hf://my-username/my-model-repo:/array-store", mode="w", storage_options={"repo_type": "model"}) as root:
... foo = root.create_group("embeddings")
... foobar = foo.zeros('experiment_0', shape=(50000, 1000), chunks=(10000, 1000), dtype='f4')
... foobar[:] = embeddings

>>> # Read from a remote zarr store
>>> with zarr.open_group("hf://my-username/my-model-repo:/array-store", mode="r", storage_options={"repo_type": "model"}) as root:
... first_row = root["embeddings/experiment_0"][0]
```
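The chunk layout chosen above splits the 50,000 × 1,000 float32 array into five row-wise chunks of about 40 MB each. A small back-of-the-envelope check (plain stdlib, independent of `zarr`):

```python
import math

shape, chunks, itemsize = (50_000, 1_000), (10_000, 1_000), 4  # float32 is 4 bytes

# Number of chunks along each axis, multiplied together
n_chunks = math.prod(math.ceil(s / c) for s, c in zip(shape, chunks))
total_bytes = math.prod(shape) * itemsize

print(n_chunks)                 # 5 chunks
print(total_bytes // n_chunks)  # 40_000_000 bytes (~40 MB) per chunk
```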
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -0,0 +1,4 @@
[tool.black]
line-length = 119
target_version = ['py37', 'py38', 'py39', 'py310']
preview = true
15 changes: 15 additions & 0 deletions setup.cfg
@@ -0,0 +1,15 @@
[metadata]
license_file = LICENSE

[isort]
ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = True

[flake8]
ignore = E203, E501, W503
max-line-length = 119
16 changes: 7 additions & 9 deletions setup.py
@@ -1,5 +1,5 @@
# Lint as: python3
-""" HuggingFace/HFfs is an interface to huggingface.co repositories.
+"""HuggingFace Filesystem is an interface to huggingface.co repositories.
Note:
@@ -60,22 +60,19 @@

REQUIRED_PKGS = [
-    # minimum 2021.11.1 so that BlockSizeError is fixed: see https://github.com/fsspec/filesystem_spec/pull/830
-    "fsspec[http]>=2021.11.1",
-    # for data streaming via http
-    "aiohttp",
+    "fsspec",
+    "requests",
     # To use the HfApi to get the files info from huggingface.co
-    "huggingface-hub>=0.8.0,<1.0.0",
+    "huggingface_hub>=0.10.0",
]


TESTS_REQUIRE = [
"pytest",
"pytest-datadir",
"pytest-xdist",
]


-QUALITY_REQUIRE = ["black~=22.0", "flake8>=3.8.3", "isort>=5.0.0", "pyyaml>=5.3.1"]
+QUALITY_REQUIRE = ["black~=22.0", "flake8>=3.8.3", "isort>=5.0.0"]


EXTRAS_REQUIRE = {
@@ -97,6 +94,7 @@
license="Apache 2.0",
package_dir={"": "src"},
packages=find_packages("src"),
package_data={"hffs": ["py.typed"]},
python_requires=">=3.7.0",
install_requires=REQUIRED_PKGS,
extras_require=EXTRAS_REQUIRE,
@@ -114,6 +112,6 @@
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
-    keywords="models datasets machine learning huggingface",
+    keywords="models datasets machine learning huggingface filesystem",
zip_safe=False, # Required for mypy to find the py.typed file
)
5 changes: 2 additions & 3 deletions src/hffs/__init__.py
@@ -1,6 +1,5 @@
-from fsspec import AbstractFileSystem
+# flake8: noqa

 __version__ = "0.0.1.dev0"

-class HfFileSystem(AbstractFileSystem):
-    pass
+from .fs import HfFileSystem
