Skip to content

Commit

Permalink
feat: general read/write methods
Browse files Browse the repository at this point in the history
  • Loading branch information
dmyersturnbull committed Mar 31, 2021
1 parent e4c67ef commit 4c067b7
Show file tree
Hide file tree
Showing 11 changed files with 577 additions and 168 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Expand Up @@ -7,8 +7,11 @@ Adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and

## Added
- Read/write wrappers for Feather, Parquet, and JSON
- Added general functions `read_file` and `write_file`
- `TypeDfs.wrap` and `FinalDf`

### Fixed
- `to_csv` was not passing along `args` and `kwargs`
- Slightly better build config

## [0.5.0] - 2021-01-19
Expand Down
12 changes: 10 additions & 2 deletions README.md
Expand Up @@ -9,7 +9,7 @@
[![Documentation status](https://readthedocs.org/projects/typed-dfs/badge)](https://typed-dfs.readthedocs.io/en/stable/)
[![Coverage (coveralls)](https://coveralls.io/repos/github/dmyersturnbull/typed-dfs/badge.svg?branch=main&service=github)](https://coveralls.io/github/dmyersturnbull/typed-dfs?branch=main)
[![Maintainability](https://api.codeclimate.com/v1/badges/6b804351b6ba5e7694af/maintainability)](https://codeclimate.com/github/dmyersturnbull/typed-dfs/maintainability)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/dmyersturnbull/typed-dfs/badges/quality-score.png?b=main)](https://scrutinizer-ci.com/g/dmyersturnbull/typed-dfs/?branch=main)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/dmyersturnbull/typed-dfs/badges/quality-score.png?b=main)](https://scrutinizer-ci.com/g/dmyersturnbull/typed-dfs/?branch=main)
[![Created with Tyrannosaurus](https://img.shields.io/badge/Created_with-Tyrannosaurus-0000ff.svg)](https://github.com/dmyersturnbull/tyrannosaurus)


Expand All @@ -35,7 +35,7 @@ MyDfType = (
so **`read_csv` and `to_csv` are inverses**.
`MyDf.read_csv(mydf.to_csv())` is `mydf`.
- DataFrames display elegantly in Jupyter notebooks.
- Extra methods such as `sort_natural` and `drop_cols`.
- Extra methods such as `sort_natural` and `write_file`.

### 🎨 Example

Expand Down Expand Up @@ -77,6 +77,14 @@ Use `.untyped()` or `.vanilla()` to make a detyped copy that doesn’t enforce r

### 🔌 Serialization support

Like Pandas, TypedDfs can read and write to various formats.
It provides the methods `read_file` and `write_file`, which guess the format from the
filename extension. For example, `df.write_file("myfile.snappy")` writes Parquet files,
and `df.write_file("myfile.tab.gz")` writes a gzipped, tab-delimited file.
The `read_file` method works the same way: `MyDf.read_file("myfile.feather")` will
read an Apache Arrow Feather file, and `MyDf.read_file("myfile.json.gzip")` reads
a gzipped JSON file. You can pass keyword arguments to those functions.

Serialization is provided through Pandas, and some formats require additional packages.
Pandas does not specify compatible versions, so typed-dfs specifies
[extras](https://python-poetry.org/docs/pyproject/#extras) are provided in typed-dfs
Expand Down
246 changes: 136 additions & 110 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pyproject.toml
Expand Up @@ -104,7 +104,7 @@ pyarrow = {version=">=3.0, <4.0", optional=true}
#tables = {version=">=3.6, <4.0", optional=true}

[tool.poetry.dev-dependencies]
pre-commit = ">=2.10, <3.0"
pre-commit = ">=2.11, <3.0"
pre-commit-hooks = ">=3.4, <4.0"
bandit = ">=1.7, <2.0"
pytest = ">=6.2, <7.0"
Expand All @@ -113,9 +113,9 @@ pytest-cov = ">=2.11, <3.0"
sphinx = ">=3.5, <4.0"
sphinx-autoapi = ">=1.7, <2.0"
sphinx-rtd-theme = ">=0.5, <1.0"
flake8 = ">=3.8, <4.0"
flake8 = ">=3.9, <4.0"
flake8-docstrings = ">=1.5, <2.0"
flake8-bugbear = ">=20"
flake8-bugbear = ">=21"
tomlkit = ">=0.7, <1.0"


Expand Down
12 changes: 10 additions & 2 deletions tests/__init__.py
@@ -1,5 +1,6 @@
import contextlib
import inspect
import random
from pathlib import Path
from typing import Sequence

Expand All @@ -10,8 +11,9 @@

@contextlib.contextmanager
def tmpfile(ext: str) -> Path:
caller = inspect.stack()[1][3]
path = Path(__file__).parent.parent.parent / "resources" / "tmp" / (str(caller) + ext)
# caller = inspect.stack()[1][3]
caller = str(random.randint(1, 100000)) # nosec
path = Path(__file__).parent / "resources" / "tmp" / (str(caller) + ext)
path.parent.mkdir(parents=True, exist_ok=True)
yield path
if path.exists():
Expand Down Expand Up @@ -52,6 +54,12 @@ class TypedTrivial(TypedDf):
pass


class TypedOneColumn(TypedDf):
    """A typed DataFrame with a single required column, ``abc``."""

    @classmethod
    def required_columns(cls) -> Sequence[str]:
        # The lone column every instance must carry.
        columns = ["abc"]
        return columns


class TypedSingleIndex(TypedDf):
@classmethod
def required_index_names(cls) -> Sequence[str]:
Expand Down
63 changes: 63 additions & 0 deletions tests/test_fancy_read_write.py
@@ -0,0 +1,63 @@
import pytest

from . import TypedMultiIndex, sample_data, tmpfile, TypedOneColumn, TypedSingleIndex

# h5, snappy, and parquet work too -- but can't run in CI yet
known_compressions = {"", ".gz", ".zip", ".bz2", ".xz"}


def _get_known_extensions():
    """
    Return the full set of file extensions to exercise in round-trip tests.

    Each text format (.csv, .tsv, .tab) is crossed with every known
    compression suffix; .feather is added as-is because Feather files
    are not wrapped in a separate compression layer here.
    """
    text_formats = {".csv", ".tsv", ".tab"}
    # Set comprehension replaces the original manual accumulation loop.
    return {".feather"} | {e + c for e in text_formats for c in known_compressions}


known_extensions = _get_known_extensions()


class TestReadWrite:
    """Round-trips DataFrames through ``write_file``/``read_file`` for every known extension."""

    @staticmethod
    def _word_df():
        # Shared fixture: a small one-column DataFrame of words
        # (was duplicated inline in two tests).
        raw = TypedOneColumn(["a", "puppy", "and", "a", "parrot"], columns=["abc"])
        return TypedOneColumn.convert(raw)

    def test_read_write_file_multi_index(self):
        for ext in known_extensions:
            with tmpfile(ext) as path:
                original = TypedMultiIndex.convert(TypedMultiIndex(sample_data()))
                original.write_file(path)
                round_tripped = TypedMultiIndex.read_file(path)
                assert round_tripped.index_names() == ["abc", "xyz"]
                assert round_tripped.column_names() == ["123"]

    def test_read_write_one_single_index(self):
        for ext in known_extensions:
            with tmpfile(ext) as path:
                original = TypedSingleIndex.convert(TypedSingleIndex(sample_data()))
                original.write_file(path)
                round_tripped = TypedSingleIndex.read_file(path)
                assert round_tripped.index_names() == ["abc"]
                assert round_tripped.column_names() == ["123", "xyz"]

    def test_read_write_one_col(self):
        for ext in known_extensions:
            with tmpfile(ext) as path:
                original = self._word_df()
                original.write_file(path)
                round_tripped = TypedOneColumn.read_file(path)
                assert round_tripped.index_names() == []
                assert round_tripped.column_names() == ["abc"]

    def test_read_write_txt(self):
        for compression in known_compressions:
            with tmpfile(".txt" + compression) as path:
                original = self._word_df()
                original.write_file(path)
                round_tripped = TypedOneColumn.read_file(path)
                assert round_tripped.index_names() == []
                assert round_tripped.column_names() == ["abc"]


if __name__ == "__main__":
pytest.main()
4 changes: 2 additions & 2 deletions tests/test_read_write.py
Expand Up @@ -10,15 +10,15 @@ def test_feather_lz4(self):
with tmpfile(".feather") as path:
df = TypedMultiIndex.convert(TypedMultiIndex(sample_data()))
df.to_feather(path, compression="lz4")
df2 = UntypedDf.read_feather(path)
df2 = TypedMultiIndex.read_feather(path)
assert df2.index_names() == ["abc", "xyz"]
assert df2.column_names() == ["123"]

def test_feather_zstd(self):
with tmpfile(".feather") as path:
df = TypedMultiIndex.convert(TypedMultiIndex(sample_data()))
df.to_feather(path, compression="zstd")
df2 = UntypedDf.read_feather(path)
df2 = TypedMultiIndex.read_feather(path)
assert df2.index_names() == ["abc", "xyz"]
assert df2.column_names() == ["123"]

Expand Down
20 changes: 19 additions & 1 deletion typeddfs/__init__.py
Expand Up @@ -9,6 +9,8 @@
from pathlib import Path
from typing import Optional, Type

import pandas as pd

from typeddfs.base_dfs import AsymmetricDfError as _AsymmetricDfError
from typeddfs.base_dfs import BaseDf
from typeddfs.base_dfs import ExtraConditionFailedError as _ExtraConditionFailedError
Expand Down Expand Up @@ -42,6 +44,10 @@
logger.error(f"Could not load package metadata for {pkg}. Is it installed?")


class FinalDf(UntypedDf):
    """
    An untyped DataFrame meant for general use.

    Instances of this class are returned by ``TypedDfs.wrap``.
    """


class TypedDfs:
"""
The only thing you need to import from ``typeddfs``.
Expand Down Expand Up @@ -83,6 +89,18 @@ def example(cls) -> Type[TypedDf]:
).build()
return KeyValue

@classmethod
def wrap(cls, df: pd.DataFrame) -> FinalDf:
"""
Just wraps a DataFrame into a simple untyped DataFrame.
Useful to quickly access a function only defined on typeddfs DataFrames.
Example:
TypedDfs.wrap(df).write_file("abc.feather")
"""
return FinalDf(df)

@classmethod
def typed(cls, name: str, doc: Optional[str] = None) -> TypedDfBuilder:
"""
Expand Down Expand Up @@ -127,4 +145,4 @@ class New(UntypedDf):
return New


__all__ = ["BaseDf", "UntypedDf", "TypedDf", "TypedDfs"]
__all__ = ["BaseDf", "UntypedDf", "TypedDf", "TypedDfs", "FinalDf"]

0 comments on commit 4c067b7

Please sign in to comment.