Skip to content

Commit

Permalink
feat: general read/write methods
Browse files Browse the repository at this point in the history
  • Loading branch information
dmyersturnbull committed Mar 31, 2021
1 parent e4c67ef commit 4c067b7
Show file tree
Hide file tree
Showing 11 changed files with 577 additions and 168 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Expand Up @@ -7,8 +7,11 @@ Adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and

## Added
- Read/write wrappers for Feather, Parquet, and JSON
- Added general functions `read_file` and `write_file`
- `TypeDfs.wrap` and `FinalDf`

### Fixed
- `to_csv` was not passing along `args` and `kwargs`
- Slightly better build config

## [0.5.0] - 2021-01-19
Expand Down
12 changes: 10 additions & 2 deletions README.md
Expand Up @@ -9,7 +9,7 @@
[![Documentation status](https://readthedocs.org/projects/typed-dfs/badge)](https://typed-dfs.readthedocs.io/en/stable/)
[![Coverage (coveralls)](https://coveralls.io/repos/github/dmyersturnbull/typed-dfs/badge.svg?branch=main&service=github)](https://coveralls.io/github/dmyersturnbull/typed-dfs?branch=main)
[![Maintainability](https://api.codeclimate.com/v1/badges/6b804351b6ba5e7694af/maintainability)](https://codeclimate.com/github/dmyersturnbull/typed-dfs/maintainability)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/dmyersturnbull/typed-dfs/badges/quality-score.png?b=main)](https://scrutinizer-ci.com/g/dmyersturnbull/typed-dfs/?branch=main)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/dmyersturnbull/typed-dfs/badges/quality-score.png?b=main)](https://scrutinizer-ci.com/g/dmyersturnbull/typed-dfs/?branch=main)
[![Created with Tyrannosaurus](https://img.shields.io/badge/Created_with-Tyrannosaurus-0000ff.svg)](https://github.com/dmyersturnbull/tyrannosaurus)


Expand All @@ -35,7 +35,7 @@ MyDfType = (
so **`read_csv` and `to_csv` are inverses**.
`MyDf.read_csv(mydf.to_csv())` is `mydf`.
- DataFrames display elegantly in Jupyter notebooks.
- Extra methods such as `sort_natural` and `drop_cols`.
- Extra methods such as `sort_natural` and `write_file`.

### 🎨 Example

Expand Down Expand Up @@ -77,6 +77,14 @@ Use `.untyped()` or `.vanilla()` to make a detyped copy that doesn’t enforce r

### 🔌 Serialization support

Like Pandas, TypedDfs can read and write to various formats.
It provides the methods `read_file` and `write_file`, which guess the format from the
filename extension. For example, `df.write_file("myfile.snappy")` writes Parquet files,
and `df.write_file("myfile.tab.gz")` writes a gzipped, tab-delimited file.
The `read_file` method works the same way: `MyDf.read_file("myfile.feather")` will
read an Apache Arrow Feather file, and `MyDf.read_file("myfile.json.gzip")` reads
a gzipped JSON file. You can pass keyword arguments to those functions.

Serialization is provided through Pandas, and some formats require additional packages.
Pandas does not specify compatible versions, so typed-dfs specifies
[extras](https://python-poetry.org/docs/pyproject/#extras) are provided in typed-dfs
Expand Down
246 changes: 136 additions & 110 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pyproject.toml
Expand Up @@ -104,7 +104,7 @@ pyarrow = {version=">=3.0, <4.0", optional=true}
#tables = {version=">=3.6, <4.0", optional=true}

[tool.poetry.dev-dependencies]
pre-commit = ">=2.10, <3.0"
pre-commit = ">=2.11, <3.0"
pre-commit-hooks = ">=3.4, <4.0"
bandit = ">=1.7, <2.0"
pytest = ">=6.2, <7.0"
Expand All @@ -113,9 +113,9 @@ pytest-cov = ">=2.11, <3.0"
sphinx = ">=3.5, <4.0"
sphinx-autoapi = ">=1.7, <2.0"
sphinx-rtd-theme = ">=0.5, <1.0"
flake8 = ">=3.8, <4.0"
flake8 = ">=3.9, <4.0"
flake8-docstrings = ">=1.5, <2.0"
flake8-bugbear = ">=20"
flake8-bugbear = ">=21"
tomlkit = ">=0.7, <1.0"


Expand Down
12 changes: 10 additions & 2 deletions tests/__init__.py
@@ -1,5 +1,6 @@
import contextlib
import inspect
import random
from pathlib import Path
from typing import Sequence

Expand All @@ -10,8 +11,9 @@

@contextlib.contextmanager
def tmpfile(ext: str) -> Path:
caller = inspect.stack()[1][3]
path = Path(__file__).parent.parent.parent / "resources" / "tmp" / (str(caller) + ext)
# caller = inspect.stack()[1][3]
caller = str(random.randint(1, 100000)) # nosec
path = Path(__file__).parent / "resources" / "tmp" / (str(caller) + ext)
path.parent.mkdir(parents=True, exist_ok=True)
yield path
if path.exists():
Expand Down Expand Up @@ -52,6 +54,12 @@ class TypedTrivial(TypedDf):
pass


class TypedOneColumn(TypedDf):
    """A typed DataFrame with a single required column, ``abc``."""

    @classmethod
    def required_columns(cls) -> Sequence[str]:
        # The lone column every instance must carry.
        columns = ["abc"]
        return columns


class TypedSingleIndex(TypedDf):
@classmethod
def required_index_names(cls) -> Sequence[str]:
Expand Down
63 changes: 63 additions & 0 deletions tests/test_fancy_read_write.py
@@ -0,0 +1,63 @@
import pytest

from . import TypedMultiIndex, sample_data, tmpfile, TypedOneColumn, TypedSingleIndex

# h5, snappy, and parquet work too -- but can't run in CI yet
known_compressions = {"", ".gz", ".zip", ".bz2", ".xz"}


def _get_known_extensions():
    """
    Return the full set of file extensions to exercise in round-trip tests.

    Each text format (.csv, .tsv, .tab) is crossed with every known
    compression suffix; .feather is added as-is because Feather files
    are not wrapped in a separate compression layer here.
    """
    text_formats = {".csv", ".tsv", ".tab"}
    # Set comprehension replaces the original manual accumulation loop.
    return {".feather"} | {e + c for e in text_formats for c in known_compressions}


known_extensions = _get_known_extensions()


class TestReadWrite:
    """Round-trips DataFrames through ``write_file``/``read_file`` for every known extension."""

    @staticmethod
    def _word_df():
        # Shared fixture: a small one-column DataFrame of words
        # (was duplicated inline in two tests).
        raw = TypedOneColumn(["a", "puppy", "and", "a", "parrot"], columns=["abc"])
        return TypedOneColumn.convert(raw)

    def test_read_write_file_multi_index(self):
        for ext in known_extensions:
            with tmpfile(ext) as path:
                original = TypedMultiIndex.convert(TypedMultiIndex(sample_data()))
                original.write_file(path)
                round_tripped = TypedMultiIndex.read_file(path)
                assert round_tripped.index_names() == ["abc", "xyz"]
                assert round_tripped.column_names() == ["123"]

    def test_read_write_one_single_index(self):
        for ext in known_extensions:
            with tmpfile(ext) as path:
                original = TypedSingleIndex.convert(TypedSingleIndex(sample_data()))
                original.write_file(path)
                round_tripped = TypedSingleIndex.read_file(path)
                assert round_tripped.index_names() == ["abc"]
                assert round_tripped.column_names() == ["123", "xyz"]

    def test_read_write_one_col(self):
        for ext in known_extensions:
            with tmpfile(ext) as path:
                original = self._word_df()
                original.write_file(path)
                round_tripped = TypedOneColumn.read_file(path)
                assert round_tripped.index_names() == []
                assert round_tripped.column_names() == ["abc"]

    def test_read_write_txt(self):
        for compression in known_compressions:
            with tmpfile(".txt" + compression) as path:
                original = self._word_df()
                original.write_file(path)
                round_tripped = TypedOneColumn.read_file(path)
                assert round_tripped.index_names() == []
                assert round_tripped.column_names() == ["abc"]


if __name__ == "__main__":
pytest.main()
4 changes: 2 additions & 2 deletions tests/test_read_write.py
Expand Up @@ -10,15 +10,15 @@ def test_feather_lz4(self):
with tmpfile(".feather") as path:
df = TypedMultiIndex.convert(TypedMultiIndex(sample_data()))
df.to_feather(path, compression="lz4")
df2 = UntypedDf.read_feather(path)
df2 = TypedMultiIndex.read_feather(path)
assert df2.index_names() == ["abc", "xyz"]
assert df2.column_names() == ["123"]

def test_feather_zstd(self):
with tmpfile(".feather") as path:
df = TypedMultiIndex.convert(TypedMultiIndex(sample_data()))
df.to_feather(path, compression="zstd")
df2 = UntypedDf.read_feather(path)
df2 = TypedMultiIndex.read_feather(path)
assert df2.index_names() == ["abc", "xyz"]
assert df2.column_names() == ["123"]

Expand Down
20 changes: 19 additions & 1 deletion typeddfs/__init__.py
Expand Up @@ -9,6 +9,8 @@
from pathlib import Path
from typing import Optional, Type

import pandas as pd

from typeddfs.base_dfs import AsymmetricDfError as _AsymmetricDfError
from typeddfs.base_dfs import BaseDf
from typeddfs.base_dfs import ExtraConditionFailedError as _ExtraConditionFailedError
Expand Down Expand Up @@ -42,6 +44,10 @@
logger.error(f"Could not load package metadata for {pkg}. Is it installed?")


class FinalDf(UntypedDf):
    """
    An untyped DataFrame meant for general use.

    Instances of this class are returned by ``TypedDfs.wrap``.
    """


class TypedDfs:
"""
The only thing you need to import from ``typeddfs``.
Expand Down Expand Up @@ -83,6 +89,18 @@ def example(cls) -> Type[TypedDf]:
).build()
return KeyValue

@classmethod
def wrap(cls, df: pd.DataFrame) -> FinalDf:
"""
Just wraps a DataFrame into a simple untyped DataFrame.
Useful to quickly access a function only defined on typeddfs DataFrames.
Example:
TypedDfs.wrap(df).write_file("abc.feather")
"""
return FinalDf(df)

@classmethod
def typed(cls, name: str, doc: Optional[str] = None) -> TypedDfBuilder:
"""
Expand Down Expand Up @@ -127,4 +145,4 @@ class New(UntypedDf):
return New


__all__ = ["BaseDf", "UntypedDf", "TypedDf", "TypedDfs"]
__all__ = ["BaseDf", "UntypedDf", "TypedDf", "TypedDfs", "FinalDf"]

0 comments on commit 4c067b7

Please sign in to comment.