Fixes for 2 and 3 zone systems (#26)
* do not drop dims when analog is missing

* fix shared memory for sparse arrays

* flows work with numba jit disabled

* fix error in looking up from empty sparse rows

* fix reverse in sparse

* allow None in attrs

* import .dataset by default

* limit to setuptools<64

editable mode is broken in 64, python/importlib_metadata#402

* fix install info in docs

addresses #25

* re-loop aster for mixed spacenames, fix np.clip

* faster dataset from dataframe

* update install reqs

* add 3.10 to test matrix
jpn-- committed Aug 25, 2022
1 parent 1499e79 commit 1fe091b
Showing 12 changed files with 259 additions and 68 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/run-tests.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
matrix:
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
python-version: ["3.7", "3.9"]
python-version: ["3.7", "3.9", "3.10"]
defaults:
run:
shell: bash -l {0}
8 changes: 8 additions & 0 deletions docs/intro.md
@@ -30,6 +30,14 @@ the root directory run
pip install -e .
```

Alternatively, you can install sharrow plus all of its dependencies (including
the additional optional dependencies for development and testing) in a conda
environment, using the `envs/development.yml` file to create a `sh-dev` environment:

```shell
conda env create -f envs/development.yml
```

## Testing

Sharrow includes unit tests both in the `sharrow/tests` directory and embedded
29 changes: 29 additions & 0 deletions docs/walkthrough/sparse.ipynb
@@ -508,6 +508,35 @@
"# TEST\n",
"assert skims.redirection.blenders == {'DISTWALK': {'max_blend_distance': 1.0, 'blend_distance_name': None}}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "96a7c554",
"metadata": {},
"outputs": [],
"source": [
"# TEST\n",
"# reverse skims in sparse\n",
"flow3 = tree.setup_flow({\n",
" 'plain_distance': 'DISTWALK',\n",
" 'reverse_distance': 'skims.reverse(\"DISTWALK\")',\n",
"})\n",
"\n",
"assert flow3.load() == approx(np.array([[ 0.0111, 0.0111],\n",
" [ 0.184 , 0.12 ],\n",
" [ 0.12 , 0.12 ],\n",
" [ 0.17 , 0.17 ],\n",
" [ 0.17 , 0.17 ]], dtype=np.float32))\n",
"\n",
"z = skims.iat(\n",
" omaz=[ 0, 1, 3, 101, 102],\n",
" dmaz=[ 0, 0, 0, 100, 100],\n",
" _names=['DIST', 'DISTWALK'], _load=True,\n",
")\n",
"assert z['DISTWALK'].data == approx(np.array([ 0.0111, 0.12 , 0.12 , 0.17 , 0.17 ]))\n",
"assert z['DIST'].data == approx(np.array([ 0.12, 0.12 , 0.12 , 0.17 , 0.17 ]))"
]
}
],
"metadata": {
31 changes: 31 additions & 0 deletions envs/development.yml
@@ -0,0 +1,31 @@
name: sh-dev
channels:
- conda-forge
- nodefaults
dependencies:
- python=3.9
- pip
# required for testing
- dask
- filelock
- flake8
- jupyter
- nbmake
- networkx
- notebook
- numba>=0.53
- numexpr
- numpy>=1.19
- openmatrix
- pandas>=1.2
- pyarrow
- pytest
- pytest-cov
- pytest-regressions
- pytest-xdist
- sparse
- xarray
- zarr

- pip:
- -e ..
5 changes: 2 additions & 3 deletions pyproject.toml
@@ -1,9 +1,8 @@
[build-system]
requires = [
"setuptools>=42",
"setuptools>=42,<64",
"wheel",
"setuptools_scm[toml]>=3.4",
"setuptools_scm_git_archive",
"setuptools_scm[toml]>=7.0",
]
build-backend = "setuptools.build_meta"

2 changes: 1 addition & 1 deletion sharrow/__init__.py
@@ -1,6 +1,6 @@
from xarray import DataArray

from . import example_data, selectors, shared_memory, sparse
from . import dataset, example_data, selectors, shared_memory, sparse
from ._version import version as __version__
from .dataset import Dataset
from .digital_encoding import array_decode, array_encode
35 changes: 26 additions & 9 deletions sharrow/aster.py
@@ -392,8 +392,13 @@ def _replacement(

if self.spacevars is not None:
if attr not in self.spacevars:
raise KeyError(f"{topname}..{attr}")
# return original_node
if topname == pref_topname:
raise KeyError(f"{topname}..{attr}")
# we originally raised a KeyError here regardless, but what if we just
# give back the original node, and see if other spaces,
# possibly fallback spaces, might work? If nothing works then
# it will still eventually error out when compiling?
return original_node

dim_slots = self.dim_slots
if isinstance(self.spacevars, dict):
@@ -549,8 +554,8 @@ def _maybe_transpose_first_two_args(_slice):
ast.Name(
id=f"__{pref_topname}___s_{attr}__data", ctx=ast.Load()
),
result_arg_[0],
result_arg_[1],
result_arg_[0 if not transpose_lead else 1],
result_arg_[1 if not transpose_lead else 0],
ast_Constant(blender.get("max_blend_distance")), # blend_limit
],
keywords=[],
@@ -694,12 +699,14 @@ def visit_Call(self, node):

result = None
# implement ActivitySim's "reverse" skims
if isinstance(node.func, ast.Attribute) and node.func.attr == "reverse":
if isinstance(node.func.value, ast.Name):
if node.func.value.id == self.spacename:
if (
isinstance(node.func, ast.Attribute) and node.func.attr == "reverse"
): # *.reverse(...)
if isinstance(node.func.value, ast.Name): # somename.reverse(...)
if node.func.value.id == self.spacename: # spacename.reverse(...)
if len(node.args) == 1 and isinstance(
node.args[0], ast_Constant_Type
):
): # spacename.reverse('constant')
result = self._replacement(
ast_String_value(node.args[0]),
node.func.ctx,
@@ -708,7 +715,17 @@ def visit_Call(self, node):
)
# handle clip as a method
if isinstance(node.func, ast.Attribute) and node.func.attr == "clip":
if len(node.args) == 1 and len(node.keywords) == 0:
if isinstance(node.func.value, ast.Name) and node.func.value.id == "np":
# call to np.clip(...), change to local clip implementation
clip_args = []
for a in node.args:
clip_args.append(self.visit(a))
result = ast.Call(
func=ast.Name("clip", cts=ast.Load()),
args=clip_args,
keywords=[self.visit(i) for i in node.keywords],
)
elif len(node.args) == 1 and len(node.keywords) == 0:
# single positional arg becomes max
result = ast.Call(
func=ast.Name("max", cts=ast.Load()),
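
For context, a hedged sketch of what the np.clip change above enables. This example is not part of the commit: it assumes the `tree` and `skims` objects from the sparse walkthrough notebook, and the expression names and cap value are invented for illustration.

```python
# Assumes the walkthrough's `tree` and `skims` objects are already defined.
# With this change, np.clip(...) inside a flow expression is rewritten to
# sharrow's local clip implementation instead of failing to resolve.
flow_clip = tree.setup_flow({
    "walk_dist": "DISTWALK",
    "walk_dist_capped": "np.clip(skims.DISTWALK, 0, 1.0)",  # illustrative cap
})
capped = flow_clip.load()  # one row per tree row, one column per expression
```
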
86 changes: 83 additions & 3 deletions sharrow/dataset.py
@@ -82,16 +82,15 @@ def construct(source):
source : pandas.DataFrame, pyarrow.Table, xarray.Dataset, or Sequence[str]
The source from which to create a Dataset. DataFrames and Tables
are converted to Datasets that have one dimension (the rows) and
seperate variables for each of the columns. A list of strings
separate variables for each of the columns. A list of strings
creates a dataset with those named empty variables.
Returns
-------
Dataset
"""
if isinstance(source, pd.DataFrame):
source = xr.Dataset.from_dataframe(source)
# source = cls.from_dataframe_fast(source) # older xarray was slow
source = dataset_from_dataframe_fast(source) # xarray default can be slow
elif isinstance(source, (Table, pa.Table)):
source = xr.Dataset.from_table(source)
elif isinstance(source, (pa.Table)):
@@ -105,6 +104,63 @@ def construct(source):
return source


def dataset_from_dataframe_fast(
dataframe: pd.DataFrame, sparse: bool = False
) -> "Dataset":
"""Convert a pandas.DataFrame into an xarray.Dataset
Each column will be converted into an independent variable in the
Dataset. If the dataframe's index is a MultiIndex, it will be expanded
into a tensor product of one-dimensional indices (filling in missing
values with NaN). This method will produce a Dataset very similar to
that on which the 'to_dataframe' method was called, except with
possibly redundant dimensions (since all dataset variables will have
the same dimensionality)
Parameters
----------
dataframe : DataFrame
DataFrame from which to copy data and indices.
sparse : bool, default: False
If true, create sparse arrays instead of dense numpy arrays. This
can potentially save a large amount of memory if the DataFrame has
a MultiIndex. Requires the sparse package (sparse.pydata.org).
Returns
-------
New Dataset.
See Also
--------
xarray.DataArray.from_series
pandas.DataFrame.to_xarray
"""

# this is much faster than the default xarray version when not
# using a MultiIndex.

if isinstance(dataframe.index, pd.MultiIndex) or sparse:
return Dataset.from_dataframe(dataframe, sparse)

if not dataframe.columns.is_unique:
raise ValueError("cannot convert DataFrame with non-unique columns")

if isinstance(dataframe.index, pd.CategoricalIndex):
idx = dataframe.index.remove_unused_categories()
else:
idx = dataframe.index

index_name = idx.name if idx.name is not None else "index"
# Cast to a NumPy array first, in case the Series is a pandas Extension
# array (which doesn't have a valid NumPy dtype)
arrays = {
name: ([index_name], np.asarray(dataframe[name].values))
for name in dataframe.columns
if name != index_name
}
return Dataset(arrays, coords={index_name: (index_name, dataframe.index.values)})
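
As a hedged usage sketch (not part of the diff): `construct` now routes flat-indexed DataFrames through this fast path, producing a one-dimensional Dataset without the MultiIndex machinery. The column and index names below are invented for illustration.

```python
import pandas as pd
import sharrow as sh

# A flat (non-MultiIndex) DataFrame takes the fast conversion path.
df = pd.DataFrame(
    {"dist": [0.1, 0.2, 0.3], "time": [1.0, 2.0, 3.0]},
    index=pd.Index([101, 102, 103], name="zone"),
)
ds = sh.dataset.construct(df)
assert ds["dist"].dims == ("zone",)  # one dimension, named after the index
```
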


def from_table(
tbl,
index_name="index",
@@ -527,8 +583,22 @@ def from_zarr_with_attr(*args, **kwargs):
and avalue.endswith("} ")
):
avalue = ast.literal_eval(avalue[1:-1])
if isinstance(avalue, str) and avalue == " < None > ":
avalue = None
attrs[aname] = avalue
obj[k] = obj[k].assign_attrs(attrs)
attrs = {}
for aname, avalue in obj.attrs.items():
if (
isinstance(avalue, str)
and avalue.startswith(" {")
and avalue.endswith("} ")
):
avalue = ast.literal_eval(avalue[1:-1])
if isinstance(avalue, str) and avalue == " < None > ":
avalue = None
attrs[aname] = avalue
obj = obj.assign_attrs(attrs)
return obj


@@ -759,8 +829,18 @@ def to_zarr_with_attr(self, *args, **kwargs):
for aname, avalue in self[k].attrs.items():
if isinstance(avalue, dict):
avalue = f" {avalue!r} "
if avalue is None:
avalue = " < None > "
attrs[aname] = avalue
obj[k] = self[k].assign_attrs(attrs)
attrs = {}
for aname, avalue in self.attrs.items():
if isinstance(avalue, dict):
avalue = f" {avalue!r} "
if avalue is None:
avalue = " < None > "
attrs[aname] = avalue
obj = obj.assign_attrs(attrs)
return obj.to_zarr(*args, **kwargs)
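
A hedged round-trip sketch of the new None handling (not part of the diff). The store name and attribute name are invented, and it assumes `to_zarr_with_attr` is attached to Dataset objects, as its `self` parameter above suggests.

```python
import numpy as np
import xarray as xr
import sharrow as sh

# A None-valued attribute is written using the " < None > " sentinel and
# restored to None when the store is read back.
ds = xr.Dataset({"x": ("i", np.arange(3))})
ds.attrs["note"] = None
ds.to_zarr_with_attr("example_attrs.zarr", mode="w")
ds2 = sh.dataset.from_zarr_with_attr("example_attrs.zarr")
assert ds2.attrs["note"] is None
```
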


