Fixes for 2 and 3 zone systems (#26)
* do not drop dims when analog is missing

* fix shared memory for sparse arrays

* flows work with numba jit disabled

* fix error in looking up from empty sparse rows

* fix reverse in sparse

* allow None in attrs

* import .dataset by default

* limit to setuptools<64

editable mode is broken in 64, python/importlib_metadata#402

* fix install info in docs

addresses #25

* re-loop aster for mixed spacenames, fix np.clip

* faster dataset from dataframe

* update install reqs

* add 3.10 to test matrix
jpn-- committed Aug 25, 2022
1 parent 1499e79 commit 1fe091b
Showing 12 changed files with 259 additions and 68 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/run-tests.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
matrix:
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
python-version: ["3.7", "3.9"]
python-version: ["3.7", "3.9", "3.10"]
defaults:
run:
shell: bash -l {0}
8 changes: 8 additions & 0 deletions docs/intro.md
@@ -30,6 +30,14 @@ the root directory run
pip install -e .
```

Alternatively, you can install sharrow plus all of its dependencies (including
the additional optional dependencies for development and testing) in a conda
environment, using the `envs/development.yml` file to create a `sh-dev` environment:

```shell
conda env create -f envs/development.yml
```

## Testing

Sharrow includes unit tests both in the `sharrow/tests` directory and embedded
29 changes: 29 additions & 0 deletions docs/walkthrough/sparse.ipynb
@@ -508,6 +508,35 @@
"# TEST\n",
"assert skims.redirection.blenders == {'DISTWALK': {'max_blend_distance': 1.0, 'blend_distance_name': None}}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "96a7c554",
"metadata": {},
"outputs": [],
"source": [
"# TEST\n",
"# reverse skims in sparse\n",
"flow3 = tree.setup_flow({\n",
" 'plain_distance': 'DISTWALK',\n",
" 'reverse_distance': 'skims.reverse(\"DISTWALK\")',\n",
"})\n",
"\n",
"assert flow3.load() == approx(np.array([[ 0.0111, 0.0111],\n",
" [ 0.184 , 0.12 ],\n",
" [ 0.12 , 0.12 ],\n",
" [ 0.17 , 0.17 ],\n",
" [ 0.17 , 0.17 ]], dtype=np.float32))\n",
"\n",
"z = skims.iat(\n",
" omaz=[ 0, 1, 3, 101, 102],\n",
" dmaz=[ 0, 0, 0, 100, 100],\n",
" _names=['DIST', 'DISTWALK'], _load=True,\n",
")\n",
"assert z['DISTWALK'].data == approx(np.array([ 0.0111, 0.12 , 0.12 , 0.17 , 0.17 ]))\n",
"assert z['DIST'].data == approx(np.array([ 0.12, 0.12 , 0.12 , 0.17 , 0.17 ]))"
]
}
],
"metadata": {
31 changes: 31 additions & 0 deletions envs/development.yml
@@ -0,0 +1,31 @@
name: sh-dev
channels:
- conda-forge
- nodefaults
dependencies:
- python=3.9
- pip
# required for testing
- dask
- filelock
- flake8
- jupyter
- nbmake
- networkx
- notebook
- numba>=0.53
- numexpr
- numpy>=1.19
- openmatrix
- pandas>=1.2
- pyarrow
- pytest
- pytest-cov
- pytest-regressions
- pytest-xdist
- sparse
- xarray
- zarr

- pip:
- -e ..
5 changes: 2 additions & 3 deletions pyproject.toml
@@ -1,9 +1,8 @@
[build-system]
requires = [
"setuptools>=42",
"setuptools>=42,<64",
"wheel",
"setuptools_scm[toml]>=3.4",
"setuptools_scm_git_archive",
"setuptools_scm[toml]>=7.0",
]
build-backend = "setuptools.build_meta"

2 changes: 1 addition & 1 deletion sharrow/__init__.py
@@ -1,6 +1,6 @@
from xarray import DataArray

from . import example_data, selectors, shared_memory, sparse
from . import dataset, example_data, selectors, shared_memory, sparse
from ._version import version as __version__
from .dataset import Dataset
from .digital_encoding import array_decode, array_encode
35 changes: 26 additions & 9 deletions sharrow/aster.py
@@ -392,8 +392,13 @@ def _replacement(

if self.spacevars is not None:
if attr not in self.spacevars:
raise KeyError(f"{topname}..{attr}")
# return original_node
if topname == pref_topname:
raise KeyError(f"{topname}..{attr}")
# we originally raised a KeyError here regardless, but what if we just
# give back the original node, and see if other spaces,
# possibly fallback spaces, might work? If nothing works then
# it will still eventually error out when compiling?
return original_node

dim_slots = self.dim_slots
if isinstance(self.spacevars, dict):
@@ -549,8 +554,8 @@ def _maybe_transpose_first_two_args(_slice):
ast.Name(
id=f"__{pref_topname}___s_{attr}__data", ctx=ast.Load()
),
result_arg_[0],
result_arg_[1],
result_arg_[0 if not transpose_lead else 1],
result_arg_[1 if not transpose_lead else 0],
ast_Constant(blender.get("max_blend_distance")), # blend_limit
],
keywords=[],
@@ -694,12 +699,14 @@ def visit_Call(self, node):

result = None
# implement ActivitySim's "reverse" skims
if isinstance(node.func, ast.Attribute) and node.func.attr == "reverse":
if isinstance(node.func.value, ast.Name):
if node.func.value.id == self.spacename:
if (
isinstance(node.func, ast.Attribute) and node.func.attr == "reverse"
): # *.reverse(...)
if isinstance(node.func.value, ast.Name): # somename.reverse(...)
if node.func.value.id == self.spacename: # spacename.reverse(...)
if len(node.args) == 1 and isinstance(
node.args[0], ast_Constant_Type
):
): # spacename.reverse('constant')
result = self._replacement(
ast_String_value(node.args[0]),
node.func.ctx,
@@ -708,7 +715,17 @@ def visit_Call(self, node):
)
# handle clip as a method
if isinstance(node.func, ast.Attribute) and node.func.attr == "clip":
if len(node.args) == 1 and len(node.keywords) == 0:
if isinstance(node.func.value, ast.Name) and node.func.value.id == "np":
# call to np.clip(...), change to local clip implementation
clip_args = []
for a in node.args:
clip_args.append(self.visit(a))
result = ast.Call(
func=ast.Name("clip", cts=ast.Load()),
args=clip_args,
keywords=[self.visit(i) for i in node.keywords],
)
elif len(node.args) == 1 and len(node.keywords) == 0:
# single positional arg becomes max
result = ast.Call(
func=ast.Name("max", cts=ast.Load()),
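
For context, a hedged sketch of what the np.clip change above enables. This example is not part of the commit: it assumes the `tree` and `skims` objects from the sparse walkthrough notebook, and the expression names and cap value are invented for illustration.

```python
# Assumes the walkthrough's `tree` and `skims` objects are already defined.
# With this change, np.clip(...) inside a flow expression is rewritten to
# sharrow's local clip implementation instead of failing to resolve.
flow_clip = tree.setup_flow({
    "walk_dist": "DISTWALK",
    "walk_dist_capped": "np.clip(skims.DISTWALK, 0, 1.0)",  # illustrative cap
})
capped = flow_clip.load()  # one row per tree row, one column per expression
```
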
86 changes: 83 additions & 3 deletions sharrow/dataset.py
@@ -82,16 +82,15 @@ def construct(source):
source : pandas.DataFrame, pyarrow.Table, xarray.Dataset, or Sequence[str]
The source from which to create a Dataset. DataFrames and Tables
are converted to Datasets that have one dimension (the rows) and
seperate variables for each of the columns. A list of strings
separate variables for each of the columns. A list of strings
creates a dataset with those named empty variables.
Returns
-------
Dataset
"""
if isinstance(source, pd.DataFrame):
source = xr.Dataset.from_dataframe(source)
# source = cls.from_dataframe_fast(source) # older xarray was slow
source = dataset_from_dataframe_fast(source) # xarray default can be slow
elif isinstance(source, (Table, pa.Table)):
source = xr.Dataset.from_table(source)
elif isinstance(source, (pa.Table)):
@@ -105,6 +104,63 @@ def construct(source):
return source


def dataset_from_dataframe_fast(
dataframe: pd.DataFrame, sparse: bool = False
) -> "Dataset":
"""Convert a pandas.DataFrame into an xarray.Dataset
Each column will be converted into an independent variable in the
Dataset. If the dataframe's index is a MultiIndex, it will be expanded
into a tensor product of one-dimensional indices (filling in missing
values with NaN). This method will produce a Dataset very similar to
that on which the 'to_dataframe' method was called, except with
possibly redundant dimensions (since all dataset variables will have
the same dimensionality)
Parameters
----------
dataframe : DataFrame
DataFrame from which to copy data and indices.
sparse : bool, default: False
If true, create sparse arrays instead of dense numpy arrays. This
can potentially save a large amount of memory if the DataFrame has
a MultiIndex. Requires the sparse package (sparse.pydata.org).
Returns
-------
New Dataset.
See Also
--------
xarray.DataArray.from_series
pandas.DataFrame.to_xarray
"""

# this is much faster than the default xarray version when not
# using a MultiIndex.

if isinstance(dataframe.index, pd.MultiIndex) or sparse:
return Dataset.from_dataframe(dataframe, sparse)

if not dataframe.columns.is_unique:
raise ValueError("cannot convert DataFrame with non-unique columns")

if isinstance(dataframe.index, pd.CategoricalIndex):
idx = dataframe.index.remove_unused_categories()
else:
idx = dataframe.index

index_name = idx.name if idx.name is not None else "index"
# Cast to a NumPy array first, in case the Series is a pandas Extension
# array (which doesn't have a valid NumPy dtype)
arrays = {
name: ([index_name], np.asarray(dataframe[name].values))
for name in dataframe.columns
if name != index_name
}
return Dataset(arrays, coords={index_name: (index_name, dataframe.index.values)})
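
As a hedged usage sketch (not part of the diff): `construct` now routes flat-indexed DataFrames through this fast path, producing a one-dimensional Dataset without the MultiIndex machinery. The column and index names below are invented for illustration.

```python
import pandas as pd
import sharrow as sh

# A flat (non-MultiIndex) DataFrame takes the fast conversion path.
df = pd.DataFrame(
    {"dist": [0.1, 0.2, 0.3], "time": [1.0, 2.0, 3.0]},
    index=pd.Index([101, 102, 103], name="zone"),
)
ds = sh.dataset.construct(df)
assert ds["dist"].dims == ("zone",)  # one dimension, named after the index
```
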


def from_table(
tbl,
index_name="index",
@@ -527,8 +583,22 @@ def from_zarr_with_attr(*args, **kwargs):
and avalue.endswith("} ")
):
avalue = ast.literal_eval(avalue[1:-1])
if isinstance(avalue, str) and avalue == " < None > ":
avalue = None
attrs[aname] = avalue
obj[k] = obj[k].assign_attrs(attrs)
attrs = {}
for aname, avalue in obj.attrs.items():
if (
isinstance(avalue, str)
and avalue.startswith(" {")
and avalue.endswith("} ")
):
avalue = ast.literal_eval(avalue[1:-1])
if isinstance(avalue, str) and avalue == " < None > ":
avalue = None
attrs[aname] = avalue
obj = obj.assign_attrs(attrs)
return obj


@@ -759,8 +829,18 @@ def to_zarr_with_attr(self, *args, **kwargs):
for aname, avalue in self[k].attrs.items():
if isinstance(avalue, dict):
avalue = f" {avalue!r} "
if avalue is None:
avalue = " < None > "
attrs[aname] = avalue
obj[k] = self[k].assign_attrs(attrs)
attrs = {}
for aname, avalue in self.attrs.items():
if isinstance(avalue, dict):
avalue = f" {avalue!r} "
if avalue is None:
avalue = " < None > "
attrs[aname] = avalue
obj = obj.assign_attrs(attrs)
return obj.to_zarr(*args, **kwargs)
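
A hedged round-trip sketch of the new None handling (not part of the diff). The store name and attribute name are invented, and it assumes `to_zarr_with_attr` is attached to Dataset objects, as its `self` parameter above suggests.

```python
import numpy as np
import xarray as xr
import sharrow as sh

# A None-valued attribute is written using the " < None > " sentinel and
# restored to None when the store is read back.
ds = xr.Dataset({"x": ("i", np.arange(3))})
ds.attrs["note"] = None
ds.to_zarr_with_attr("example_attrs.zarr", mode="w")
ds2 = sh.dataset.from_zarr_with_attr("example_attrs.zarr")
assert ds2.attrs["note"] is None
```
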


