diff --git a/.coverage b/.coverage deleted file mode 100644 index 1f297399..00000000 Binary files a/.coverage and /dev/null differ diff --git a/.github/workflows/code-coverage.yaml b/.github/workflows/code-coverage.yaml index 0d31cd94..1451a479 100644 --- a/.github/workflows/code-coverage.yaml +++ b/.github/workflows/code-coverage.yaml @@ -13,7 +13,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: 3.13 + python-version: 3.14 - name: Install dependencies run: | diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index c3e631fa..7a4e350c 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.10", "3.11", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] steps: - uses: actions/checkout@v4 diff --git a/docs/api/coordinates.md b/docs/api/coordinates.md new file mode 100644 index 00000000..e94b1570 --- /dev/null +++ b/docs/api/coordinates.md @@ -0,0 +1,216 @@ +```{eval-rst} +.. currentmodule:: xdas.coordinates +``` +# xdas.coordinates + +## Coordinates + +Constructor + + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + Coordinates +``` + +Methods + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + Coordinates.isdim + Coordinates.get_query + Coordinates.to_index + Coordinates.equals + Coordinates.to_dict + Coordinates.copy + Coordinates.drop_dims + Coordinates.drop_coords +``` + +### Coordinate + +Constructor + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + Coordinate +``` + +Attributes + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + Coordinate.dtype + Coordinate.ndim + Coordinate.shape + Coordinate.values +``` + +Methods + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + Coordinate.to_index + Coordinate.isscalar + Coordinate.isdense + Coordinate.isinterp +``` + + +### ScalarCoordinate + +Constructor + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + ScalarCoordinate +``` + +Methods + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + ScalarCoordinate.isvalid + ScalarCoordinate.equals + ScalarCoordinate.to_index + ScalarCoordinate.to_dict +``` + +### DenseCoordinate + +Constructor + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + DenseCoordinate +``` + +Methods + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + DenseCoordinate.isvalid + DenseCoordinate.index + DenseCoordinate.get_indexer + DenseCoordinate.slice_indexer + DenseCoordinate.to_dict +``` + +### InterpCoordinate + +Constructor + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + InterpCoordinate +``` + +Attributes + + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + InterpCoordinate.tie_indices + InterpCoordinate.tie_values + InterpCoordinate.empty + InterpCoordinate.dtype + InterpCoordinate.ndim + InterpCoordinate.shape + InterpCoordinate.indices + InterpCoordinate.values +``` + +Methods + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + InterpCoordinate.isvalid + InterpCoordinate.equals + InterpCoordinate.get_value + InterpCoordinate.format_index + InterpCoordinate.slice_index + InterpCoordinate.get_indexer + InterpCoordinate.slice_indexer + InterpCoordinate.decimate + InterpCoordinate.simplify + InterpCoordinate.get_discontinuities + InterpCoordinate.from_array + InterpCoordinate.to_dict +``` + + +### SampledCoordinate + +Constructor + +```{eval-rst} +.. 
autosummary:: + :toctree: ../_autosummary + + SampledCoordinate +``` + +Attributes + + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + SampledCoordinate.tie_values + SampledCoordinate.tie_lengths + SampledCoordinate.tie_indices + SampledCoordinate.sampling_interval + SampledCoordinate.empty + SampledCoordinate.dtype + SampledCoordinate.ndim + SampledCoordinate.shape + SampledCoordinate.indices + SampledCoordinate.values +``` + +Methods + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + SampledCoordinate.isvalid + SampledCoordinate.equals + SampledCoordinate.get_sampling_interval + SampledCoordinate.get_value + SampledCoordinate.slice_index + SampledCoordinate.get_indexer + SampledCoordinate.slice_indexer + SampledCoordinate.append + SampledCoordinate.decimate + SampledCoordinate.simplify + SampledCoordinate.get_split_indices + SampledCoordinate.from_array + SampledCoordinate.to_dict + SampledCoordinate.from_block +``` \ No newline at end of file diff --git a/docs/api/index.md b/docs/api/index.md index 3f2d665b..d16f304d 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -5,9 +5,11 @@ xdas atoms -io +coordinates fft +io parallel +picking processing signal synthetics diff --git a/docs/api/xdas.md b/docs/api/xdas.md index 3fcfd50f..be198ce9 100644 --- a/docs/api/xdas.md +++ b/docs/api/xdas.md @@ -168,162 +168,3 @@ Methods DataSequence.map ``` -### Coordinates - -Constructor - - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - Coordinates -``` - -Methods - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - Coordinates.isdim - Coordinates.get_query - Coordinates.to_index - Coordinates.equals - Coordinates.to_dict - Coordinates.copy - Coordinates.drop_dims - Coordinates.drop_coords -``` - -### Coordinate - -Constructor - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - Coordinate -``` - -Attributes - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - Coordinate.dtype - Coordinate.ndim - Coordinate.shape - Coordinate.values -``` - -Methods - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - Coordinate.to_index - Coordinate.isscalar - Coordinate.isdense - Coordinate.isinterp -``` - - -### ScalarCoordinate - -Constructor - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - ScalarCoordinate -``` - -Methods - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - ScalarCoordinate.isvalid - ScalarCoordinate.equals - ScalarCoordinate.to_index - ScalarCoordinate.to_dict -``` - -### DenseCoordinate - -Constructor - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - DenseCoordinate -``` - -Methods - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - DenseCoordinate.isvalid - DenseCoordinate.index - DenseCoordinate.get_indexer - DenseCoordinate.slice_indexer - DenseCoordinate.to_dict -``` - -### InterpCoordinate - -Constructor - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - InterpCoordinate -``` - -Attributes - - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - InterpCoordinate.tie_indices - InterpCoordinate.tie_values - InterpCoordinate.empty - InterpCoordinate.dtype - InterpCoordinate.ndim - InterpCoordinate.shape - InterpCoordinate.indices - InterpCoordinate.values -``` - -Methods - -```{eval-rst} -.. 
autosummary:: - :toctree: ../_autosummary - - InterpCoordinate.isvalid - InterpCoordinate.equals - InterpCoordinate.get_value - InterpCoordinate.format_index - InterpCoordinate.slice_index - InterpCoordinate.format_index_slice - InterpCoordinate.get_indexer - InterpCoordinate.slice_indexer - InterpCoordinate.decimate - InterpCoordinate.simplify - InterpCoordinate.get_discontinuities - InterpCoordinate.from_array - InterpCoordinate.to_dict -``` \ No newline at end of file diff --git a/docs/getting-started.md b/docs/getting-started.md index edff32ef..f717052a 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -65,7 +65,7 @@ da -Xdas only loads the metadata from each file and returns a {py:class}`~xdas.DataArray` object. This object has mainly two attributes. First a `data` attribute that contain the data. Here a {py:class}`~xdas.VirtualStack` object that is a pointer to the different files we opened. Second, a `coords` attribute that contains the metadata related to how the space and the time are sampled. Here both dimensions are labeled using {py:class}`~xdas.InterpCoordinate` objects. Those allow to concisely store the time and space information, including potential gaps and overlaps. See the [](user-guide/interpolated-coordinates) section for more information. +Xdas only loads the metadata from each file and returns a {py:class}`~xdas.DataArray` object. This object has mainly two attributes. First, a `data` attribute that contains the data. Here a {py:class}`~xdas.VirtualStack` object that is a pointer to the different files we opened. Second, a `coords` attribute that contains the metadata related to how the space and the time are sampled. Here both dimensions are labeled using {py:class}`~xdas.InterpCoordinate` objects. Those allow the time and space information to be stored concisely, including potential gaps and overlaps. See the [](user-guide/coordinates/interpolated-coordinates) section for more information. -Note that if you want to create a single data collection object for multiple acquisitions (i.e. different instruments or several acquisition with different parameters), you can use the [DataCollection](user-guide/data-structure/datacollection) structure. +Note that if you want to create a single data collection object for multiple acquisitions (i.e. different instruments or several acquisitions with different parameters), you can use the [DataCollection](user-guide/data-structures/datacollection) structure. ```{note} For Febus users, converting native files into Xdas NetCDF format generally improves I/O operations and reduce the amount of data by a factor two. This can be done by looping over Febus files and running: `xdas.open_dataarray("path_to_febus_file.h5", engine="febus").to_netcdf("path_to_xdas_file.nc", virtual=False)`. The converted files can then be linked as described above. diff --git a/docs/release-notes.md b/docs/release-notes.md index 0dd5b92a..d52e3f06 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,5 +1,8 @@ # Release notes +## 0.X.X +- Add SampledCoordinate for more SEED-like coordinates (@atrabattoni). + ## 0.2.4 - Add StreamWriter to write long time series to miniSEED (@marbail). - Fix OptaSense engine wrong axis attribution (@smouellet). diff --git a/docs/user-guide/coordinates/index.md b/docs/user-guide/coordinates/index.md new file mode 100644 index 00000000..62cb7a4f --- /dev/null +++ b/docs/user-guide/coordinates/index.md @@ -0,0 +1,47 @@ +--- +file_format: mystnb +kernelspec: + name: python3 +--- + +# Coordinates + + +{py:class}`~xdas.DataArray` is the base class in *xdas*. It is mainly composed of an N-dimensional array and a set of {py:class}`~xdas.Coordinate` objects that are gathered in a {py:class}`~xdas.Coordinates` dict-like object that can be accessed through the `DataArray.coords` attribute. Xdas comes with several flavours of {py:class}`~xdas.Coordinate` objects. 
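+ +Each flavour, summarized in the table below, can be built by passing the corresponding `data` payload to the {py:class}`~xdas.Coordinate` factory, or by instantiating the class directly. A minimal sketch (the calls mirror the usage exercised in this PR's test suite): + +```{code-cell} +import xdas as xd +from xdas.coordinates import SampledCoordinate + +# dense: one explicit value per index +dense = xd.Coordinate([1.0, 2.0, 3.0], "distance") +assert dense.isdense() + +# interpolated: values are linear between tie points (index 0 -> 100.0, index 8 -> 900.0) +interp = xd.Coordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}, "time") +assert interp.isinterp() + +# sampled: two segments [0, 1, 2] and [10, 11], one sample every 1.0 +sampled = SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} +) +sampled.values +```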
+ +| Type | Description | `name` | `data` | +|:---|:---|:---:|:---:| +| {py:class}`~xdas.coordinates.ScalarCoordinate` | Used to label 0D dimensions | `scalar` | `{"value": any}` | +| {py:class}`~xdas.coordinates.DefaultCoordinate` | Each value is equal to its index | `default` | `{"size": int}` | +| {py:class}`~xdas.coordinates.DenseCoordinate` | Each index is mapped to a given value | `dense` | `array-like[any]` | +| {py:class}`~xdas.coordinates.InterpCoordinate` | Values are interpolated linearly between tie points | `interpolated` | `{"tie_indices": array-like[int], "tie_values": array-like[any]}` | +| {py:class}`~xdas.coordinates.SampledCoordinate` | Values advance by a fixed sampling interval within segments starting at the given tie values | `sampled` | `{"tie_values": array-like[any], "tie_lengths": array-like[int], "sampling_interval": any}` | + +In the current state of the documentation, most of the coordinate information can be found in the [Interpolated Coordinates](interpolated-coordinates) page. + +## Per-type information + +```{toctree} +:maxdepth: 1 + +interpolated-coordinates +sampled-coordinates +``` + + \ No newline at end of file diff --git a/docs/user-guide/interpolated-coordinates.md b/docs/user-guide/coordinates/interpolated-coordinates.md similarity index 98% rename from docs/user-guide/interpolated-coordinates.md rename to docs/user-guide/coordinates/interpolated-coordinates.md index d4821f87..6ae7aee9 100644 --- a/docs/user-guide/interpolated-coordinates.md +++ b/docs/user-guide/coordinates/interpolated-coordinates.md @@ -59,7 +59,7 @@ is to `simplify` the coordinates, increasing the tolerance such that the overlap disappear. ``` -# Gaps and Overlaps +## Gaps and Overlaps Gaps and Overlaps can be easily identified based on the tie point positions, and extracted with: @@ -79,7 +79,7 @@ coord = coord.simplify(tolerance=0.0) coord ``` -# Temporal Coordinates +## Temporal Coordinates The main use of coordinates in *xdas* is to deal with long time series. By default *xdas* uses `"datetime64[us]"` dtype. Microseconds are used because to perform diff --git a/docs/user-guide/coordinates/sampled-coordinates.md b/docs/user-guide/coordinates/sampled-coordinates.md new file mode 100644 index 00000000..e02500ff --- /dev/null +++ b/docs/user-guide/coordinates/sampled-coordinates.md @@ -0,0 +1,9 @@ +--- +file_format: mystnb +kernelspec: + name: python3 +--- + +# Sampled Coordinates + +TODO \ No newline at end of file diff --git a/docs/user-guide/data-structure/dataarray.md b/docs/user-guide/data-structures/dataarray.md similarity index 95% rename from docs/user-guide/data-structure/dataarray.md rename to docs/user-guide/data-structures/dataarray.md index 7d7a62e6..9e48ff4e 100644 --- a/docs/user-guide/data-structure/dataarray.md +++ b/docs/user-guide/data-structures/dataarray.md @@ -16,7 +16,7 @@ os.chdir("../../_data") {py:class}`~xdas.DataArray` is the base class to load and manipulate big datasets to in *xdas*. It is mainly composed of two attributes: - `data`: any N-dimensional array-like object. Compared to *xarray* `xdas.DataArray` are more permissive to the kinds of array-like objects that can be used. In particular, [virtual arrays](../virtual-datasets) can be used. -- `coords`: a dict-like container of coordinates. As opposed to *xarray*, which uses dense arrays to label each point, *xdas* also implements [interpolated coordinates](../interpolated-coordinates) that provides an efficient representation of evenly spaced data (gracefully handling gaps and small sampling variations). 
+- `coords`: a dict-like container of coordinates. As opposed to *xarray*, which uses dense arrays to label each point, *xdas* also implements [interpolated coordinates](../coordinates/interpolated-coordinates) that provide an efficient representation of evenly spaced data (gracefully handling gaps and small sampling variations). ![](/_static/dataarray.svg) @@ -30,7 +30,7 @@ In the following examples, we use only one `DataArray`, if you have several `Dat ## Creating a DataArray -The user can wrap together an n-dimensional array and some related coordinates. See the related description of how to create coordinates [here](../interpolated-coordinates.md). For example: +The user can wrap together an n-dimensional array and some related coordinates. See the related description of how to create coordinates [here](../coordinates/interpolated-coordinates.md). For example: ```{code-cell} diff --git a/docs/user-guide/data-structure/datacollection.md b/docs/user-guide/data-structures/datacollection.md similarity index 100% rename from docs/user-guide/data-structure/datacollection.md rename to docs/user-guide/data-structures/datacollection.md diff --git a/docs/user-guide/data-structure/index.md b/docs/user-guide/data-structures/index.md similarity index 98% rename from docs/user-guide/data-structure/index.md rename to docs/user-guide/data-structures/index.md index 3745bc91..bc158267 100644 --- a/docs/user-guide/data-structure/index.md +++ b/docs/user-guide/data-structures/index.md @@ -1,4 +1,4 @@ -# Data Structure +# Data Structures Xdas leverages two main data structures. diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md index 4ab42147..b8323343 100644 --- a/docs/user-guide/index.md +++ b/docs/user-guide/index.md @@ -3,10 +3,10 @@ ```{toctree} :maxdepth: 1 -data-structure/index +data-structures/index +coordinates/index data-formats virtual-datasets -interpolated-coordinates miniseed convert-displacement atoms diff --git a/docs/user-guide/virtual-datasets.md b/docs/user-guide/virtual-datasets.md index 752b14a7..f271d99a 100644 --- a/docs/user-guide/virtual-datasets.md +++ b/docs/user-guide/virtual-datasets.md @@ -49,7 +49,7 @@ To handle individual files, multiple files, and virtual datasets, *xdas* offers | {py:func}`xdas.open_mfdatatree` | {py:class}`~xdas.DataCollection` | Open a directory tree of files, organizing data in a data collection. | | {py:func}`xdas.open_datacollection` | {py:class}`~xdas.DataCollection` | Open a (virtual) collection. | -Please refer to the [](data-structure/datacollection.md) section for the functions that return a data collection. +Please refer to the [](data-structures/datacollection.md) section for the functions that return a data collection. 
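+ +As a quick illustration of the single- and multi-file openers listed in the table above — a minimal sketch where the paths and the `engine` value are placeholders: + +```python +import xdas as xd + +# open a single file (only the metadata is loaded) +da = xd.open_dataarray("path_to_file.h5", engine="febus") + +# open several files at once as a single data array backed by a virtual stack +da = xd.open_mfdataarray("path_to_files_*.h5", engine="febus") +```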
## Linking multi-file datasets diff --git a/tests/coordinates/test_coordinates.py b/tests/coordinates/test_coordinates.py new file mode 100644 index 00000000..9f87e336 --- /dev/null +++ b/tests/coordinates/test_coordinates.py @@ -0,0 +1,145 @@ +import numpy as np +import pytest + +import xdas as xd + + +class TestCoordinate: + def test_new(self): + assert xd.Coordinate(1).isscalar() + assert xd.Coordinate([1]).isdense() + assert xd.Coordinate({"tie_values": [], "tie_indices": []}).isinterp() + coord = xd.Coordinate(xd.Coordinate([1]), "dim") + assert coord.isdense() + assert coord.dim == "dim" + + def test_empty(self): + with pytest.raises(TypeError, match="cannot infer coordinate type"): + xd.Coordinate() + + def test_isdim(self): + coord = xd.Coordinate([1, 2, 3]) + assert coord.isdim() is None + coord = xd.Coordinate([1, 2, 3], "dim") + assert coord.isdim() is None + coords = xd.Coordinates({"dim": coord}) + assert coords["dim"].isdim() + coords = xd.Coordinates({"other_dim": coord}) + assert not coords["other_dim"].isdim() + + def test_name(self): + coord = xd.Coordinate([1, 2, 3]) + assert coord.name is None + coord = xd.Coordinate([1, 2, 3], "dim") + assert coord.name == "dim" + coords = xd.Coordinates({"dim": coord}) + assert coords["dim"].name == "dim" + coords = xd.Coordinates({"other_dim": coord}) + assert coords["other_dim"].name == "other_dim" + + def test_to_dataarray(self): + coord = xd.Coordinate([1, 2, 3], "dim") + result = coord.to_dataarray() + expected = xd.DataArray([1, 2, 3], {"dim": [1, 2, 3]}, name="dim") + assert result.equals(expected) + coord = xd.Coordinate([1, 2, 3]) + with pytest.raises(ValueError, match="unnamed coordinate"): + coord.to_dataarray() + coord = xd.Coordinate([1, 2, 3], "dim") + result = coord.to_dataarray() + expected = xd.DataArray([1, 2, 3], {"dim": [1, 2, 3]}, name="dim") + assert result.equals(expected) + coords = xd.Coordinates({"dim": coord}) + result = coords["dim"].to_dataarray() + assert result.equals(expected) + coords = xd.Coordinates({"other_dim": coord}) + result = coords["other_dim"].to_dataarray() + expected = xd.DataArray( + [1, 2, 3], coords={"other_dim": coord}, dims=["dim"], name="other_dim" + ) + assert result.equals(expected) + coords["dim"] = [4, 5, 6] + result = coords["dim"].to_dataarray() + expected = xd.DataArray( + [4, 5, 6], + coords={"dim": [4, 5, 6], "other_dim": ("dim", [1, 2, 3])}, + dims=["dim"], + name="dim", + ) + assert result.equals(expected) + result = coords["other_dim"].to_dataarray() + expected = xd.DataArray( + [1, 2, 3], + coords={"dim": [4, 5, 6], "other_dim": ("dim", [1, 2, 3])}, + dims=["dim"], + name="other_dim", + ) + assert result.equals(expected) + + +class TestCoordinates: + def test_init(self): + coords = xd.Coordinates( + {"dim": ("dim", {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]})} + ) + coord = coords["dim"] + assert coord.isinterp() + assert np.allclose(coord.tie_indices, [0, 8]) + assert np.allclose(coord.tie_values, [100.0, 900.0]) + assert coords.isdim("dim") + coords = xd.Coordinates({"dim": [1.0, 2.0, 3.0]}) + coord = coords["dim"] + assert coord.isdense() + assert np.allclose(coord.values, [1.0, 2.0, 3.0]) + assert coords.isdim("dim") + coords = xd.Coordinates( + { + "dim_0": ( + "dim_0", + {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}, + ), + "dim_1": ( + "dim_0", + {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}, + ), + } + ) + assert coords.isdim("dim_0") + assert not coords.isdim("dim_1") + coords = xd.Coordinates() + assert coords == dict() + assert 
coords.dims == tuple() + + def test_first_last(self): + coords = xd.Coordinates({"dim_0": [1.0, 2.0, 3.0], "dim_1": [1.0, 2.0, 3.0]}) + assert coords["first"].dim == "dim_0" + assert coords["last"].dim == "dim_1" + + def test_setitem(self): + coords = xd.Coordinates() + coords["dim_0"] = [1, 2, 4] + assert coords.dims == ("dim_0",) + coords["dim_1"] = {"tie_indices": [0, 10], "tie_values": [0.0, 100.0]} + assert coords.dims == ("dim_0", "dim_1") + coords["dim_0"] = [1, 2, 3] + assert coords.dims == ("dim_0", "dim_1") + coords["metadata"] = 0 + assert coords.dims == ("dim_0", "dim_1") + coords["non-dimensional"] = ("dim_0", [-1, -1, -1]) + assert coords.dims == ("dim_0", "dim_1") + coords["other_dim"] = ("dim_2", [0]) + assert coords.dims == ("dim_0", "dim_1", "dim_2") + with pytest.raises(TypeError, match="must be of type str"): + coords[0] = ... + + def test_to_from_dict(self): + starttime = np.datetime64("2020-01-01T00:00:00.000") + endtime = np.datetime64("2020-01-01T00:00:10.000") + coords = { + "time": {"tie_indices": [0, 999], "tie_values": [starttime, endtime]}, + "distance": np.linspace(0, 1000, 3), + "channel": ("distance", ["DAS01", "DAS02", "DAS03"]), + "interrogator": (None, "SRN"), + } + coords = xd.Coordinates(coords) + assert xd.Coordinates.from_dict(coords.to_dict()).equals(coords) diff --git a/tests/coordinates/test_dense.py b/tests/coordinates/test_dense.py new file mode 100644 index 00000000..0524bac5 --- /dev/null +++ b/tests/coordinates/test_dense.py @@ -0,0 +1,137 @@ +import numpy as np +import pandas as pd +import pytest + +from xdas.coordinates import DenseCoordinate, ScalarCoordinate + + +class TestDenseCoordinate: + valid = [ + [1, 2, 3], + np.array([1, 2, 3]), + [1.0, 2.0, 3.0], + np.array([1.0, 2.0, 3.0]), + ["a", "b", "c"], + np.array(["a", "b", "c"]), + np.array([1, 2, 3], dtype="datetime64[s]"), + ] + invalid = [ + 1, + np.array(1), + 1.0, + np.array(1.0), + "label", + np.array("label"), + np.datetime64(1, "s"), + {"key": "value"}, + ] + + def test_isvalid(self): + for data in self.valid: + assert DenseCoordinate.isvalid(data) + for data in self.invalid: + assert not DenseCoordinate.isvalid(data) + + def test_init(self): + coord = DenseCoordinate([1, 2, 3]) + assert np.array_equiv(coord.data, [1, 2, 3]) + assert coord.dim is None + coord = DenseCoordinate([1, 2, 3], "dim") + assert coord.dim == "dim" + for data in self.valid: + assert np.array_equiv(DenseCoordinate(data).data, data) + for data in self.invalid: + with pytest.raises(TypeError): + DenseCoordinate(data) + + def test_getitem(self): + assert np.array_equiv(DenseCoordinate([1, 2, 3])[...].values, [1, 2, 3]) + assert isinstance(DenseCoordinate([1, 2, 3])[...], DenseCoordinate) + assert np.array_equiv(DenseCoordinate([1, 2, 3])[:].values, [1, 2, 3]) + assert isinstance(DenseCoordinate([1, 2, 3])[:], DenseCoordinate) + assert np.array_equiv(DenseCoordinate([1, 2, 3])[1].values, 2) + assert isinstance(DenseCoordinate([1, 2, 3])[1], ScalarCoordinate) + assert np.array_equiv(DenseCoordinate([1, 2, 3])[1:].values, [2, 3]) + assert isinstance(DenseCoordinate([1, 2, 3])[1:], DenseCoordinate) + + def test_len(self): + for data in self.valid: + assert len(DenseCoordinate(data)) == 3 + + def test_repr(self): + for data in self.valid: + assert DenseCoordinate(data).__repr__() == np.array2string( + np.asarray(data), threshold=0, edgeitems=1 + ) + + def test_array(self): + for data in self.valid: + assert np.array_equiv(DenseCoordinate(data).__array__(), data) + + def test_dtype(self): + for data in self.valid: + 
assert DenseCoordinate(data).dtype == np.array(data).dtype + + def test_values(self): + for data in self.valid: + assert np.array_equiv(DenseCoordinate(data).values, data) + + def test_index(self): + for data in self.valid: + assert DenseCoordinate(data).index.equals(pd.Index(data)) + + def test_equals(self): + for data in self.valid: + coord = DenseCoordinate(data) + assert coord.equals(coord) + assert DenseCoordinate([1, 2, 3]).equals(DenseCoordinate([1, 2, 3])) + + def test_isinstance(self): + assert not DenseCoordinate([1, 2, 3]).isscalar() + assert DenseCoordinate([1, 2, 3]).isdense() + assert not DenseCoordinate([1, 2, 3]).isinterp() + + def test_get_indexer(self): + assert DenseCoordinate([1, 2, 3]).get_indexer(2) == 1 + assert np.array_equiv(DenseCoordinate([1, 2, 3]).get_indexer([2, 3]), [1, 2]) + assert DenseCoordinate([1, 2, 3]).get_indexer(2.1, method="nearest") == 1 + assert DenseCoordinate([1, 2, 3]).get_indexer(2.1, method="ffill") == 1 + assert DenseCoordinate([1, 2, 3]).get_indexer(2.1, method="bfill") == 2 + + def test_get_slice_indexer(self): + assert np.array_equiv( + DenseCoordinate([1, 2, 3]).slice_indexer(start=2), slice(1, 3) + ) + + def test_to_index(self): + assert DenseCoordinate([1, 2, 3]).to_index(2) == 1 + assert np.array_equiv(DenseCoordinate([1, 2, 3]).to_index([2, 3]), [1, 2]) + assert np.array_equiv( + DenseCoordinate([1, 2, 3]).to_index(slice(2, None)), slice(1, 3) + ) + + def test_to_from_dict(self): + for data in self.valid: + coord = DenseCoordinate(data) + assert DenseCoordinate.from_dict(coord.to_dict()).equals(coord) + + def test_empty(self): + coord = DenseCoordinate() + assert coord.empty + + def test_append(self): + coord0 = DenseCoordinate() + coord1 = DenseCoordinate([1, 2, 3]) + coord2 = DenseCoordinate([4, 5, 6]) + + result = coord1.append(coord2) + expected = DenseCoordinate([1, 2, 3, 4, 5, 6]) + assert result.equals(expected) + + result = coord2.append(coord1) + expected = DenseCoordinate([4, 5, 6, 1, 2, 3]) + assert result.equals(expected) + + assert coord0.append(coord0).empty + assert coord0.append(coord1).equals(coord1) + assert coord1.append(coord0).equals(coord1) diff --git a/tests/coordinates/test_interp.py b/tests/coordinates/test_interp.py new file mode 100644 index 00000000..f9ef27a1 --- /dev/null +++ b/tests/coordinates/test_interp.py @@ -0,0 +1,314 @@ +import numpy as np +import pytest + +from xdas.coordinates import InterpCoordinate, ScalarCoordinate + + +class TestInterpCoordinate: + valid = [ + {"tie_indices": [], "tie_values": []}, + {"tie_indices": [0], "tie_values": [100.0]}, + {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}, + {"tie_indices": [0, 8], "tie_values": [100, 900]}, + { + "tie_indices": [0, 8], + "tie_values": [ + np.datetime64("2000-01-01T00:00:00"), + np.datetime64("2000-01-01T00:00:08"), + ], + }, + {"tie_indices": np.array([0, 8], dtype="int16"), "tie_values": [100.0, 900.0]}, + ] + invalid = [ + 1, + np.array(1), + 1.0, + np.array(1.0), + "label", + np.array("label"), + np.datetime64(1, "s"), + [1, 2, 3], + np.array([1, 2, 3]), + [1.0, 2.0, 3.0], + np.array([1.0, 2.0, 3.0]), + ["a", "b", "c"], + np.array(["a", "b", "c"]), + np.array([1, 2, 3], dtype="datetime64[s]"), + {"key": "value"}, + ] + error = [ + {"tie_indices": 0, "tie_values": [100.0]}, + {"tie_indices": [0], "tie_values": 100.0}, + {"tie_indices": [0, 7, 8], "tie_values": [100.0, 900.0]}, + {"tie_indices": [0.0, 8.0], "tie_values": [100.0, 900.0]}, + {"tie_indices": [1, 9], "tie_values": [100.0, 900.0]}, + {"tie_indices": [8, 0], 
"tie_values": [100.0, 900.0]}, + {"tie_indices": [8, 0], "tie_values": ["a", "b"]}, + ] + + def test_isvalid(self): + for data in self.valid: + assert InterpCoordinate.isvalid(data) + for data in self.invalid: + assert not InterpCoordinate.isvalid(data) + + def test_init(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert np.array_equiv(coord.data["tie_indices"], [0, 8]) + assert np.array_equiv(coord.data["tie_values"], [100.0, 900.0]) + assert coord.dim is None + coord = InterpCoordinate( + {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}, "dim" + ) + assert coord.dim == "dim" + for data in self.valid: + coord = InterpCoordinate(data) + assert np.array_equiv(coord.data["tie_indices"], data["tie_indices"]) + assert np.array_equiv(coord.data["tie_values"], data["tie_values"]) + for data in self.invalid: + with pytest.raises(TypeError): + InterpCoordinate(data) + for data in self.error: + with pytest.raises(ValueError): + InterpCoordinate(data) + + def test_len(self): + assert ( + len(InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]})) + == 9 + ) + assert len(InterpCoordinate(dict(tie_indices=[], tie_values=[]))) == 0 + + @pytest.mark.parametrize("valid_input", valid) + def test_repr(self, valid_input): + coord = InterpCoordinate(data=valid_input) + my_coord = repr(coord) + assert isinstance(my_coord, str) + + def test_equals(self): + coord1 = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + coord2 = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord1.equals(coord2) + + def test_getitem(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert isinstance(coord[0], ScalarCoordinate) + assert coord[0].values == 100.0 + assert coord[4].values == 500.0 + assert coord[8].values == 900.0 + assert coord[-1].values == 900.0 + assert coord[-2].values == 800.0 + assert np.allclose(coord[[1, 2, 3]].values, [200.0, 300.0, 400.0]) + with pytest.raises(IndexError): + coord[9] + coord[-9] + assert coord[0:2].equals( + InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[100.0, 200.0])) + ) + assert coord[:].equals(coord) + assert coord[6:3].equals(InterpCoordinate(dict(tie_indices=[], tie_values=[]))) + assert coord[1:2].equals( + InterpCoordinate(dict(tie_indices=[0], tie_values=[200.0])) + ) + assert coord[-3:-1].equals( + InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[700.0, 800.0])) + ) + + def test_setitem(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + with pytest.raises(TypeError): + coord[1] = 0 + coord[:] = 0 + + def test_asarray(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert np.allclose(np.asarray(coord), coord.values) + + def test_empty(self): + assert not InterpCoordinate( + {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]} + ).empty + assert InterpCoordinate(dict(tie_indices=[], tie_values=[])).empty + + def test_dtype(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord.dtype == np.float64 + + def test_ndim(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord.ndim == 1 + assert isinstance(coord.ndim, int) + + def test_shape(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord.shape == (9,) + + def test_format_index(self): + # TODO + pass + + def test_get_value(self): + 
coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord.get_value(0) == 100.0 + assert coord.get_value(4) == 500.0 + assert coord.get_value(8) == 900.0 + assert coord.get_value(-1) == 900.0 + assert coord.get_value(-9) == 100.0 + assert np.allclose(coord.get_value([1, 2, 3, -2]), [200.0, 300.0, 400.0, 800.0]) + # each invalid index gets its own raises block + with pytest.raises(IndexError): + coord.get_value(-10) + with pytest.raises(IndexError): + coord.get_value(9) + with pytest.raises(IndexError): + coord.get_value(0.5) + starttime = np.datetime64("2000-01-01T00:00:00") + endtime = np.datetime64("2000-01-01T00:00:08") + coord = InterpCoordinate( + dict(tie_indices=[0, 8], tie_values=[starttime, endtime]) + ) + assert coord.get_value(0) == starttime + assert coord.get_value(4) == np.datetime64("2000-01-01T00:00:04") + assert coord.get_value(8) == endtime + assert coord.get_value(-1) == endtime + assert coord.get_value(-9) == starttime + + def test_get_index(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord.get_indexer(100.0) == 0 + assert coord.get_indexer(900.0) == 8 + assert coord.get_indexer(0.0, "nearest") == 0 + assert coord.get_indexer(1000.0, "nearest") == 8 + assert coord.get_indexer(125.0, "nearest") == 0 + assert coord.get_indexer(175.0, "nearest") == 1 + assert coord.get_indexer(175.0, "ffill") == 0 + assert coord.get_indexer(200.0, "ffill") == 1 + assert coord.get_indexer(200.0, "bfill") == 1 + assert coord.get_indexer(125.0, "bfill") == 1 + assert np.all(np.equal(coord.get_indexer([100.0, 900.0]), [0, 8])) + # each lookup that must fail gets its own raises block + with pytest.raises(KeyError): + coord.get_indexer(0.0) + with pytest.raises(KeyError): + coord.get_indexer(1000.0) + with pytest.raises(KeyError): + coord.get_indexer(150.0) + with pytest.raises(KeyError): + coord.get_indexer(1000.0, "bfill") + with pytest.raises(KeyError): + coord.get_indexer(0.0, "ffill") + + starttime = np.datetime64("2000-01-01T00:00:00") + endtime = np.datetime64("2000-01-01T00:00:08") + coord = InterpCoordinate( + dict(tie_indices=[0, 8], tie_values=[starttime, endtime]) + ) + assert coord.get_indexer(starttime) == 0 + assert coord.get_indexer(endtime) == 8 + assert coord.get_indexer(str(starttime)) == 0 + assert coord.get_indexer(str(endtime)) == 8 + assert coord.get_indexer("2000-01-01T00:00:04.1", "nearest") == 4 + + def test_indices(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert np.all(np.equal(coord.indices, np.arange(9))) + + def test_values(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert np.allclose(coord.values, np.arange(100.0, 1000.0, 100.0)) + + def test_get_index_slice(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord.slice_indexer(100.0, 200.0) == slice(0, 2) + assert coord.slice_indexer(150.0, 250.0) == slice(1, 2) + assert coord.slice_indexer(300.0, 500.0) == slice(2, 5) + assert coord.slice_indexer(0.0, 500.0) == slice(0, 5) + assert coord.slice_indexer(125.0, 175.0) == slice(1, 1) + assert coord.slice_indexer(0.0, 50.0) == slice(0, 0) + assert coord.slice_indexer(1000.0, 1100.0) == slice(9, 9) + assert coord.slice_indexer(1000.0, 500.0) == slice(9, 5) + assert coord.slice_indexer(None, None) == slice(None, None) + + def test_slice_index(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord.slice_index(slice(0, 2)).equals( + InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[100.0, 200.0])) + ) + assert coord.slice_index(slice(7, None)).equals( + InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[800.0, 900.0])) + ) 
+ assert coord.slice_index(slice(None, None)).equals(coord) + assert coord.slice_index(slice(0, 0)).equals( + InterpCoordinate(dict(tie_indices=[], tie_values=[])) + ) + assert coord.slice_index(slice(4, 2)).equals( + InterpCoordinate(dict(tie_indices=[], tie_values=[])) + ) + assert coord.slice_index(slice(9, 9)).equals( + InterpCoordinate(dict(tie_indices=[], tie_values=[])) + ) + assert coord.slice_index(slice(3, 3)).equals( + InterpCoordinate(dict(tie_indices=[], tie_values=[])) + ) + assert coord.slice_index(slice(0, -1)).equals( + InterpCoordinate(dict(tie_indices=[0, 7], tie_values=[100.0, 800.0])) + ) + assert coord.slice_index(slice(0, -2)).equals( + InterpCoordinate(dict(tie_indices=[0, 6], tie_values=[100.0, 700.0])) + ) + assert coord.slice_index(slice(-2, None)).equals( + InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[800.0, 900.0])) + ) + assert coord.slice_index(slice(1, 2)).equals( + InterpCoordinate(dict(tie_indices=[0], tie_values=[200.0])) + ) + assert coord.slice_index(slice(1, 3, 2)).equals( + InterpCoordinate(dict(tie_indices=[0], tie_values=[200.0])) + ) + assert coord.slice_index(slice(None, None, 2)).equals( + InterpCoordinate(dict(tie_indices=[0, 4], tie_values=[100.0, 900.0])) + ) + assert coord.slice_index(slice(None, None, 3)).equals( + InterpCoordinate(dict(tie_indices=[0, 2], tie_values=[100.0, 700.0])) + ) + assert coord.slice_index(slice(None, None, 4)).equals( + InterpCoordinate(dict(tie_indices=[0, 2], tie_values=[100.0, 900.0])) + ) + assert coord.slice_index(slice(None, None, 5)).equals( + InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[100.0, 600.0])) + ) + assert coord.slice_index(slice(2, 7, 3)).equals( + InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[300.0, 600.0])) + ) + + def test_to_index(self): + # TODO + pass + + def test_simplify(self): + xp = np.sort(np.random.choice(10000, 1000, replace=False)) + xp[0] = 0 + xp[-1] = 10000 + yp = xp + (np.random.rand(1000) - 0.5) + coord = InterpCoordinate({"tie_indices": xp, "tie_values": yp}) + assert len(coord.simplify(1.0).tie_indices) == 2 + + def test_singleton(self): + coord = InterpCoordinate({"tie_indices": [0], "tie_values": [1.0]}) + assert coord[0].values == 1.0 + + def test_to_from_dict(self): + for data in self.valid: + coord = InterpCoordinate(data) + assert InterpCoordinate.from_dict(coord.to_dict()).equals(coord) + + def test_append(self): + coord0 = InterpCoordinate() + coord1 = InterpCoordinate({"tie_indices": [0, 2], "tie_values": [0, 20]}) + coord2 = InterpCoordinate({"tie_indices": [0, 2], "tie_values": [30, 50]}) + + result = coord1.append(coord2).simplify() + expected = InterpCoordinate({"tie_indices": [0, 5], "tie_values": [0, 50]}) + assert result.equals(expected) + + result = coord2.append(coord1).simplify() + expected = InterpCoordinate( + {"tie_indices": [0, 2, 3, 5], "tie_values": [30, 50, 0, 20]} + ) + assert result.equals(expected) + + assert coord0.append(coord0).empty + assert coord0.append(coord1).equals(coord1) + assert coord1.append(coord0).equals(coord1) diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py new file mode 100644 index 00000000..82d083d5 --- /dev/null +++ b/tests/coordinates/test_sampled.py @@ -0,0 +1,900 @@ +import tempfile + +import numpy as np +import pandas as pd +import pytest + +import xdas as xd +from xdas.coordinates import ( + Coordinate, + DenseCoordinate, + SampledCoordinate, + ScalarCoordinate, +) + + +class TestSampledCoordinateBasics: + def test_isvalid(self): + assert 
SampledCoordinate.isvalid( + {"tie_values": [0.0], "tie_lengths": [1], "sampling_interval": 1.0} + ) + assert SampledCoordinate.isvalid( + { + "tie_values": [np.datetime64("2000-01-01T00:00:00")], + "tie_lengths": [1], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + assert not SampledCoordinate.isvalid({"tie_values": [0.0], "tie_lengths": [1]}) + assert not SampledCoordinate.isvalid({}) + + def test_init_and_empty(self): + empty = SampledCoordinate() + assert empty.empty + assert len(empty) == 0 + assert empty.dtype is not None + assert empty.shape == (0,) + assert empty.ndim == 1 + assert empty.values.size == 0 + assert empty.indices.size == 0 + + def test_init_validation_numeric(self): + # valid numeric + coord = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + assert len(coord) == 3 + assert coord.start == 0.0 + assert coord.end == 3.0 + assert coord.issampled() + assert coord.get_sampling_interval() == 1.0 + + # mismatched lengths + with pytest.raises(ValueError): + SampledCoordinate( + { + "tie_values": [0.0, 10.0], + "tie_lengths": [3], + "sampling_interval": 1.0, + } + ) + # non-integer lengths + with pytest.raises(ValueError): + SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [1.5], "sampling_interval": 1.0} + ) + # non-positive lengths + with pytest.raises(ValueError): + SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [0], "sampling_interval": 1.0} + ) + # sampling interval must be scalar + with pytest.raises(ValueError): + SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": [1.0]} + ) + + # non-numeric tie_values + with pytest.raises(ValueError): + SampledCoordinate( + {"tie_values": ["a"], "tie_lengths": [3], "sampling_interval": 1.0} + ) + + def test_init_validation_datetime(self): + # valid datetime with timedelta sampling interval + t0 = np.datetime64("2000-01-01T00:00:00") + coord = SampledCoordinate( + { + "tie_values": [t0], + "tie_lengths": [2], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + assert coord.start == t0 + assert coord.end == t0 + np.timedelta64(2, "s") + assert coord.get_sampling_interval() == 1 + assert coord.get_sampling_interval(cast=False) == np.timedelta64(1, "s") + + # invalid: datetime with numeric sampling interval + with pytest.raises(ValueError): + SampledCoordinate( + {"tie_values": [t0], "tie_lengths": [2], "sampling_interval": 1} + ) + + def test_invalid_data(self): + # missing required keys + with pytest.raises(ValueError): + SampledCoordinate({"tie_values": [0.0], "tie_lengths": [3]}) + with pytest.raises(ValueError): + SampledCoordinate({"tie_lengths": [3], "sampling_interval": 1.0}) + with pytest.raises(ValueError): + SampledCoordinate({"tie_values": [0.0], "sampling_interval": 1.0}) + + def test_invalid_shapes(self): + # tie_values and tie_lengths must be 1D + with pytest.raises(ValueError): + SampledCoordinate( + { + "tie_values": [[0.0, 10.0]], + "tie_lengths": [3, 2], + "sampling_interval": 1.0, + } + ) + with pytest.raises(ValueError): + SampledCoordinate( + { + "tie_values": [0.0, 10.0], + "tie_lengths": [[3], [2]], + "sampling_interval": 1.0, + } + ) + + +class TestSampledCoordinateIndexing: + def make_coord(self): + # Two segments: [0,1,2] and [10,11] + return SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + + def test_len_indices_values(self): + coord = self.make_coord() + assert len(coord) == 5 + assert np.array_equal(coord.indices, np.arange(5)) + assert 
np.array_equal(coord.values, np.array([0.0, 1.0, 2.0, 10.0, 11.0])) + + def test_get_value_scalar_and_vector(self): + coord = self.make_coord() + # scalar + assert coord.get_value(0) == 0.0 + assert coord.get_value(1) == 1.0 + assert coord.get_value(2) == 2.0 + assert coord.get_value(3) == 10.0 + assert coord.get_value(4) == 11.0 + # negative index + assert coord.get_value(-1) == 11.0 + assert coord.get_value(-2) == 10.0 + assert coord.get_value(-3) == 2.0 + assert coord.get_value(-4) == 1.0 + assert coord.get_value(-5) == 0.0 + # vectorized + vals = coord.get_value([0, 1, 2, 3, 4, -5, -4, -3, -2, -1]) + assert np.array_equal( + vals, np.array([0.0, 1.0, 2.0, 10.0, 11.0, 0.0, 1.0, 2.0, 10.0, 11.0]) + ) + # bounds + with pytest.raises(IndexError): + coord.get_value(-6) + with pytest.raises(IndexError): + coord.get_value(5) + with pytest.raises(IndexError): + coord.get_value([0, 5]) + with pytest.raises(IndexError): + coord.get_value([-6, 0]) + + def test_values(self): + coord = self.make_coord() + expected = np.array([0.0, 1.0, 2.0, 10.0, 11.0]) + assert np.array_equal(coord.values, expected) + assert np.array_equal(coord.__array__(), expected) + assert np.array_equal(coord.__array__(dtype=expected.dtype), expected) + + def test_getitem(self): + coord = self.make_coord() + # scalar -> ScalarCoordinate + item = coord[1] + assert isinstance(item, ScalarCoordinate) + assert item.values == 1.0 + # slice -> SampledCoordinate or compatible + sub = coord[1:4] + assert isinstance(sub, SampledCoordinate) + assert np.array_equal(sub.values, np.array([1.0, 2.0, 10.0])) + # slice negative + sub_neg = coord[-4:-1] + assert isinstance(sub_neg, SampledCoordinate) + assert np.array_equal(sub_neg.values, np.array([1.0, 2.0, 10.0])) + # full slice + full = coord[:] + assert full.equals(coord) + # None bound indexing + none_start = coord[None:3] + assert isinstance(none_start, SampledCoordinate) + assert np.array_equal(none_start.values, np.array([0.0, 1.0, 2.0])) + none_end = coord[2:None] + assert isinstance(none_end, SampledCoordinate) + assert np.array_equal(none_end.values, np.array([2.0, 10.0, 11.0])) + # step slice -> SampledCoordinate + step = coord[::2] + assert isinstance(step, SampledCoordinate) + assert np.array_equal(step.values, np.array([0.0, 2.0, 11.0])) + # step slice with start/stop + step_ss = coord[1:5:2] + assert isinstance(step_ss, SampledCoordinate) + assert np.array_equal(step_ss.values, np.array([1.0, 10.0])) + # negative step slice with start/stop + step_ss_neg = coord[-4:-1:2] + assert isinstance(step_ss_neg, SampledCoordinate) + assert np.array_equal(step_ss_neg.values, np.array([1.0, 10.0])) + # negative step slice -> raise NotImplementedError + with pytest.raises(NotImplementedError): + coord[::-1] + # array -> DenseCoordinate of values + arr = coord[[0, 4]] + assert isinstance(arr, DenseCoordinate) + assert np.array_equal(arr.values, np.array([0.0, 11.0])) + # negative step is not implemented yet + with pytest.raises(NotImplementedError): + coord[4:0:-1] + + def test_repr(self): + # floating coord + floating = self.make_coord() + assert isinstance(repr(floating), str) + # integer coord + integer = SampledCoordinate( + {"tie_values": [0], "tie_lengths": [3], "sampling_interval": 1} + ) + assert isinstance(repr(integer), str) + # empty coord + empty = SampledCoordinate() + assert repr(empty) == "empty coordinate" + # singleton + singleton = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [1], "sampling_interval": 1.0} + ) + assert isinstance(repr(singleton), str) + # 
datetime coord + datetime = SampledCoordinate( + { + "tie_values": [np.datetime64("2000-01-01T00:00:00")], + "tie_lengths": [3], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + assert isinstance(repr(datetime), str) + + +class TestSampledCoordinateSliceEdgeCases: + def make_coord(self): + return SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + + def test_slice_negative_and_out_of_bounds(self): + coord = self.make_coord() + # negative slice indices + s = coord[-4:-1] + assert isinstance(s, SampledCoordinate) + assert np.array_equal(s.values, np.array([1.0, 2.0, 10.0])) + # slice that extends beyond bounds should clip + s2 = coord[-10:10] + assert s2.equals(coord) + + def test_slice_step_decimate(self): + coord = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [10], "sampling_interval": 1.0} + ) + stepped = coord[::2] + decimated = coord.decimate(2) + assert isinstance(stepped, SampledCoordinate) + assert decimated.equals(stepped) + + +class TestSampledCoordinateValueBasedIndexing: + def make_coord(self): + return SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) # two segments: [0, 1, 2] and [10, 11] + + def make_coord_datetime(self): + t0 = np.datetime64("2000-01-01T00:00:00") + return SampledCoordinate( + { + "tie_values": [t0, t0 + np.timedelta64(10, "s")], + "tie_lengths": [3, 2], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + + def test_get_indexer_exact(self): + # float + coord = self.make_coord() + assert coord.get_indexer(0.0, method=None) == 0 + assert coord.get_indexer(10.0, method=None) == 3 + with pytest.raises(KeyError): + coord.get_indexer(1.5, method=None) + with pytest.raises(KeyError): + coord.get_indexer(5.0, method=None) + + # datetime + coord = self.make_coord_datetime() + t0 = coord[0].values + assert coord.get_indexer(t0, method=None) == 0 + assert coord.get_indexer(t0 + np.timedelta64(10, "s"), method=None) == 3 + with pytest.raises(KeyError): + coord.get_indexer(t0 + np.timedelta64(1500, "ms"), method=None) + with pytest.raises(KeyError): + coord.get_indexer(t0 + np.timedelta64(5, "s"), method=None) + + def test_get_indexer_nearest(self): + # float + coord = self.make_coord() + vals = [0.0, 0.4, 0.6, 1.0, 10.4, 10.6, -10.0, 20.0, 5.9, 6.0, 6.1] + expected = [0, 0, 1, 1, 3, 4, 0, 4, 2, 3, 3] + # scalar + for v, e in zip(vals, expected): + idx = coord.get_indexer(v, method="nearest") + assert idx == e + # vectorized + idxs = coord.get_indexer(vals, method="nearest") + assert np.array_equal(idxs, np.array(expected)) + + # datetime + coord = self.make_coord_datetime() + t0 = coord[0].values + vals = t0 + np.rint(1000 * np.array(vals)).astype("timedelta64[ms]") + # scalar + for v, e in zip(vals, expected): + idx = coord.get_indexer(v, method="nearest") + assert idx == e + # vectorized + idxs = coord.get_indexer(vals, method="nearest") + assert np.array_equal(idxs, np.array(expected)) + + def test_get_indexer_ffill(self): + # float + coord = self.make_coord() + vals = [0.0, 0.4, 0.6, 1.0, 10.4, 10.6, 20.0, 5.9, 6.0, 6.1] + expected = [0, 0, 0, 1, 3, 3, 4, 2, 2, 2] + # scalar + for v, e in zip(vals, expected): + idx = coord.get_indexer(v, method="ffill") + assert idx == e + with pytest.raises(KeyError): + coord.get_indexer(-10.0, method="ffill") + # vectorized + idxs = coord.get_indexer(vals, method="ffill") + assert np.array_equal(idxs, np.array(expected)) + with pytest.raises(KeyError): + coord.get_indexer([-10.0, 0.0], method="ffill") + + 
# datetime + coord = self.make_coord_datetime() + t0 = coord[0].values + vals = t0 + np.rint(1000 * np.array(vals)).astype("timedelta64[ms]") + # scalar + for v, e in zip(vals, expected): + idx = coord.get_indexer(v, method="ffill") + assert idx == e + with pytest.raises(KeyError): + coord.get_indexer(t0 - np.timedelta64(10, "s"), method="ffill") + # vectorized + idxs = coord.get_indexer(vals, method="ffill") + assert np.array_equal(idxs, np.array(expected)) + with pytest.raises(KeyError): + coord.get_indexer([t0 - np.timedelta64(10, "s"), t0], method="ffill") + + def test_get_indexer_bfill(self): + # float + coord = self.make_coord() + vals = [0.0, 0.4, 0.6, 1.0, 10.4, 10.6, -10.0, 5.9, 6.0, 6.1] + expected = [0, 1, 1, 1, 4, 4, 0, 3, 3, 3] + # scalar + for v, e in zip(vals, expected): + idx = coord.get_indexer(v, method="bfill") + assert idx == e + with pytest.raises(KeyError): + coord.get_indexer(20.0, method="bfill") + # vectorized + idxs = coord.get_indexer(vals, method="bfill") + assert np.array_equal(idxs, np.array(expected)) + with pytest.raises(KeyError): + coord.get_indexer([11.0, 20.0], method="bfill") + + # datetime + coord = self.make_coord_datetime() + t0 = coord[0].values + vals = t0 + np.rint(1000 * np.array(vals)).astype("timedelta64[ms]") + # scalar + for v, e in zip(vals, expected): + idx = coord.get_indexer(v, method="bfill") + assert idx == e + with pytest.raises(KeyError): + coord.get_indexer(t0 + np.timedelta64(20, "s"), method="bfill") + # vectorized + idxs = coord.get_indexer(vals, method="bfill") + assert np.array_equal(idxs, np.array(expected)) + with pytest.raises(KeyError): + coord.get_indexer([t0, t0 + np.timedelta64(20, "s")], method="bfill") + + def test_get_indexer_overlap(self): + coord = SampledCoordinate( + {"tie_values": [0.0, 2.0], "tie_lengths": [3, 3], "sampling_interval": 1.0} + ) # segments: [0,1,2] and [2,3,4] + assert coord.get_indexer(1.0) == 1 + assert coord.get_indexer(3.0) == 4 + with pytest.raises(KeyError): + coord.get_indexer(2.0) + coord = SampledCoordinate( + {"tie_values": [0.0, 2.0], "tie_lengths": [5, 5], "sampling_interval": 1.0} + ) # segments: [0,1,2,3,4] and [2,3,4,5,6] + assert coord.get_indexer(1.0) == 1 + assert coord.get_indexer(6.0) == 9 + with pytest.raises(KeyError): + coord.get_indexer(2.0) + with pytest.raises(KeyError): + coord.get_indexer(2.5, method="nearest") + with pytest.raises(KeyError): + coord.get_indexer(4.0) + + def test_get_indexer_invalid_method(self): + coord = self.make_coord() + with pytest.raises(ValueError): + coord.get_indexer(0.0, method="invalid") + + +class TestSampledCoordinateAppend: + def test_append_two_coords(self): + coord1 = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + coord2 = SampledCoordinate( + {"tie_values": [10.0], "tie_lengths": [2], "sampling_interval": 1.0} + ) + expected = SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + result = coord1.append(coord2) + assert result.equals(expected) + + def test_append_two_datetime_coords(self): + coord1 = SampledCoordinate( + { + "tie_values": [np.datetime64("2000-01-01T00:00:00")], + "tie_lengths": [3], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + coord2 = SampledCoordinate( + { + "tie_values": [np.datetime64("2000-01-01T00:00:10")], + "tie_lengths": [2], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + expected = SampledCoordinate( + { + "tie_values": [ + np.datetime64("2000-01-01T00:00:00"), + 
np.datetime64("2000-01-01T00:00:10"), + ], + "tie_lengths": [3, 2], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + result = coord1.append(coord2) + assert result.equals(expected) + + def test_append_empty(self): + coord1 = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + coord2 = SampledCoordinate() + assert coord1.append(coord2).equals(coord1) + assert coord2.append(coord1).equals(coord1) + + def test_append_sampling_interval_mismatch(self): + coord1 = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + coord2 = SampledCoordinate( + {"tie_values": [10.0], "tie_lengths": [2], "sampling_interval": 2.0} + ) + with pytest.raises(ValueError): + coord1.append(coord2) + + def test_append_dtype_mismatch(self): + coord1 = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + coord2 = SampledCoordinate( + { + "tie_values": [np.datetime64("2000-01-01T00:00:00")], + "tie_lengths": [1], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + with pytest.raises(ValueError): + coord1.append(coord2) + + def test_append_type_mismatch(self): + coord1 = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + coord2 = DenseCoordinate(np.array([10.0, 11.0])) + with pytest.raises(TypeError): + coord1.append(coord2) + + def test_append_dimension_mismatch(self): + coord1 = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0}, + dim="time", + ) + coord2 = SampledCoordinate( + {"tie_values": [10.0], "tie_lengths": [2], "sampling_interval": 1.0}, + dim="depth", + ) + with pytest.raises(ValueError): + coord1.append(coord2) + + +class TestSampledCoordinateDiscontinuitiesAvailabilities: + def test_discontinuities_and_availabilities(self): + # tie_lengths set to create 2 segments + coord = SampledCoordinate( + {"tie_values": [0.0, 5.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + dis = coord.get_discontinuities() + avail = coord.get_availabilities() + # expect DataFrame with specific columns + for df in (dis, avail): + assert isinstance(df, pd.DataFrame) + assert set(df.columns) >= { + "start_index", + "end_index", + "start_value", + "end_value", + "delta", + "type", + } + # availabilities should list segments (2 segments -> 2 records) + assert len(avail) >= 1 + + +class TestSampledCoordinateToDatasetAndDict: + def test_to_dict_contains_expected_keys(self): + coord = SampledCoordinate( + { + "tie_values": [0.0, 10.0], + "tie_lengths": [3, 2], + "sampling_interval": 1.0, + }, + dim="time", + ) + d = coord.to_dict() + assert "dim" in d + assert "data" in d + assert set(d["data"].keys()) >= { + "tie_values", + "tie_lengths", + "sampling_interval", + } + + def test_to_dict_with_datetime(self): + t0 = np.datetime64("2000-01-01T00:00:00") + coord = SampledCoordinate( + { + "tie_values": [t0, t0 + np.timedelta64(10, "s")], + "tie_lengths": [3, 2], + "sampling_interval": np.timedelta64(1, "s"), + }, + dim="time", + ) + d = coord.to_dict() + assert "dim" in d + assert "data" in d + assert set(d["data"].keys()) >= { + "tie_values", + "tie_lengths", + "sampling_interval", + } + + +class TestSampledCoordinateSlicing: + def make_coord(self): + # Two segments: [0,1,2] and [10,11] + return SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + + def test_slice_within_segment(self): + coord = self.make_coord() + sliced = coord[0:2] + assert isinstance(sliced, 
SampledCoordinate) + assert len(sliced) == 2 + assert np.array_equal(sliced.values, np.array([0.0, 1.0])) + + def test_slice_cross_segments(self): + coord = self.make_coord() + sliced = coord[1:4] + assert isinstance(sliced, SampledCoordinate) + assert len(sliced) == 3 + assert np.array_equal(sliced.values, np.array([1.0, 2.0, 10.0])) + + def test_slice_full(self): + coord = self.make_coord() + sliced = coord[:] + assert sliced.equals(coord) + + +class TestSampledCoordinateDecimate: + def test_decimate(self): + coord = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [10], "sampling_interval": 1.0} + ) + decimated = coord.decimate(2) + assert decimated.sampling_interval == 2.0 + assert decimated.tie_lengths[0] == 5 # (10 + 2 - 1) // 2 = 5 + + +class TestSampledCoordinateSimplify: + def test_simplify_continuous(self): + # Two continuous segments should merge + coord = SampledCoordinate( + { + "tie_values": [0.0, 3.0], + "tie_lengths": [3, 2], + "sampling_interval": 1.0, + } + ) + result = coord.simplify() + expected = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [5], "sampling_interval": 1.0} + ) + assert result.equals(expected) + + def test_simplify_with_tolerance(self): + # Two nearly continuous segments should merge with tolerance + coord = SampledCoordinate( + { + "tie_values": [0.0, 3.1], + "tie_lengths": [3, 2], + "sampling_interval": 1.0, + } + ) + result = coord.simplify(tolerance=0.2) + expected = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [5], "sampling_interval": 1.0} + ) + assert result.equals(expected) + # more advanced test + coord = SampledCoordinate( + { + "tie_values": 10 * np.arange(100) + np.random.rand(100) * 0.2 - 0.1, + "tie_lengths": 10 * np.ones(100, dtype=int), + "sampling_interval": 1.0, + } + ) + result = coord.simplify(tolerance=0.2) + assert len(result.tie_values) == 1 + # extra test + coord = SampledCoordinate( + { + "tie_values": 10 * np.arange(100) + np.random.rand(100) * 0.2 - 0.1, + "tie_lengths": 10 * np.ones(100, dtype=int), + "sampling_interval": 1.0, + } + ) + result = coord.simplify(tolerance=0.1) + assert np.all(np.abs(result.values - coord.values) <= 0.1) + + +class TestSampledCoordinateGetIndexer: + def make_coord(self): + return SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + + def test_get_indexer_exact(self): + coord = self.make_coord() + idx = coord.get_indexer(0.0, method="nearest") + assert idx == 0 + idx = coord.get_indexer(10.0, method="nearest") + assert idx == 3 + + def test_get_indexer_nearest(self): + coord = self.make_coord() + idx = coord.get_indexer(0.5, method="nearest") + assert idx in [0, 1] + + def test_get_indexer_out_of_bounds(self): + coord = self.make_coord() + with pytest.raises(KeyError): + coord.get_indexer(100.0) + + +class TestSampledCoordinateArithmetic: + def test_add(self): + coord = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + result = coord + 10.0 + assert result.tie_values[0] == 10.0 + assert np.array_equal(result.values, np.array([10.0, 11.0, 12.0])) + + def test_sub(self): + coord = SampledCoordinate( + {"tie_values": [10.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + result = coord - 5.0 + assert result.tie_values[0] == 5.0 + assert np.array_equal(result.values, np.array([5.0, 6.0, 7.0])) + + +class TestSampledCoordinateSerialization: + def test_to_from_dict(self): + coord = SampledCoordinate( + { + "tie_values": [0.0, 10.0], + "tie_lengths": [3, 2], + 
"sampling_interval": 1.0, + }, + dim="time", + ) + d = coord.to_dict() + # round-trip via Coordinate factory + back = Coordinate.from_dict(d) + assert isinstance(back, SampledCoordinate) + assert back.equals(coord) + + +class TestSampledCoordinateDatetime: + def make_dt_coord(self): + t0 = np.datetime64("2000-01-01T00:00:00") + return SampledCoordinate( + { + "tie_values": [t0, t0 + np.timedelta64(10, "s")], + "tie_lengths": [3, 2], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + + def test_datetime_values_and_dtype(self): + coord = self.make_dt_coord() + assert np.issubdtype(coord.dtype, np.datetime64) + vals = coord.values + assert np.issubdtype(vals.dtype, np.datetime64) + assert vals[0] == np.datetime64("2000-01-01T00:00:00") + assert vals[3] == np.datetime64("2000-01-01T00:00:10") + + def test_get_value_datetime(self): + coord = self.make_dt_coord() + assert coord.get_value(1) == np.datetime64("2000-01-01T00:00:01") + assert coord.get_value(4) == np.datetime64("2000-01-01T00:00:11") + with pytest.raises(IndexError): + coord.get_value(5) + + def test_get_indexer_datetime_methods(self): + coord = self.make_dt_coord() + t = np.datetime64("2000-01-01T00:00:01.500") + # exact required when method=None -> should raise + with pytest.raises(KeyError): + coord.get_indexer(t) + # method variants + assert coord.get_indexer(t, method="nearest") in [1, 2] + assert coord.get_indexer(t, method="ffill") == 1 + assert coord.get_indexer(t, method="bfill") == 2 + # bounds + with pytest.raises(KeyError): + coord.get_indexer(np.datetime64("1999-12-31T23:59:59")) + with pytest.raises(KeyError): + coord.get_indexer(np.datetime64("2000-01-01T00:00:12")) + # string input + assert coord.get_indexer("2000-01-01T00:00:01.500", method="nearest") in [1, 2] + # invalid method + with pytest.raises(ValueError): + coord.get_indexer(t, method="bad") + + def test_start_end_properties_datetime(self): + coord = self.make_dt_coord() + assert coord.start == np.datetime64("2000-01-01T00:00:00") + # end is last tie_value + sampling_interval * last_length + assert coord.end == np.datetime64("2000-01-01T00:00:12") + + +class TestSampledCoordinateIndexerEdgeCases: + def test_invalid_method_raises(self): + coord = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + with pytest.raises(ValueError): + coord.get_indexer(0.0, method="bad") + + def test_non_increasing_tie_values_raises(self): + coord = SampledCoordinate( + {"tie_values": [2.0, 1.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + with pytest.raises(ValueError): + coord.get_indexer(2.0) + + +class TestSampledCoordinateToNetCDF: + def make_dataarray(self): + return xd.DataArray( + np.random.rand(20, 30), + { + "time": { + "tie_values": [ + np.datetime64("2000-01-01T00:00:00.000000000"), + np.datetime64("2000-01-01T00:00:10.000000000"), + ], + "tie_lengths": [5, 15], + "sampling_interval": np.timedelta64(1_000_000_000, "ns").astype( + "timedelta64[ns]" + ), + }, + "distance": { + "tie_values": [0.0], + "tie_lengths": [30], + "sampling_interval": 1.0, + }, + }, + ) + + def test_to_dataset_and_back(self): + import xarray as xr + + da = self.make_dataarray() + dataset = xr.Dataset() + variable_attrs = {} + + # prepare metadata + for coord in da.coords.values(): + dataset, variable_attrs = coord.to_dataset(dataset, variable_attrs) + + dataset["data"] = xr.DataArray(attrs=variable_attrs) + coords = xd.Coordinates.from_dataset(dataset, "data") + + assert coords.equals(da.coords) + + def test_to_netcdf_and_back(self): + 
expected = self.make_dataarray() + + with tempfile.NamedTemporaryFile(suffix=".nc", delete=False) as file: + expected.to_netcdf(file.name) + result = xd.open_dataarray(file.name) + assert result.equals(expected) + + +class TestGetSplitIndices: + def test_get_split_indices_no_tolerance(self): + coord = SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + div_points = coord.get_split_indices() + expected = np.array([3]) # indices where segments end + assert np.array_equal(div_points, expected) + + def test_get_split_indices_with_tolerance(self): + coord = SampledCoordinate( + { + "tie_values": [0.0, 3.1, 10.0], + "tie_lengths": [3, 2, 2], + "sampling_interval": 1.0, + } + ) + div_points = coord.get_split_indices(tolerance=0.2) + expected = np.array([5]) # only the second gap exceeds tolerance + assert np.array_equal(div_points, expected) + + +class TestFromBlock: + def test_from_block(self): + result = SampledCoordinate.from_block(start=0.0, size=5, step=1.0) + expected = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [5], "sampling_interval": 1.0} + ) + assert result.equals(expected) + + +class TestNotImplementedMethods: + def test_raises(self): + coord = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + with pytest.raises(NotImplementedError): + coord.__array_ufunc__(None, None) + with pytest.raises(NotImplementedError): + coord.__array_function__(None, None, None, None) + with pytest.raises(NotImplementedError): + coord.from_array(None) diff --git a/tests/coordinates/test_scalar.py b/tests/coordinates/test_scalar.py new file mode 100644 index 00000000..9060f60d --- /dev/null +++ b/tests/coordinates/test_scalar.py @@ -0,0 +1,90 @@ +import numpy as np +import pytest + +from xdas.coordinates import ScalarCoordinate + + +class TestScalarCoordinate: + valid = [ + 1, + np.array(1), + 1.0, + np.array(1.0), + "label", + np.array("label"), + np.datetime64(1, "s"), + ] + invalid = [[1], np.array([1]), {"key": "value"}] + + def test_isvalid(self): + for data in self.valid: + assert ScalarCoordinate.isvalid(data) + for data in self.invalid: + assert not ScalarCoordinate.isvalid(data) + + def test_init(self): + coord = ScalarCoordinate(1) + assert coord.data == 1 + assert coord.dim is None + coord = ScalarCoordinate(1, None) + assert coord.dim is None + with pytest.raises(ValueError): + ScalarCoordinate(1, "dim") + for data in self.valid: + assert ScalarCoordinate(data).data == np.array(data) + for data in self.invalid: + with pytest.raises(TypeError): + ScalarCoordinate(data) + + def test_getitem(self): + assert ScalarCoordinate(1)[...].equals(ScalarCoordinate(1)) + with pytest.raises(IndexError): + ScalarCoordinate(1)[:] + with pytest.raises(IndexError): + ScalarCoordinate(1)[0] + + def test_len(self): + with pytest.raises(TypeError): + len(ScalarCoordinate(1)) + + def test_repr(self): + for data in self.valid: + assert ScalarCoordinate(data).__repr__() == np.array2string( + np.asarray(data), threshold=0, edgeitems=1 + ) + + def test_array(self): + for data in self.valid: + assert ScalarCoordinate(data).__array__() == np.array(data) + + def test_dtype(self): + for data in self.valid: + assert ScalarCoordinate(data).dtype == np.array(data).dtype + + def test_values(self): + for data in self.valid: + assert ScalarCoordinate(data).values == np.array(data) + + def test_equals(self): + for data in self.valid: + coord = ScalarCoordinate(data) + assert coord.equals(coord) + assert 
ScalarCoordinate(1).equals(ScalarCoordinate(np.array(1))) + + def test_to_index(self): + with pytest.raises(NotImplementedError): + ScalarCoordinate(1).to_index("item") + + def test_isinstance(self): + assert ScalarCoordinate(1).isscalar() + assert not ScalarCoordinate(1).isdense() + assert not ScalarCoordinate(1).isinterp() + + def test_to_from_dict(self): + for data in self.valid: + coord = ScalarCoordinate(data) + assert ScalarCoordinate.from_dict(coord.to_dict()).equals(coord) + + def test_empty(self): + with pytest.raises(TypeError, match="cannot be empty"): + ScalarCoordinate() diff --git a/tests/test_coordinates.py b/tests/test_coordinates.py deleted file mode 100644 index 0b295fa7..00000000 --- a/tests/test_coordinates.py +++ /dev/null @@ -1,681 +0,0 @@ -import numpy as np -import pandas as pd -import pytest - -import xdas -from xdas.core.coordinates import DenseCoordinate, InterpCoordinate, ScalarCoordinate - - -class TestScalarCoordinate: - valid = [ - 1, - np.array(1), - 1.0, - np.array(1.0), - "label", - np.array("label"), - np.datetime64(1, "s"), - ] - invalid = [[1], np.array([1]), {"key": "value"}] - - def test_isvalid(self): - for data in self.valid: - assert ScalarCoordinate.isvalid(data) - for data in self.invalid: - assert not ScalarCoordinate.isvalid(data) - - def test_init(self): - coord = ScalarCoordinate(1) - assert coord.data == 1 - assert coord.dim is None - coord = ScalarCoordinate(1, None) - assert coord.dim is None - with pytest.raises(ValueError): - ScalarCoordinate(1, "dim") - for data in self.valid: - assert ScalarCoordinate(data).data == np.array(data) - for data in self.invalid: - with pytest.raises(TypeError): - ScalarCoordinate(data) - - def test_getitem(self): - assert ScalarCoordinate(1)[...].equals(ScalarCoordinate(1)) - with pytest.raises(IndexError): - ScalarCoordinate(1)[:] - with pytest.raises(IndexError): - ScalarCoordinate(1)[0] - - def test_len(self): - with pytest.raises(TypeError): - len(ScalarCoordinate(1)) - - def test_repr(self): - for data in self.valid: - assert ScalarCoordinate(data).__repr__() == np.array2string( - np.asarray(data), threshold=0, edgeitems=1 - ) - - def test_array(self): - for data in self.valid: - assert ScalarCoordinate(data).__array__() == np.array(data) - - def test_dtype(self): - for data in self.valid: - assert ScalarCoordinate(data).dtype == np.array(data).dtype - - def test_values(self): - for data in self.valid: - assert ScalarCoordinate(data).values == np.array(data) - - def test_equals(self): - for data in self.valid: - coord = ScalarCoordinate(data) - assert coord.equals(coord) - assert ScalarCoordinate(1).equals(ScalarCoordinate(np.array(1))) - - def test_to_index(self): - with pytest.raises(NotImplementedError): - ScalarCoordinate(1).to_index("item") - - def test_isinstance(self): - assert ScalarCoordinate(1).isscalar() - assert not ScalarCoordinate(1).isdense() - assert not ScalarCoordinate(1).isinterp() - - def test_to_from_dict(self): - for data in self.valid: - coord = ScalarCoordinate(data) - assert ScalarCoordinate.from_dict(coord.to_dict()).equals(coord) - - def test_empty(self): - with pytest.raises(TypeError, match="cannot be empty"): - ScalarCoordinate() - - -class TestDenseCoordinate: - valid = [ - [1, 2, 3], - np.array([1, 2, 3]), - [1.0, 2.0, 3.0], - np.array([1.0, 2.0, 3.0]), - ["a", "b", "c"], - np.array(["a", "b", "c"]), - np.array([1, 2, 3], dtype="datetime64[s]"), - ] - invalid = [ - 1, - np.array(1), - 1.0, - np.array(1.0), - "label", - np.array("label"), - np.datetime64(1, "s"), - 
{"key": "value"}, - ] - - def test_isvalid(self): - for data in self.valid: - assert DenseCoordinate.isvalid(data) - for data in self.invalid: - assert not DenseCoordinate.isvalid(data) - - def test_init(self): - coord = DenseCoordinate([1, 2, 3]) - assert np.array_equiv(coord.data, [1, 2, 3]) - assert coord.dim is None - coord = DenseCoordinate([1, 2, 3], "dim") - assert coord.dim == "dim" - for data in self.valid: - assert np.array_equiv(DenseCoordinate(data).data, data) - for data in self.invalid: - with pytest.raises(TypeError): - DenseCoordinate(data) - - def test_getitem(self): - assert np.array_equiv(DenseCoordinate([1, 2, 3])[...].values, [1, 2, 3]) - assert isinstance(DenseCoordinate([1, 2, 3])[...], DenseCoordinate) - assert np.array_equiv(DenseCoordinate([1, 2, 3])[:].values, [1, 2, 3]) - assert isinstance(DenseCoordinate([1, 2, 3])[:], DenseCoordinate) - assert np.array_equiv(DenseCoordinate([1, 2, 3])[1].values, 2) - assert isinstance(DenseCoordinate([1, 2, 3])[1], ScalarCoordinate) - assert np.array_equiv(DenseCoordinate([1, 2, 3])[1:].values, [2, 3]) - assert isinstance(DenseCoordinate([1, 2, 3])[1:], DenseCoordinate) - - def test_len(self): - for data in self.valid: - assert len(DenseCoordinate(data)) == 3 - - def test_repr(self): - for data in self.valid: - assert DenseCoordinate(data).__repr__() == np.array2string( - np.asarray(data), threshold=0, edgeitems=1 - ) - - def test_array(self): - for data in self.valid: - assert np.array_equiv(DenseCoordinate(data).__array__(), data) - - def test_dtype(self): - for data in self.valid: - assert DenseCoordinate(data).dtype == np.array(data).dtype - - def test_values(self): - for data in self.valid: - assert np.array_equiv(DenseCoordinate(data).values, data) - - def test_index(self): - for data in self.valid: - assert DenseCoordinate(data).index.equals(pd.Index(data)) - - def test_equals(self): - for data in self.valid: - coord = DenseCoordinate(data) - assert coord.equals(coord) - assert DenseCoordinate([1, 2, 3]).equals(DenseCoordinate([1, 2, 3])) - - def test_isinstance(self): - assert not DenseCoordinate([1, 2, 3]).isscalar() - assert DenseCoordinate([1, 2, 3]).isdense() - assert not DenseCoordinate([1, 2, 3]).isinterp() - - def test_get_indexer(self): - assert DenseCoordinate([1, 2, 3]).get_indexer(2) == 1 - assert np.array_equiv(DenseCoordinate([1, 2, 3]).get_indexer([2, 3]), [1, 2]) - assert DenseCoordinate([1, 2, 3]).get_indexer(2.1, method="nearest") == 1 - assert DenseCoordinate([1, 2, 3]).get_indexer(2.1, method="ffill") == 1 - assert DenseCoordinate([1, 2, 3]).get_indexer(2.1, method="bfill") == 2 - - def test_get_slice_indexer(self): - assert np.array_equiv( - DenseCoordinate([1, 2, 3]).slice_indexer(start=2), slice(1, 3) - ) - - def test_to_index(self): - assert DenseCoordinate([1, 2, 3]).to_index(2) == 1 - assert np.array_equiv(DenseCoordinate([1, 2, 3]).to_index([2, 3]), [1, 2]) - assert np.array_equiv( - DenseCoordinate([1, 2, 3]).to_index(slice(2, None)), slice(1, 3) - ) - - def test_to_from_dict(self): - for data in self.valid: - coord = DenseCoordinate(data) - assert DenseCoordinate.from_dict(coord.to_dict()).equals(coord) - - def test_empty(self): - coord = DenseCoordinate() - assert coord.empty - - def test_append(self): - coord0 = DenseCoordinate() - coord1 = DenseCoordinate([1, 2, 3]) - coord2 = DenseCoordinate([4, 5, 6]) - - result = coord1.append(coord2) - expected = DenseCoordinate([1, 2, 3, 4, 5, 6]) - assert result.equals(expected) - - result = coord2.append(coord1) - expected = DenseCoordinate([4, 5, 
6, 1, 2, 3]) - assert result.equals(expected) - - assert coord0.append(coord0).empty - assert coord0.append(coord1).equals(coord1) - assert coord1.append(coord0).equals(coord1) - - -class TestInterpCoordinate: - valid = [ - {"tie_indices": [], "tie_values": []}, - {"tie_indices": [0], "tie_values": [100.0]}, - {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}, - {"tie_indices": [0, 8], "tie_values": [100, 900]}, - { - "tie_indices": [0, 8], - "tie_values": [ - np.datetime64("2000-01-01T00:00:00"), - np.datetime64("2000-01-01T00:00:08"), - ], - }, - {"tie_indices": np.array([0, 8], dtype="int16"), "tie_values": [100.0, 900.0]}, - ] - invalid = [ - 1, - np.array(1), - 1.0, - np.array(1.0), - "label", - np.array("label"), - np.datetime64(1, "s"), - [1, 2, 3], - np.array([1, 2, 3]), - [1.0, 2.0, 3.0], - np.array([1.0, 2.0, 3.0]), - ["a", "b", "c"], - np.array(["a", "b", "c"]), - np.array([1, 2, 3], dtype="datetime64[s]"), - {"key": "value"}, - ] - error = [ - {"tie_indices": 0, "tie_values": [100.0]}, - {"tie_indices": [0], "tie_values": 100.0}, - {"tie_indices": [0, 7, 8], "tie_values": [100.0, 900.0]}, - {"tie_indices": [0.0, 8.0], "tie_values": [100.0, 900.0]}, - {"tie_indices": [1, 9], "tie_values": [100.0, 900.0]}, - {"tie_indices": [8, 0], "tie_values": [100.0, 900.0]}, - {"tie_indices": [8, 0], "tie_values": ["a", "b"]}, - ] - - def test_isvalid(self): - for data in self.valid: - assert InterpCoordinate.isvalid(data) - for data in self.invalid: - assert not InterpCoordinate.isvalid(data) - - def test_init(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert np.array_equiv(coord.data["tie_indices"], [0, 8]) - assert np.array_equiv(coord.data["tie_values"], [100.0, 900.0]) - assert coord.dim is None - coord = InterpCoordinate( - {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}, "dim" - ) - assert coord.dim == "dim" - for data in self.valid: - coord = InterpCoordinate(data) - assert np.array_equiv(coord.data["tie_indices"], data["tie_indices"]) - assert np.array_equiv(coord.data["tie_values"], data["tie_values"]) - for data in self.invalid: - with pytest.raises(TypeError): - InterpCoordinate(data) - for data in self.error: - with pytest.raises(ValueError): - InterpCoordinate(data) - - def test_len(self): - assert ( - len(InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]})) - == 9 - ) - assert len(InterpCoordinate(dict(tie_indices=[], tie_values=[]))) == 0 - - @pytest.mark.parametrize("valid_input", valid) - def test_repr(self, valid_input): - coord = InterpCoordinate(data=valid_input) - my_coord = repr(coord) - assert isinstance(my_coord, str) - - def test_equals(self): - coord1 = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - coord2 = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord1.equals(coord2) - - def test_getitem(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert isinstance(coord[0], ScalarCoordinate) - assert coord[0].values == 100.0 - assert coord[4].values == 500.0 - assert coord[8].values == 900.0 - assert coord[-1].values == 900.0 - assert coord[-2].values == 800.0 - assert np.allclose(coord[[1, 2, 3]].values, [200.0, 300.0, 400.0]) - with pytest.raises(IndexError): - coord[9] - coord[-9] - assert coord[0:2].equals( - InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[100.0, 200.0])) - ) - assert coord[:].equals(coord) - assert coord[6:3].equals(InterpCoordinate(dict(tie_indices=[], 
tie_values=[]))) - assert coord[1:2].equals( - InterpCoordinate(dict(tie_indices=[0], tie_values=[200.0])) - ) - assert coord[-3:-1].equals( - InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[700.0, 800.0])) - ) - - def test_setitem(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - with pytest.raises(TypeError): - coord[1] = 0 - coord[:] = 0 - - def test_asarray(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert np.allclose(np.asarray(coord), coord.values) - - def test_empty(self): - assert not InterpCoordinate( - {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]} - ).empty - assert InterpCoordinate(dict(tie_indices=[], tie_values=[])).empty - - def test_dtype(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord.dtype == np.float64 - - def test_ndim(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord.ndim == 1 - assert isinstance(coord.ndim, int) - - def test_shape(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord.shape == (9,) - - def test_format_index(self): - # TODO - pass - - def test_format_index_slice(self): - # TODO - pass - - def test_get_value(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord.get_value(0) == 100.0 - assert coord.get_value(4) == 500.0 - assert coord.get_value(8) == 900.0 - assert coord.get_value(-1) == 900.0 - assert coord.get_value(-9) == 100.0 - assert np.allclose(coord.get_value([1, 2, 3, -2]), [200.0, 300.0, 400.0, 800.0]) - with pytest.raises(IndexError): - coord.get_value(-10) - coord.get_value(9) - coord.get_value(0.5) - starttime = np.datetime64("2000-01-01T00:00:00") - endtime = np.datetime64("2000-01-01T00:00:08") - coord = InterpCoordinate( - dict(tie_indices=[0, 8], tie_values=[starttime, endtime]) - ) - assert coord.get_value(0) == starttime - assert coord.get_value(4) == np.datetime64("2000-01-01T00:00:04") - assert coord.get_value(8) == endtime - assert coord.get_value(-1) == endtime - assert coord.get_value(-9) == starttime - - def test_get_index(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord.get_indexer(100.0) == 0 - assert coord.get_indexer(900.0) == 8 - assert coord.get_indexer(0.0, "nearest") == 0 - assert coord.get_indexer(1000.0, "nearest") == 8 - assert coord.get_indexer(125.0, "nearest") == 0 - assert coord.get_indexer(175.0, "nearest") == 1 - assert coord.get_indexer(175.0, "ffill") == 0 - assert coord.get_indexer(200.0, "ffill") == 1 - assert coord.get_indexer(200.0, "bfill") == 1 - assert coord.get_indexer(125.0, "bfill") == 1 - assert np.all(np.equal(coord.get_indexer([100.0, 900.0]), [0, 8])) - with pytest.raises(KeyError): - assert coord.get_indexer(0.0) == 0 - assert coord.get_indexer(1000.0) == 8 - assert coord.get_indexer(150.0) == 0 - assert coord.get_indexer(1000.0, "bfill") == 8 - assert coord.get_indexer(0.0, "ffill") == 0 - - starttime = np.datetime64("2000-01-01T00:00:00") - endtime = np.datetime64("2000-01-01T00:00:08") - coord = InterpCoordinate( - dict(tie_indices=[0, 8], tie_values=[starttime, endtime]) - ) - assert coord.get_indexer(starttime) == 0 - assert coord.get_indexer(endtime) == 8 - assert coord.get_indexer(str(starttime)) == 0 - assert coord.get_indexer(str(endtime)) == 8 - assert coord.get_indexer("2000-01-01T00:00:04.1", "nearest") == 4 - - def 
test_indices(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert np.all(np.equal(coord.indices, np.arange(9))) - - def test_values(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert np.allclose(coord.values, np.arange(100.0, 1000.0, 100.0)) - - def test_get_index_slice(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord.slice_indexer(100.0, 200.0) == slice(0, 2) - assert coord.slice_indexer(150.0, 250.0) == slice(1, 2) - assert coord.slice_indexer(300.0, 500.0) == slice(2, 5) - assert coord.slice_indexer(0.0, 500.0) == slice(0, 5) - assert coord.slice_indexer(125.0, 175.0) == slice(1, 1) - assert coord.slice_indexer(0.0, 50.0) == slice(0, 0) - assert coord.slice_indexer(1000.0, 1100.0) == slice(9, 9) - assert coord.slice_indexer(1000.0, 500.0) == slice(9, 5) - assert coord.slice_indexer(None, None) == slice(None, None) - - def test_slice_index(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord.slice_index(slice(0, 2)).equals( - InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[100.0, 200.0])) - ) - assert coord.slice_index(slice(7, None)).equals( - InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[800.0, 900.0])) - ) - assert coord.slice_index(slice(None, None)).equals(coord) - assert coord.slice_index(slice(0, 0)).equals( - InterpCoordinate(dict(tie_indices=[], tie_values=[])) - ) - assert coord.slice_index(slice(4, 2)).equals( - InterpCoordinate(dict(tie_indices=[], tie_values=[])) - ) - assert coord.slice_index(slice(9, 9)).equals( - InterpCoordinate(dict(tie_indices=[], tie_values=[])) - ) - assert coord.slice_index(slice(3, 3)).equals( - InterpCoordinate(dict(tie_indices=[], tie_values=[])) - ) - assert coord.slice_index(slice(0, -1)).equals( - InterpCoordinate(dict(tie_indices=[0, 7], tie_values=[100.0, 800.0])) - ) - assert coord.slice_index(slice(0, -2)).equals( - InterpCoordinate(dict(tie_indices=[0, 6], tie_values=[100.0, 700.0])) - ) - assert coord.slice_index(slice(-2, None)).equals( - InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[800.0, 900.0])) - ) - assert coord.slice_index(slice(1, 2)).equals( - InterpCoordinate(dict(tie_indices=[0], tie_values=[200.0])) - ) - assert coord.slice_index(slice(1, 3, 2)).equals( - InterpCoordinate(dict(tie_indices=[0], tie_values=[200.0])) - ) - assert coord.slice_index(slice(None, None, 2)).equals( - InterpCoordinate(dict(tie_indices=[0, 4], tie_values=[100.0, 900.0])) - ) - assert coord.slice_index(slice(None, None, 3)).equals( - InterpCoordinate(dict(tie_indices=[0, 2], tie_values=[100.0, 700.0])) - ) - assert coord.slice_index(slice(None, None, 4)).equals( - InterpCoordinate(dict(tie_indices=[0, 2], tie_values=[100.0, 900.0])) - ) - assert coord.slice_index(slice(None, None, 5)).equals( - InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[100.0, 600.0])) - ) - assert coord.slice_index(slice(2, 7, 3)).equals( - InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[300.0, 600.0])) - ) - - def test_to_index(self): - # TODO - pass - - def test_simplify(self): - xp = np.sort(np.random.choice(10000, 1000, replace=False)) - xp[0] = 0 - xp[-1] = 10000 - yp = xp + (np.random.rand(1000) - 0.5) - coord = InterpCoordinate({"tie_indices": xp, "tie_values": yp}) - assert len(coord.simplify(1.0).tie_indices) == 2 - - def test_singleton(self): - coord = InterpCoordinate({"tie_indices": [0], "tie_values": [1.0]}) - assert coord[0].values == 1.0 
- - def test_to_from_dict(self): - for data in self.valid: - coord = InterpCoordinate(data) - assert InterpCoordinate.from_dict(coord.to_dict()).equals(coord) - - def test_append(self): - coord0 = InterpCoordinate() - coord1 = InterpCoordinate({"tie_indices": [0, 2], "tie_values": [0, 20]}) - coord2 = InterpCoordinate({"tie_indices": [0, 2], "tie_values": [30, 50]}) - - result = coord1.append(coord2).simplify() - expected = InterpCoordinate({"tie_indices": [0, 5], "tie_values": [0, 50]}) - assert result.equals(expected) - - result = coord2.append(coord1).simplify() - expected = InterpCoordinate( - {"tie_indices": [0, 2, 3, 5], "tie_values": [30, 50, 0, 20]} - ) - assert result.equals(expected) - - assert coord0.append(coord0).empty - assert coord0.append(coord1).equals(coord1) - assert coord1.append(coord0).equals(coord1) - - -class TestCoordinate: - def test_new(self): - assert xdas.Coordinate(1).isscalar() - assert xdas.Coordinate([1]).isdense() - assert xdas.Coordinate({"tie_values": [], "tie_indices": []}).isinterp() - coord = xdas.Coordinate(xdas.Coordinate([1]), "dim") - assert coord.isdense() - assert coord.dim == "dim" - - def test_to_dataarray(self): - coord = xdas.Coordinate([1, 2, 3], "dim") - result = coord.to_dataarray() - expected = xdas.DataArray([1, 2, 3], {"dim": [1, 2, 3]}, name="dim") - assert result.equals(expected) - - def test_empty(self): - with pytest.raises(TypeError, match="cannot infer coordinate type"): - xdas.Coordinate() - - def test_isdim(self): - coord = xdas.Coordinate([1, 2, 3]) - assert coord.isdim() is None - coord = xdas.Coordinate([1, 2, 3], "dim") - assert coord.isdim() is None - coords = xdas.Coordinates({"dim": coord}) - assert coords["dim"].isdim() - coords = xdas.Coordinates({"other_dim": coord}) - assert not coords["other_dim"].isdim() - - def test_name(self): - coord = xdas.Coordinate([1, 2, 3]) - assert coord.name is None - coord = xdas.Coordinate([1, 2, 3], "dim") - assert coord.name == "dim" - coords = xdas.Coordinates({"dim": coord}) - assert coords["dim"].name == "dim" - coords = xdas.Coordinates({"other_dim": coord}) - assert coords["other_dim"].name == "other_dim" - - def test_to_dataarray(self): - coord = xdas.Coordinate([1, 2, 3]) - with pytest.raises(ValueError, match="unnamed coordinate"): - coord.to_dataarray() - coord = xdas.Coordinate([1, 2, 3], "dim") - result = coord.to_dataarray() - expected = xdas.DataArray([1, 2, 3], {"dim": [1, 2, 3]}, name="dim") - assert result.equals(expected) - coords = xdas.Coordinates({"dim": coord}) - result = coords["dim"].to_dataarray() - assert result.equals(expected) - coords = xdas.Coordinates({"other_dim": coord}) - result = coords["other_dim"].to_dataarray() - expected = xdas.DataArray( - [1, 2, 3], coords={"other_dim": coord}, dims=["dim"], name="other_dim" - ) - assert result.equals(expected) - coords["dim"] = [4, 5, 6] - result = coords["dim"].to_dataarray() - expected = xdas.DataArray( - [4, 5, 6], - coords={"dim": [4, 5, 6], "other_dim": ("dim", [1, 2, 3])}, - dims=["dim"], - name="dim", - ) - assert result.equals(expected) - result = coords["other_dim"].to_dataarray() - expected = xdas.DataArray( - [1, 2, 3], - coords={"dim": [4, 5, 6], "other_dim": ("dim", [1, 2, 3])}, - dims=["dim"], - name="other_dim", - ) - assert result.equals(expected) - - -class TestCoordinates: - def test_init(self): - coords = xdas.Coordinates( - {"dim": ("dim", {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]})} - ) - coord = coords["dim"] - assert coord.isinterp() - assert np.allclose(coord.tie_indices, [0, 
8]) - assert np.allclose(coord.tie_values, [100.0, 900.0]) - assert coords.isdim("dim") - coords = xdas.Coordinates({"dim": [1.0, 2.0, 3.0]}) - coord = coords["dim"] - assert coord.isdense() - assert np.allclose(coord.values, [1.0, 2.0, 3.0]) - assert coords.isdim("dim") - coords = xdas.Coordinates( - { - "dim_0": ( - "dim_0", - {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}, - ), - "dim_1": ( - "dim_0", - {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}, - ), - } - ) - assert coords.isdim("dim_0") - assert not coords.isdim("dim_1") - coords = xdas.Coordinates() - assert coords == dict() - assert coords.dims == tuple() - - def test_first_last(self): - coords = xdas.Coordinates({"dim_0": [1.0, 2.0, 3.0], "dim_1": [1.0, 2.0, 3.0]}) - assert coords["first"].dim == "dim_0" - assert coords["last"].dim == "dim_1" - - def test_setitem(self): - coords = xdas.Coordinates() - coords["dim_0"] = [1, 2, 4] - assert coords.dims == ("dim_0",) - coords["dim_1"] = {"tie_indices": [0, 10], "tie_values": [0.0, 100.0]} - assert coords.dims == ("dim_0", "dim_1") - coords["dim_0"] = [1, 2, 3] - assert coords.dims == ("dim_0", "dim_1") - coords["metadata"] = 0 - assert coords.dims == ("dim_0", "dim_1") - coords["non-dimensional"] = ("dim_0", [-1, -1, -1]) - assert coords.dims == ("dim_0", "dim_1") - coords["other_dim"] = ("dim_2", [0]) - assert coords.dims == ("dim_0", "dim_1", "dim_2") - with pytest.raises(TypeError, match="must be of type str"): - coords[0] = ... - - def test_to_from_dict(self): - starttime = np.datetime64("2020-01-01T00:00:00.000") - endtime = np.datetime64("2020-01-01T00:00:10.000") - coords = { - "time": {"tie_indices": [0, 999], "tie_values": [starttime, endtime]}, - "distance": np.linspace(0, 1000, 3), - "channel": ("distance", ["DAS01", "DAS02", "DAS03"]), - "interrogator": (None, "SRN"), - } - coords = xdas.Coordinates(coords) - assert xdas.Coordinates.from_dict(coords.to_dict()).equals(coords) diff --git a/tests/test_core.py b/tests/test_core.py index aeb5c2de..51c33b2f 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -4,7 +4,8 @@ import numpy as np import pytest -import xdas +import xdas as xd +from xdas.coordinates import InterpCoordinate from xdas.synthetics import wavelet_wavefronts from xdas.virtual import VirtualStack @@ -20,7 +21,7 @@ def generate(self, datetime): else: t = {"tie_indices": [0, shape[0] - 1], "tie_values": [0, 3.0 - 1 / 100]} s = {"tie_indices": [0, shape[1] - 1], "tie_values": [0, 990.0]} - return xdas.DataArray( + return xd.DataArray( data=np.random.randn(*shape), coords={ "time": t, @@ -39,7 +40,7 @@ def test_open_mfdatatree(self): for idx, da in enumerate(wavelet_wavefronts(nchunk=3), start=1): da.to_netcdf(os.path.join(dirname, f"{idx:03d}.nc")) da = wavelet_wavefronts() - dc = xdas.open_mfdatatree( + dc = xd.open_mfdatatree( os.path.join(dirpath, "{node}", "00[acquisition].nc") ) assert list(dc.keys()) == keys @@ -51,10 +52,10 @@ def test_open_mfdataarray(self): wavelet_wavefronts().to_netcdf(os.path.join(dirpath, "sample.nc")) for idx, da in enumerate(wavelet_wavefronts(nchunk=3), start=1): da.to_netcdf(os.path.join(dirpath, f"{idx:03}.nc")) - da_monolithic = xdas.open_dataarray(os.path.join(dirpath, "sample.nc")) - da_chunked = xdas.open_mfdataarray(os.path.join(dirpath, "00*.nc")) + da_monolithic = xd.open_dataarray(os.path.join(dirpath, "sample.nc")) + da_chunked = xd.open_mfdataarray(os.path.join(dirpath, "00*.nc")) assert da_monolithic.equals(da_chunked) - da_chunked = xdas.open_mfdataarray( + da_chunked = xd.open_mfdataarray( [ 
os.path.join(dirpath, fname) for fname in ["001.nc", "002.nc", "003.nc"] @@ -62,9 +63,9 @@ def test_open_mfdataarray(self): ) assert da_monolithic.equals(da_chunked) with pytest.raises(FileNotFoundError): - xdas.open_mfdataarray("not_existing_files_*.nc") + xd.open_mfdataarray("not_existing_files_*.nc") with pytest.raises(FileNotFoundError): - xdas.open_mfdataarray(["not_existing_file.nc"]) + xd.open_mfdataarray(["not_existing_file.nc"]) def test_open_mfdataarray_grouping(self): with TemporaryDirectory() as dirpath: @@ -90,7 +91,7 @@ def test_open_mfdataarray_grouping(self): for da in wavelet_wavefronts(**acq): da.to_netcdf(os.path.join(dirpath, f"{count:03d}.nc")) count += 1 - dc = xdas.open_mfdataarray(os.path.join(dirpath, "*.nc")) + dc = xd.open_mfdataarray(os.path.join(dirpath, "*.nc")) assert len(dc) == 3 for da, acq in zip(dc, acqs): acq |= {"nchunk": None} @@ -108,28 +109,28 @@ def test_concatenate(self): }, "distance": da1["distance"], } - expected = xdas.DataArray(data, coords) - result = xdas.concatenate([da1, da2]) + expected = xd.DataArray(data, coords) + result = xd.concatenate([da1, da2]) assert result.equals(expected) # concatenate an empty data array - result = xdas.concatenate([da1, da2.isel(time=slice(0, 0))]) + result = xd.concatenate([da1, da2.isel(time=slice(0, 0))]) assert result.equals(da1) # concat of sources and stacks with TemporaryDirectory() as tmp_path: da1.to_netcdf(os.path.join(tmp_path, "da1.nc")) da2.to_netcdf(os.path.join(tmp_path, "da2.nc")) - da1 = xdas.open_dataarray(os.path.join(tmp_path, "da1.nc")) - da2 = xdas.open_dataarray(os.path.join(tmp_path, "da2.nc")) - result = xdas.concatenate([da1, da2]) + da1 = xd.open_dataarray(os.path.join(tmp_path, "da1.nc")) + da2 = xd.open_dataarray(os.path.join(tmp_path, "da2.nc")) + result = xd.concatenate([da1, da2]) assert isinstance(result.data, VirtualStack) assert result.equals(expected) da1.data = VirtualStack([da1.data]) da2.data = VirtualStack([da2.data]) - result = xdas.concatenate([da1, da2]) + result = xd.concatenate([da1, da2]) assert isinstance(result.data, VirtualStack) assert result.equals(expected) # concat of 3D data arrays with unsorted coords: - da1 = xdas.DataArray( + da1 = xd.DataArray( data=np.zeros((5, 4, 3)), coords={ "phase": ["A", "B", "C"], @@ -138,7 +139,7 @@ def test_concatenate(self): }, dims=("time", "distance", "phase"), ) - da2 = xdas.DataArray( + da2 = xd.DataArray( data=np.ones((7, 4, 3)), coords={ "phase": ["A", "B", "C"], @@ -147,7 +148,7 @@ def test_concatenate(self): }, dims=("time", "distance", "phase"), ) - expected = xdas.DataArray( + expected = xd.DataArray( data=np.concatenate((np.zeros((5, 4, 3)), np.ones((7, 4, 3))), axis=0), coords={ "time": {"tie_indices": [0, 11], "tie_values": [0, 11]}, @@ -155,9 +156,9 @@ def test_concatenate(self): "phase": ["A", "B", "C"], }, ) - assert xdas.concatenate((da1, da2), dim="time").equals(expected) + assert xd.concatenate((da1, da2), dim="time").equals(expected) # concat dense coordinates - da1 = xdas.DataArray( + da1 = xd.DataArray( data=np.zeros((5, 4, 3)), coords={ "phase": ["A", "B", "C"], @@ -166,7 +167,7 @@ def test_concatenate(self): }, dims=("time", "distance", "phase"), ) - da2 = xdas.DataArray( + da2 = xd.DataArray( data=np.ones((7, 4, 3)), coords={ "phase": ["A", "B", "C"], @@ -175,7 +176,7 @@ def test_concatenate(self): }, dims=("time", "distance", "phase"), ) - expected = xdas.DataArray( + expected = xd.DataArray( data=np.concatenate((np.zeros((5, 4, 3)), np.ones((7, 4, 3))), axis=0), coords={ "phase": ["A", "B", "C"], @@ 
-184,34 +185,34 @@ def test_concatenate(self): }, dims=("time", "distance", "phase"), ) - assert xdas.concatenate((da1, da2), dim="time").equals(expected) + assert xd.concatenate((da1, da2), dim="time").equals(expected) # stack da = wavelet_wavefronts() objs = [obj for obj in da] - result = xdas.concatenate(objs, dim="time") - result["time"] = xdas.InterpCoordinate.from_array(result["time"].values) + result = xd.concatenate(objs, dim="time") + result["time"] = InterpCoordinate.from_array(result["time"].values) assert result.equals(da) objs = [obj.drop_coords("time") for obj in da] - result = xdas.concatenate(objs, dim="time") + result = xd.concatenate(objs, dim="time") assert result.equals(da.drop_coords("time")) def test_open_dataarray(self): with pytest.raises(FileNotFoundError): - xdas.open_dataarray("not_existing_file.nc") + xd.open_dataarray("not_existing_file.nc") def test_open_datacollection(self): with pytest.raises(FileNotFoundError): - xdas.open_datacollection("not_existing_file.nc") + xd.open_datacollection("not_existing_file.nc") def test_asdataarray(self): da = self.generate(False) - out = xdas.asdataarray(da.to_xarray()) + out = xd.asdataarray(da.to_xarray()) assert np.array_equal(out.data, da.data) for dim in da.dims: assert np.array_equal(out[dim].values, da[dim].values) def test_split(self): - da = xdas.DataArray( + da = xd.DataArray( np.ones(30), { "time": { @@ -220,22 +221,22 @@ def test_split(self): }, }, ) - assert xdas.concatenate(xdas.split(da)).equals(da) - assert xdas.split(da, tolerance=20.0)[0].equals(da) + assert xd.concatenate(xd.split(da)).equals(da) + assert xd.split(da, tolerance=20.0)[0].equals(da) def test_chunk(self): da = wavelet_wavefronts() - assert xdas.concatenate(xdas.split(da, 3)).equals(da) + assert xd.concatenate(xd.split(da, 3)).equals(da) def test_align(self): - da1 = xdas.DataArray(np.arange(2), {"x": [0, 1]}) - da2 = xdas.DataArray(np.arange(3), {"y": [2, 3, 4]}) - da1, da2 = xdas.align(da1, da2) + da1 = xd.DataArray(np.arange(2), {"x": [0, 1]}) + da2 = xd.DataArray(np.arange(3), {"y": [2, 3, 4]}) + da1, da2 = xd.align(da1, da2) assert da1.sizes == {"x": 2, "y": 1} assert da2.sizes == {"x": 1, "y": 3} - da3 = xdas.DataArray(np.arange(4).reshape(2, 2), {"x": [0, 1], "y": [2, 3]}) + da3 = xd.DataArray(np.arange(4).reshape(2, 2), {"x": [0, 1], "y": [2, 3]}) with pytest.raises(ValueError, match="incompatible sizes"): - xdas.align(da1, da2, da3) - da3 = xdas.DataArray(np.arange(6).reshape(2, 3), {"x": [1, 2], "y": [2, 3, 4]}) + xd.align(da1, da2, da3) + da3 = xd.DataArray(np.arange(6).reshape(2, 3), {"x": [1, 2], "y": [2, 3, 4]}) with pytest.raises(ValueError, match="differs from one data array to another"): - xdas.align(da1, da2, da3) + xd.align(da1, da2, da3) diff --git a/tests/test_dataarray.py b/tests/test_dataarray.py index a0a9c0bf..4a251f7e 100644 --- a/tests/test_dataarray.py +++ b/tests/test_dataarray.py @@ -7,9 +7,8 @@ import numpy as np import pytest -import xdas -from xdas.core.coordinates import Coordinates, DenseCoordinate, InterpCoordinate -from xdas.core.dataarray import DataArray +import xdas as xd +from xdas.coordinates import Coordinates, DenseCoordinate, InterpCoordinate from xdas.synthetics import wavelet_wavefronts @@ -20,12 +19,12 @@ def generate(self, dense=False): else: coords = {"dim": {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}} data = 0.1 * np.arange(9) - da = xdas.DataArray(data, coords) + da = xd.DataArray(data, coords) return da def test_init_without_coords(self): data = np.arange(2 * 3 * 5).reshape(2, 3, 
5) - da = xdas.DataArray(data) + da = xd.DataArray(data) assert np.array_equal(da.data, data) assert da.dims == ("dim_0", "dim_1", "dim_2") assert da.coords == {} @@ -53,30 +52,30 @@ def test_init_and_properties(self): assert da.dtype == np.float64 da = self.generate(dense=True) assert isinstance(da["dim"], DenseCoordinate) - da = DataArray() + da = xd.DataArray() assert np.array_equal(da.values, np.array(np.nan), equal_nan=True) assert da.coords == {} assert da.dims == tuple() - da = DataArray([[]]) + da = xd.DataArray([[]]) assert da.dims == ("dim_0", "dim_1") assert da.ndim == 2 - da = DataArray(1) + da = xd.DataArray(1) assert da.dims == tuple() assert da.ndim == 0 def test_raises_on_data_and_coords_mismatch(self): with pytest.raises(ValueError, match="different number of dimensions"): - DataArray(np.zeros(3), dims=("time", "distance")) + xd.DataArray(np.zeros(3), dims=("time", "distance")) with pytest.raises( ValueError, match="inferred number of dimensions 2 from `coords` does not match `data` dimensionality of 1", ): - DataArray(np.zeros(3), coords={"time": [1], "distance": [1]}) + xd.DataArray(np.zeros(3), coords={"time": [1], "distance": [1]}) with pytest.raises(ValueError, match="conflicting sizes for dimension"): - DataArray(np.zeros((2, 3)), coords={"time": [1, 2], "distance": [1, 2]}) + xd.DataArray(np.zeros((2, 3)), coords={"time": [1, 2], "distance": [1, 2]}) def test_coords_setter(self): - da = xdas.DataArray(np.arange(3 * 11).reshape(3, 11)) + da = xd.DataArray(np.arange(3 * 11).reshape(3, 11)) da["dim_0"] = [1, 2, 4] da["dim_1"] = {"tie_indices": [0, 10], "tie_values": [0.0, 100.0]} da["dim_0"] = [1, 2, 3] @@ -163,7 +162,7 @@ def test_sel(self): assert "distance" not in result.coords def test_better_error_when_sel_with_overlaps(self): - da = DataArray( + da = xd.DataArray( np.arange(80).reshape(20, 4), { "time": { @@ -195,7 +194,7 @@ def test_to_xarray(self): result = da.to_xarray() assert np.array_equal(result.values, da.values) assert np.array_equal(result["dim"].values, da["dim"].values) - da = da.sel(dim=slice(1000, 2000)) # empty dataarray + da = da.sel(dim=slice(1000, 2000)) # empty data array result = da.to_xarray() assert np.array_equal(result.values, da.values) assert np.array_equal(result["dim"].values, da["dim"].values) @@ -203,7 +202,7 @@ def test_from_xarray(self): da = self.generate() da = da.to_xarray() - result = DataArray.from_xarray(da) + result = xd.DataArray.from_xarray(da) assert np.array_equal(result.values, da.values) assert np.array_equal(result["dim"].values, da["dim"].values) @@ -219,7 +218,7 @@ def test_stream(self): assert st[0].stats.npts == da.sizes["time"] assert np.datetime64(st[0].stats.starttime.datetime) == da["time"][0].values assert np.datetime64(st[0].stats.endtime.datetime) == da["time"][-1].values - result = DataArray.from_stream(st) + result = xd.DataArray.from_stream(st) assert np.array_equal(result.values.T, da.values) assert result.sizes == { "channel": da.sizes["distance"], @@ -231,10 +230,10 @@ def test_dense_str(self): coord = [f"D{k}" for k in range(9)] coords = Coordinates({"dim": coord}) data = 0.1 * np.arange(9) - DataArray(data, coords) + xd.DataArray(data, coords) def test_single_index_selection(self): - da = DataArray( + da = xd.DataArray( np.arange(12).reshape(3, 4), { "time": {"tie_values": [0.0, 1.0], "tie_indices": [0, 2]}, @@ -244,7 +243,7 @@ def test_single_index_selection(self): da_getitem = da[1] da_isel = da.isel(time=1) da_sel = da.sel(time=0.5) - da_expected = DataArray( +
da_expected = xd.DataArray( np.array([4, 5, 6, 7]), {"time": (None, 0.5), "distance": [0.0, 10.0, 20.0, 30.0]}, ) @@ -254,7 +253,7 @@ def test_single_index_selection(self): da_getitem = da[:, 1] da_isel = da.isel(distance=1) da_sel = da.sel(distance=10.0) - da_expected = DataArray( + da_expected = xd.DataArray( np.array([1, 5, 9]), { "time": {"tie_values": [0.0, 1.0], "tie_indices": [0, 2]}, @@ -266,7 +265,7 @@ def test_single_index_selection(self): assert da_sel.equals(da_expected) def test_assign_coords(self): - da = DataArray( + da = xd.DataArray( data=np.zeros(3), coords={"time": np.array([3, 4, 5])}, ) @@ -277,7 +276,7 @@ def test_assign_coords(self): assert np.array_equal(result["relative_time"].values, [0, 1, 2]) def test_swap_dims(self): - da = DataArray( + da = xd.DataArray( data=[0, 1], coords={"x": ["a", "b"], "y": ("x", [0, 1])}, ) @@ -294,7 +293,7 @@ def test_swap_dims(self): da.swap_dims({"z": "x"}) def test_to_xarray_non_dimensional(self): - da = DataArray( + da = xd.DataArray( data=np.zeros(3), coords={ "time": np.array([3, 4, 5]), @@ -308,7 +307,7 @@ def test_to_xarray_non_dimensional(self): assert result.dims == da.dims def test_netcdf_non_dimensional(self): - da = DataArray( + da = xd.DataArray( data=np.zeros(3), coords={ "time": np.array([3, 4, 5]), @@ -318,16 +317,16 @@ def test_netcdf_non_dimensional(self): with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") da.to_netcdf(path) - result = xdas.open_dataarray(path) + result = xd.open_dataarray(path) assert result.equals(da) with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "da.nc") da = wavelet_wavefronts().assign_coords(lon=("distance", np.arange(401))) da.to_netcdf(path) - tmp = xdas.open_dataarray(path) + tmp = xd.open_dataarray(path) path = path = os.path.join(dirpath, "vds.nc") tmp.to_netcdf(path) - result = xdas.open_dataarray(path) + result = xd.open_dataarray(path) assert result.equals(da) def test_transpose(self): @@ -345,16 +344,16 @@ def test_transpose(self): da.transpose("space", "frequency") def test_expand_dims(self): - da = DataArray([1.0, 2.0, 3.0], {"x": [0, 1, 2]}) + da = xd.DataArray([1.0, 2.0, 3.0], {"x": [0, 1, 2]}) result = da.expand_dims("y", 0) assert result.dims == ("y", "x") assert result.shape == (1, 3) - da = DataArray([1.0, 2.0, 3.0], {"x": [0, 1, 2], "y": 0}, dims=("x",)) + da = xd.DataArray([1.0, 2.0, 3.0], {"x": [0, 1, 2], "y": 0}, dims=("x",)) result = da.expand_dims("y") assert result.dims == ("y", "x") assert result.shape == (1, 3) - assert result["y"].equals(xdas.Coordinate([0], dim="y")) + assert result["y"].equals(xd.Coordinate([0], dim="y")) def test_io(self): # both coords interpolated @@ -362,7 +361,7 @@ def test_io(self): with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") da.to_netcdf(path) - da_recovered = DataArray.from_netcdf(path) + da_recovered = xd.DataArray.from_netcdf(path) assert da.equals(da_recovered) # mixed interpolated and dense @@ -370,7 +369,7 @@ def test_io(self): with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") da.to_netcdf(path) - da_recovered = DataArray.from_netcdf(path) + da_recovered = xd.DataArray.from_netcdf(path) assert da.equals(da_recovered) # only dense coords @@ -378,11 +377,11 @@ def test_io(self): with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") da.to_netcdf(path) - da_recovered = DataArray.from_netcdf(path) + da_recovered = xd.DataArray.from_netcdf(path) assert da.equals(da_recovered) def test_io_with_zfp_compression(self): 
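+ # ZFP compression should shrink the file: expect compressed < chunk-compressed < uncompressed, with round-trip error below 1e-3.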
- da = DataArray(np.random.rand(101, 101)) + da = xd.DataArray(np.random.rand(101, 101)) with TemporaryDirectory() as tmpdir: tmpfile_uncompressed = os.path.join(tmpdir, "uncompressed.nc") da.to_netcdf(tmpfile_uncompressed) @@ -398,7 +397,7 @@ def test_io_with_zfp_compression(self): chunk_compressed_size = os.path.getsize(tmpfile_chunk_compressed) assert chunk_compressed_size < uncompressed_size assert compressed_size < chunk_compressed_size - _da = DataArray.from_netcdf(tmpfile_compressed) + _da = xd.DataArray.from_netcdf(tmpfile_compressed) assert np.abs(da - _da).max().values < 0.001 def test_io_dask(self): @@ -414,7 +413,7 @@ def test_io_dask(self): for chunk in chunks ] data = dask.array.concatenate(chunks, axis=1) - expected = DataArray( + expected = xd.DataArray( data, coords={"time": np.arange(3), "distance": np.arange(10)}, attrs={"version": "1.0"}, @@ -422,7 +421,7 @@ def test_io_dask(self): ) fname = os.path.join(tmpdir, "tmp.nc") expected.to_netcdf(fname) - result = xdas.open_dataarray(fname) + result = xd.open_dataarray(fname) assert isinstance(result.data, dask.array.Array) assert np.array_equal(expected.values, result.values) assert expected.dtype == result.dtype @@ -432,16 +431,16 @@ def test_io_dask(self): assert expected.attrs == result.attrs def test_io_non_dimensional(self): - expected = DataArray(coords={"dim": 0}, dims=()) + expected = xd.DataArray(coords={"dim": 0}, dims=()) with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") expected.to_netcdf(path) - result = DataArray.from_netcdf(path) + result = xd.DataArray.from_netcdf(path) assert expected.equals(result) def test_io_attrs(self): attrs = {"description": "test"} - da = DataArray( + da = xd.DataArray( np.arange(3), coords={"time": np.array([3, 4, 5])}, attrs=attrs, @@ -449,13 +448,13 @@ def test_io_attrs(self): with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") da.to_netcdf(path) - result = DataArray.from_netcdf(path) + result = xd.DataArray.from_netcdf(path) assert result.attrs == attrs assert result.equals(da) - da = xdas.open_dataarray(path) + da = xd.open_dataarray(path) path = os.path.join(dirpath, "vds.nc") da.to_netcdf(path) - result = xdas.open_dataarray(path) + result = xd.open_dataarray(path) assert result.attrs == attrs assert result.equals(da) diff --git a/tests/test_datacollection.py b/tests/test_datacollection.py index 1f61b64e..7ebd2c0f 100644 --- a/tests/test_datacollection.py +++ b/tests/test_datacollection.py @@ -4,7 +4,7 @@ import h5py import pytest -import xdas +import xdas as xd import xdas.signal as xs from xdas.core.datacollection import get_depth from xdas.synthetics import wavelet_wavefronts @@ -12,10 +12,10 @@ class TestDataCollection: def nest(self, da): - return xdas.DataCollection( + return xd.DataCollection( { - "das1": xdas.DataCollection([da, da], "acquisition"), - "das2": xdas.DataCollection([da, da, da], "acquisition"), + "das1": xd.DataCollection([da, da], "acquisition"), + "das2": xd.DataCollection([da, da, da], "acquisition"), }, "instrument", ) @@ -30,12 +30,12 @@ def test_init(self): "das2": ("acquisition", [da, da, da]), }, ) - result = xdas.DataCollection(data) + result = xd.DataCollection(data) assert result.equals(dc) def test_io(self): da = wavelet_wavefronts() - dc = xdas.DataCollection( + dc = xd.DataCollection( { "das1": da, "das2": da, @@ -45,27 +45,27 @@ def test_io(self): with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") dc.to_netcdf(path) - result = xdas.DataCollection.from_netcdf(path) 
+ result = xd.DataCollection.from_netcdf(path) assert result.equals(dc) - dc = xdas.DataCollection([da, da], "instrument") + dc = xd.DataCollection([da, da], "instrument") with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") dc.to_netcdf(path) - result = xdas.DataCollection.from_netcdf(path) + result = xd.DataCollection.from_netcdf(path) assert result.equals(dc) - dc = xdas.DataCollection( + dc = xd.DataCollection( { - "das1": xdas.DataCollection([da, da], "acquisition"), - "das2": xdas.DataCollection([da, da, da], "acquisition"), + "das1": xd.DataCollection([da, da], "acquisition"), + "das2": xd.DataCollection([da, da, da], "acquisition"), }, "instrument", ) with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") dc.to_netcdf(path) - result = xdas.DataCollection.from_netcdf(path) + result = xd.DataCollection.from_netcdf(path) assert result.equals(dc) - result = xdas.open_datacollection(path) + result = xd.open_datacollection(path) assert result.equals(dc) def test_depth_counter(self): @@ -108,9 +108,9 @@ def test_query(self): da = wavelet_wavefronts() dc = self.nest(da) result = dc.query(instrument="das1", acquisition=0) - expected = xdas.DataCollection( + expected = xd.DataCollection( { - "das1": xdas.DataCollection([da], "acquisition"), + "das1": xd.DataCollection([da], "acquisition"), }, "instrument", ) diff --git a/tests/test_processing.py b/tests/test_processing.py index 3acfb62a..5f72976b 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -10,10 +10,10 @@ import pandas as pd import scipy.signal as sp -import xdas +import xdas as xd import xdas.processing as xp from xdas.atoms import Partial, Sequential -from xdas.processing.core import ( +from xdas.processing import ( DataArrayLoader, DataArrayWriter, DataFrameWriter, @@ -31,7 +31,7 @@ def test_stateful(self): with tempfile.TemporaryDirectory() as tempdir: # generate test dataarray wavelet_wavefronts().to_netcdf(os.path.join(tempdir, "sample.nc")) - da = xdas.open_dataarray(os.path.join(tempdir, "sample.nc")) + da = xd.open_dataarray(os.path.join(tempdir, "sample.nc")) # declare processing sequence sos = sp.iirfilter(4, 0.1, btype="lowpass", output="sos") @@ -189,20 +189,20 @@ def publish(): result.append(packet) if n == len(packets): break - return xdas.concatenate(result) + return xd.concatenate(result) def test_publish_and_subscribe(self): - expected = xdas.synthetics.dummy() - packets = xdas.split(expected, 10) - address = f"tcp://localhost:{xdas.io.get_free_port()}" + expected = xd.synthetics.dummy() + packets = xd.split(expected, 10) + address = f"tcp://localhost:{xd.io.get_free_port()}" result = self._publish_and_subscribe(packets, address) assert result.equals(expected) def test_encoding(self): - expected = xdas.synthetics.dummy() - packets = xdas.split(expected, 10) - address = f"tcp://localhost:{xdas.io.get_free_port()}" + expected = xd.synthetics.dummy() + packets = xd.split(expected, 10) + address = f"tcp://localhost:{xd.io.get_free_port()}" encoding = {"chunks": (10, 10), **hdf5plugin.Zfp(accuracy=1e-6)} result = self._publish_and_subscribe(packets, address, encoding=encoding) @@ -221,7 +221,7 @@ def test_without_gap(self): endtime = starttime + np.timedelta64(10, "ms") * (data.shape[0] - 1) distance = 5.0 * np.arange(data.shape[1]) - da = xdas.DataArray( + da = xd.DataArray( data=data, coords={ "time": { @@ -275,7 +275,7 @@ def test_without_gap(self): def test_with_gap(self): with tempfile.TemporaryDirectory() as tempdir: - da = xdas.DataArray( + da = 
xd.DataArray( data=np.random.randint( low=-1000, high=1000, size=(900, 10), dtype=np.int32 ), @@ -350,7 +350,7 @@ def test_flat(self): endtime = starttime + np.timedelta64(10, "ms") * (data.shape[0] - 1) distance = 5.0 * np.arange(data.shape[1]) - da = xdas.DataArray( + da = xd.DataArray( data=data, coords={ "time": { diff --git a/tests/test_routines.py b/tests/test_routines.py index 32d1d4da..d74e00ac 100644 --- a/tests/test_routines.py +++ b/tests/test_routines.py @@ -2,9 +2,7 @@ import pytest import xdas as xd -from xdas.core.coordinates import Coordinates -from xdas.core.dataarray import DataArray -from xdas.core.routines import Bag, CompatibilityError, combine_by_coords +from xdas.core.routines import Bag, CompatibilityError class TestBag: @@ -14,30 +12,30 @@ def test_bag_initialization(self): assert bag.objs == [] def test_bag_append_initializes(self): - da = DataArray( + da = xd.DataArray( np.random.rand(10, 5), {"time": np.arange(10), "space": np.arange(5)} ) bag = Bag(dim="time") bag.append(da) assert len(bag.objs) == 1 assert bag.objs[0] is da - assert bag.subcoords.equals(Coordinates({"space": np.arange(5)})) + assert bag.subcoords.equals(xd.Coordinates({"space": np.arange(5)})) assert bag.subshape == (5,) assert bag.dims == ("time", "space") assert bag.delta def test_bag_append_compatible(self): - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.rand(10, 5), dims=("time", "space")) + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) bag = Bag(dim="time") bag.append(da1) bag.append(da2) assert len(bag.objs) == 2 assert bag.objs[1] is da2 - da1 = DataArray( + da1 = xd.DataArray( np.random.rand(10, 5), {"time": np.arange(10), "space": np.arange(5)} ) - da2 = DataArray( + da2 = xd.DataArray( np.random.rand(10, 5), {"time": np.arange(10, 20), "space": np.arange(5)} ) bag = Bag(dim="time") @@ -47,36 +45,38 @@ def test_bag_append_compatible(self): assert bag.objs[1] is da2 def test_bag_append_incompatible_dims(self): - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.rand(10, 5), dims=("space", "time")) + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray(np.random.rand(10, 5), dims=("space", "time")) bag = Bag(dim="time") bag.append(da1) with pytest.raises(CompatibilityError): bag.append(da2) def test_bag_append_incompatible_shape(self): - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.rand(10, 6), dims=("time", "space")) + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray(np.random.rand(10, 6), dims=("time", "space")) bag = Bag(dim="time") bag.append(da1) with pytest.raises(CompatibilityError): bag.append(da2) def test_bag_append_incompatible_dtype(self): - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.randint(0, 10, size=(10, 5)), dims=("time", "space")) + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray( + np.random.randint(0, 10, size=(10, 5)), dims=("time", "space") + ) bag = Bag(dim="time") bag.append(da1) with pytest.raises(CompatibilityError): bag.append(da2) def test_bag_append_incompatible_coords(self): - da1 = DataArray( + da1 = xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), coords={"space": np.arange(5)}, ) - da2 = DataArray( + da2 = xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), 
coords={"space": np.arange(5) + 1}, @@ -87,12 +87,12 @@ def test_bag_append_incompatible_coords(self): bag.append(da2) def test_bag_append_incompatible_sampling_interval(self): - da1 = DataArray( + da1 = xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), coords={"time": np.arange(10)}, ) - da2 = DataArray( + da2 = xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), coords={"time": np.arange(10) * 2}, @@ -106,91 +106,93 @@ def test_bag_append_incompatible_sampling_interval(self): class TestCombineByCoords: def test_basic(self): # without coords - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - combined = combine_by_coords([da1, da2], dim="time", squeeze=True) + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + combined = xd.combine_by_coords([da1, da2], dim="time", squeeze=True) assert combined.shape == (20, 5) # with coords - da1 = DataArray( + da1 = xd.DataArray( np.random.rand(10, 5), coords={"time": np.arange(10), "space": np.arange(5)}, ) - da2 = DataArray( + da2 = xd.DataArray( np.random.rand(10, 5), coords={"time": np.arange(10, 20), "space": np.arange(5)}, ) - combined = combine_by_coords([da1, da2], dim="time", squeeze=True) + combined = xd.combine_by_coords([da1, da2], dim="time", squeeze=True) assert combined.shape == (20, 5) def test_incompatible_shape(self): - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.rand(10, 6), dims=("time", "space")) - dc = combine_by_coords([da1, da2], dim="time") + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray(np.random.rand(10, 6), dims=("time", "space")) + dc = xd.combine_by_coords([da1, da2], dim="time") assert len(dc) == 2 assert dc[0].equals(da1) assert dc[1].equals(da2) def test_incompatible_dims(self): - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.rand(10, 5), dims=("space", "time")) - dc = combine_by_coords([da1, da2], dim="time") + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray(np.random.rand(10, 5), dims=("space", "time")) + dc = xd.combine_by_coords([da1, da2], dim="time") assert len(dc) == 2 assert dc[0].equals(da1) assert dc[1].equals(da2) def test_incompatible_dtype(self): - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.randint(0, 10, size=(10, 5)), dims=("time", "space")) - dc = combine_by_coords([da1, da2], dim="time") + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray( + np.random.randint(0, 10, size=(10, 5)), dims=("time", "space") + ) + dc = xd.combine_by_coords([da1, da2], dim="time") assert len(dc) == 2 assert dc[0].equals(da1) assert dc[1].equals(da2) def test_incompatible_coords(self): - da1 = DataArray( + da1 = xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), coords={"space": np.arange(5)}, ) - da2 = DataArray( + da2 = xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), coords={"space": np.arange(5) + 1}, ) - dc = combine_by_coords([da1, da2], dim="time") + dc = xd.combine_by_coords([da1, da2], dim="time") assert len(dc) == 2 assert dc[0].equals(da1) assert dc[1].equals(da2) def test_incompatible_sampling_interval(self): - da1 = DataArray( + da1 = xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), coords={"time": np.arange(10)}, ) - da2 = DataArray( + da2 = 
xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), coords={"time": np.arange(10) * 2}, ) - dc = combine_by_coords([da1, da2], dim="time") + dc = xd.combine_by_coords([da1, da2], dim="time") assert len(dc) == 2 assert dc[0].equals(da1) assert dc[1].equals(da2) def test_expand_scalar_coordinate(self): - da1 = DataArray( + da1 = xd.DataArray( np.random.rand(10), dims=("time",), coords={"time": np.arange(10), "space": 0}, ) - da2 = DataArray( + da2 = xd.DataArray( np.random.rand(10), dims=("time",), coords={"time": np.arange(10), "space": 1}, ) - dc = combine_by_coords([da1, da2], dim="space", squeeze=True) + dc = xd.combine_by_coords([da1, da2], dim="space", squeeze=True) assert dc.shape == (2, 10) assert dc.dims == ("space", "time") assert dc.coords["space"].values.tolist() == [0, 1] @@ -198,7 +200,7 @@ def test_expand_scalar_coordinate(self): class TestOpenMFDataArray: def test_warn_on_corrupted_files(self, tmp_path): - expected = DataArray( + expected = xd.DataArray( np.random.rand(10, 5), coords={ "time": np.arange(10), diff --git a/tests/test_virtual.py b/tests/test_virtual.py index 4e3e2a05..94b80b3d 100644 --- a/tests/test_virtual.py +++ b/tests/test_virtual.py @@ -5,7 +5,7 @@ import numpy as np import pytest -import xdas +import xdas as xd from xdas.synthetics import wavelet_wavefronts from xdas.virtual import ( Selection, @@ -22,11 +22,11 @@ class TestFunctional: # TODO: move elsewhere def test_all(self): with tempfile.TemporaryDirectory() as dirpath: expected = wavelet_wavefronts() - chunks = xdas.split(expected, 3) + chunks = xd.split(expected, 3) for index, chunk in enumerate(chunks, start=1): chunk.to_netcdf(os.path.join(dirpath, f"{index:03d}.nc")) - da = xdas.open_dataarray(os.path.join(dirpath, "002.nc")) + da = xd.open_dataarray(os.path.join(dirpath, "002.nc")) datasource = da.data assert np.allclose(np.asarray(datasource[0]), da.load().values[0]) assert np.allclose(np.asarray(datasource[0][1]), da.load().values[0][1]) @@ -68,9 +68,9 @@ def test_dtypes(self, tmp_path): np.complex128, ) for dtype in dtypes: - expected = xdas.DataArray(np.zeros((3, 5), dtype=dtype)) + expected = xd.DataArray(np.zeros((3, 5), dtype=dtype)) expected.to_netcdf(tmp_path / "data.nc") - result = xdas.open_dataarray(tmp_path / "data.nc") + result = xd.open_dataarray(tmp_path / "data.nc") assert result.equals(expected) diff --git a/tests/test_xarray.py b/tests/test_xarray.py index 382a4fe6..9e87a62a 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -1,7 +1,7 @@ import numpy as np +import xdas as xd import xdas.core.methods as xm -from xdas.core.dataarray import DataArray from xdas.synthetics import wavelet_wavefronts @@ -15,19 +15,19 @@ def test_returns_dataarray(self): "quantile", ]: result = func(da, 0.5) - assert isinstance(result, DataArray) + assert isinstance(result, xd.DataArray) result = getattr(da, name)(0.5) - assert isinstance(result, DataArray) + assert isinstance(result, xd.DataArray) elif name == "diff": result = func(da, "time") - assert isinstance(result, DataArray) + assert isinstance(result, xd.DataArray) result = getattr(da, name)("time") - assert isinstance(result, DataArray) + assert isinstance(result, xd.DataArray) else: result = func(da) - assert isinstance(result, DataArray) + assert isinstance(result, xd.DataArray) result = getattr(da, name)() - assert isinstance(result, DataArray) + assert isinstance(result, xd.DataArray) def test_mean(self): da = wavelet_wavefronts() diff --git a/xdas/__init__.py b/xdas/__init__.py index e9a85cb5..e98f9d11 100644 --- 
a/xdas/__init__.py +++ b/xdas/__init__.py @@ -1,13 +1,17 @@ -from . import atoms, config, fft, io, parallel, processing, signal, synthetics, virtual -from .core import coordinates, dataarray, datacollection, methods, numpy, routines -from .core.coordinates import ( - Coordinate, - Coordinates, - DenseCoordinate, - InterpCoordinate, - ScalarCoordinate, - get_sampling_interval, +from . import ( + atoms, + config, + coordinates, + fft, + io, + parallel, + processing, + signal, + synthetics, + virtual, ) +from .coordinates import Coordinate, Coordinates, get_sampling_interval +from .core import dataarray, datacollection, methods, numpy, routines from .core.dataarray import DataArray from .core.datacollection import DataCollection, DataMapping, DataSequence from .core.methods import * diff --git a/xdas/atoms/ml.py b/xdas/atoms/ml.py index f3fc4faf..5add9e22 100644 --- a/xdas/atoms/ml.py +++ b/xdas/atoms/ml.py @@ -2,9 +2,9 @@ import numpy as np -from ..atoms import Atom, State from ..core.dataarray import DataArray from ..core.routines import concatenate +from .core import Atom, State class LazyModule: diff --git a/xdas/atoms/signal.py b/xdas/atoms/signal.py index 7378a7d5..21f4027b 100644 --- a/xdas/atoms/signal.py +++ b/xdas/atoms/signal.py @@ -3,7 +3,7 @@ import numpy as np import scipy.signal as sp -from ..core.coordinates import Coordinate, get_sampling_interval +from ..coordinates.core import Coordinate, get_sampling_interval from ..core.dataarray import DataArray from ..core.routines import concatenate, split from ..parallel import parallelize diff --git a/xdas/coordinates/__init__.py b/xdas/coordinates/__init__.py new file mode 100644 index 00000000..f7eaeaee --- /dev/null +++ b/xdas/coordinates/__init__.py @@ -0,0 +1,6 @@ +from .core import Coordinate, Coordinates, get_sampling_interval +from .default import DefaultCoordinate +from .dense import DenseCoordinate +from .interp import InterpCoordinate +from .sampled import SampledCoordinate +from .scalar import ScalarCoordinate diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py new file mode 100644 index 00000000..4154467d --- /dev/null +++ b/xdas/coordinates/core.py @@ -0,0 +1,634 @@ +from copy import copy, deepcopy +from functools import wraps +from itertools import pairwise + +import numpy as np +import pandas as pd + + +def wraps_first_last(func): + @wraps(func) + def wrapper(self, dim, *args, **kwargs): + if dim == "first": + dim = self._dims[0] + if dim == "last": + dim = self._dims[-1] + return func(self, dim, *args, **kwargs) + + return wrapper + + +class Coordinates(dict): + """ + Dictionary-like container for coordinates. + + Parameters + ---------- + coords: dict-like, optional + Mapping from coordinate names to any of the following: + + - Coordinate objects + - tuples (dim, coordinate-like) which can be either dimensional (`dim == name`) + or non-dimensional (`dim != name` or `dim == None`). + - coordinate-like objects (that are passed to the Coordinate constructor) + which are assumed to be a dimensional coordinate with `dim` set to the + related name. + + dims: sequence of str, optional + An ordered sequence of dimensions. It is meant to match the dimensionality of + its associated data. If provided, it must at least include all dimensions found + in `coords` (extra dimensions will be considered as empty coordinates). + Otherwise, dimensions will be guessed from `coords`. + + Examples + -------- + >>> import xdas + + >>> coords = { + ... "time": {"tie_indices": [0, 999], "tie_values": [0.0, 10.0]}, + ...
"distance": [0, 1, 2], + ... "channel": ("distance", ["DAS01", "DAS02", "DAS03"]), + ... "interrogator": (None, "SRN"), + ... } + >>> xdas.Coordinates(coords) + Coordinates: + * time (time): 0.000 to 10.000 + * distance (distance): [0 ... 2] + channel (distance): ['DAS01' ... 'DAS03'] + interrogator: 'SRN' + """ + + def __init__(self, coords=None, dims=None): + super().__init__() + if isinstance(coords, Coordinates): + if dims is None: + dims = coords.dims + coords = dict(coords) + self._dims = () if dims is None else tuple(dims) + self._parent = None + if coords is not None: + for name in coords: + self[name] = coords[name] + + @wraps_first_last + def __getitem__(self, key): + if key in self.dims and key not in self: + raise KeyError(f"dimension {key} has no coordinate") + return super().__getitem__(key) + + @wraps_first_last + def __setitem__(self, key, value): + if not isinstance(key, str): + raise TypeError("dimension names must be of type str") + coord = Coordinate(value) + if coord.dim is None and not coord.isscalar(): + coord.dim = key + if self._parent is None: + if coord.dim is not None and coord.dim not in self.dims: + self._dims = self.dims + (coord.dim,) + else: + if coord.dim is not None: + if coord.dim not in self.dims: + raise KeyError( + f"cannot add new dimension {coord.dim} to an existing DataArray" + ) + size = self._parent.sizes[coord.dim] + if not len(coord) == size: + raise ValueError( + f"conflicting sizes for dimension {coord.dim}: size {len(coord)} " + f"in `coords` and size {size} in `data`" + ) + coord._parent = self + return super().__setitem__(key, coord) + + def __repr__(self): + lines = ["Coordinates:"] + for name, coord in self.items(): + if self.isdim(name): + lines.append(f" * {name} ({coord.dim}): {coord}") + else: + if coord.dim is None: + lines.append(f" {name}: {coord}") + else: + lines.append(f" {name} ({coord.dim}): {coord}") + return "\n".join(lines) + + def __reduce__(self): + return self.__class__, (dict(self), self.dims), {"_parent": self._parent} + + @property + def dims(self): + return self._dims + + def isdim(self, name): + return self[name].dim == name + + def get_query(self, item): + """ + Format a query from one or multiple indexer. + + Parameters + ---------- + item: indexer-like or sequence or mapping + Object to be parsed as a query. If item is indexer-like object, it is + applied on the first dimension. If item is a sequence, positional indexing + is performed. If item is a mapping, labeled indexing is performed. + + Returns + ------- + dict: + A mapping between each dim and a given indexer. If No indexer was found for + a given dim, slice(None) will be used. + """ + query = {dim: slice(None) for dim in self.dims} + if isinstance(item, dict): + if "first" in item: + item[self.dims[0]] = item.pop("first") + if "last" in item: + item[self.dims[-1]] = item.pop("last") + query.update(item) + elif isinstance(item, tuple): + for k in range(len(item)): + query[self.dims[k]] = item[k] + else: + query[self.dims[0]] = item + for dim, item in query.items(): + if isinstance(item, tuple): + msg = f"cannot use tuple {item} to index dim '{dim}'" + if len(item) == 2: + msg += f". Did you mean: {dim}=slice({item[0]}, {item[1]})?" 
+ raise TypeError(msg) + return query + + def to_index(self, item, method=None, endpoint=True): + query = self.get_query(item) + return {dim: self[dim].to_index(query[dim], method, endpoint) for dim in query} + + def equals(self, other): + if not isinstance(other, Coordinates): + return False + for name in self: + if not self[name].equals(other[name]): + return False + return True + + def to_dict(self): + """Convert this `Coordinates` object into a pure python dictionary. + + Examples + -------- + + >>> import xdas + + >>> coords = xdas.Coordinates( + ... { + ... "time": {"tie_indices": [0, 999], "tie_values": [0.0, 10.0]}, + ... "distance": [0, 1, 2], + ... "channel": ("distance", ["DAS01", "DAS02", "DAS03"]), + ... "interrogator": (None, "SRN"), + ... } + ... ) + >>> coords.to_dict() + {'dims': ('time', 'distance'), + 'coords': {'time': {'dim': 'time', + 'data': {'tie_indices': [0, 999], 'tie_values': [0.0, 10.0]}, + 'dtype': 'float64'}, + 'distance': {'dim': 'distance', 'data': [0, 1, 2], 'dtype': 'int64'}, + 'channel': {'dim': 'distance', + 'data': ['DAS01', 'DAS02', 'DAS03'], + 'dtype': '= len(self)): + raise IndexError("index is out of bounds") + elif bounds == "clip": + idx = np.clip(idx, 0, len(self)) + return idx + + def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): + if start is not None: + try: + start_index = self.get_indexer(start, method="bfill") + except KeyError: + start_index = len(self) + else: + start_index = None + if stop is not None: + try: + end_index = self.get_indexer(stop, method="ffill") + stop_index = end_index + 1 + except KeyError: + stop_index = 0 + else: + stop_index = None + if step is not None: + raise NotImplementedError("cannot use step yet") + if ( + (not endpoint) + and (stop is not None) + and (self[stop_index - 1].values == stop) + ): + stop_index -= 1 + return slice(start_index, stop_index) + + def isscalar(self): + return False + + def isdefault(self): + return False + + def isdense(self): + return False + + def isinterp(self): + return False + + def issampled(self): + return False + + def append(self, other): + raise NotImplementedError(f"append is not implemented for {self.__class__}") + + def get_split_indices(self, tolerance=None): + raise NotImplementedError( + f"get_split_indices is not implemented for {self.__class__}" + ) + + def get_discontinuities(self, tolerance=None): + """ + Returns a DataFrame containing information about the discontinuities. + + Returns + ------- + pandas.DataFrame + A DataFrame with the following columns: + + - start_index : int + The index where the discontinuity starts. + - end_index : int + The index where the discontinuity ends. + - start_value : float + The value at the start of the discontinuity. + - end_value : float + The value at the end of the discontinuity. + - delta : float + The difference between the end_value and start_value. + - type : str + The type of the discontinuity, either "gap" or "overlap".
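+
+        Examples
+        --------
+        An illustrative sketch (assumes any non-empty coordinate whose class
+        implements `get_split_indices`; output omitted):
+
+        >>> coord.get_discontinuities(tolerance=None)  # doctest: +SKIP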
+ + """ + if self.empty: + return pd.DataFrame( + columns=[ + "start_index", + "end_index", + "start_value", + "end_value", + "delta", + "type", + ] + ) + indices = self.get_split_indices(tolerance) + records = [] + for index in indices: + start_index = index + end_index = index + 1 + start_value = self.get_value(index) + end_value = self.get_value(index + 1) + delta = end_value - start_value + if tolerance is not None and np.abs(delta) < tolerance: + continue + record = { + "start_index": start_index, + "end_index": end_index, + "start_value": start_value, + "end_value": end_value, + "delta": end_value - start_value, + "type": ("gap" if end_value > start_value else "overlap"), + } + records.append(record) + return pd.DataFrame.from_records(records) + + def get_availabilities(self): + """ + Returns a DataFrame containing information about the data availability. + + Returns + ------- + pandas.DataFrame + A DataFrame with the following columns: + + - start_index : int + The index where the discontinuity starts. + - end_index : int + The index where the discontinuity ends. + - start_value : float + The value at the start of the discontinuity. + - end_value : float + The value at the end of the discontinuity. + - delta : float + The difference between the end_value and start_value. + - type : str + The type of the discontinuity, always "data". + + """ + if self.empty: + return pd.DataFrame( + columns=[ + "start_index", + "end_index", + "start_value", + "end_value", + "delta", + "type", + ] + ) + indices = np.concatenate([[0], self.get_split_indices(), [len(self)]]) + records = [] + for start_index, stop_index in pairwise(indices): + end_index = stop_index - 1 + start_value = self.get_value(start_index) + end_value = self.get_value(end_index) + records.append( + { + "start_index": start_index, + "end_index": end_index, + "start_value": start_value, + "end_value": end_value, + "delta": end_value - start_value, + "type": "data", + } + ) + return pd.DataFrame.from_records(records) + + def to_dataarray(self): + from ..core.dataarray import DataArray # TODO: avoid defered import? + + if self.name is None: + raise ValueError("cannot convert unnamed coordinate to DataArray") + + if self.parent is None: + return DataArray( + self.values, + {self.dim: self}, + dims=[self.dim], + name=self.name, + ) + else: + return DataArray( + self.values, + { + name: coord + for name, coord in self.parent.items() + if coord.dim == self.dim + }, + dims=[self.dim], + name=self.name, + ) + + def to_dict(self): + raise NotImplementedError + + @classmethod + def from_dict(cls, dct): + return cls(**dct) + + def to_dataset(self, dataset, attrs): + dataset = dataset.assign_coords( + {self.name: (self.dim, self.values) if self.dim else self.values} + ) + return dataset, attrs + + @classmethod + def from_dataset(cls, dataset, name): + coords = {} + for subcls in cls.__subclasses__(): + if hasattr(subcls, "from_dataset"): + coords |= subcls.from_dataset(dataset, name) + return coords + + @classmethod + def from_block(cls, start, size, step, dim=None, dtype=None): + raise NotImplementedError + + +def parse(data, dim=None): + if isinstance(data, tuple): + if dim is None: + dim, data = data + else: + _, data = data + if isinstance(data, Coordinate): + if dim is None: + dim = data.dim + data = data.data + return data, dim + + +def get_sampling_interval(da, dim, cast=True): + """ + Returns the sample spacing along a given dimension. 
+ + Parameters + ---------- + da : DataArray + The data from which to extract the sample spacing. + dim : str + The dimension along which to get the sample spacing. + cast: bool, optional + Whether to cast datetime64 to seconds, by default True. + + Returns + ------- + float + The sample spacing. + + """ + return da[dim].get_sampling_interval(cast=cast) + + +def isscalar(data): + data = np.asarray(data) + return (data.dtype != np.dtype(object)) and (data.ndim == 0) + + +def is_strictly_increasing(x): + if np.issubdtype(x.dtype, np.datetime64): + return np.all(np.diff(x) > np.timedelta64(0, "ns")) + else: + return np.all(np.diff(x) > 0) + + +def format_datetime(x): + string = str(x) + if "." in string: + datetime, digits = string.split(".") + digits = digits[:3] + return ".".join([datetime, digits]) + else: + return string diff --git a/xdas/coordinates/default.py b/xdas/coordinates/default.py new file mode 100644 index 00000000..df24191a --- /dev/null +++ b/xdas/coordinates/default.py @@ -0,0 +1,96 @@ +import numpy as np + +from .core import Coordinate, isscalar, parse + + +class DefaultCoordinate(Coordinate, name="default"): + def __new__(cls, *args, **kwargs): + return object.__new__(cls) + + def __init__(self, data=None, dim=None, dtype=None): + # empty + if data is None: + data = {"size": 0} + + # parse data + data, dim = parse(data, dim) + if not self.isvalid(data): + raise TypeError("`data` must be a mapping {'size': }") + + # check dtype + if dtype is not None: + raise ValueError("`dtype` is not supported for DefaultCoordinate") + + # store data + self.data = data + self.dim = dim + + @property + def empty(self): + return not bool(self.data["size"]) + + @property + def dtype(self): + return np.int64 + + @property + def ndim(self): + return 1 + + @property + def shape(self): + return (len(self),) + + @staticmethod + def isvalid(data): + match data: + case {"size": None | int(_)}: + return True + case _: + return False + + def __len__(self): + if self.data["size"] is None: + return 0 + else: + return self.data["size"] + + def __getitem__(self, item): + data = self.__array__()[item] + dim = None if isscalar(data) else self.dim + return Coordinate(data, dim) + + def __array__(self, dtype=None): + return np.arange(self.data["size"], dtype=dtype) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + raise NotImplementedError + + def __array_function__(self, func, types, args, kwargs): + raise NotImplementedError + + def isdefault(self): + return True + + def get_sampling_interval(self, cast=True): + return 1 + + def equals(self, other): + if isinstance(other, self.__class__): + return self.data["size"] == other.data["size"] + + def get_indexer(self, value, method=None): + return value + + def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): + return slice(start, stop, step) + + def append(self, other): + if not isinstance(other, self.__class__): + raise TypeError(f"cannot append {type(other)} to {self.__class__}") + if not self.dim == other.dim: + raise ValueError("cannot append coordinate with different dimension") + return self.__class__({"size": len(self) + len(other)}, self.dim) + + def to_dict(self): + return {"dim": self.dim, "data": dict(self.data), "dtype": str(self.dtype)} diff --git a/xdas/coordinates/dense.py b/xdas/coordinates/dense.py new file mode 100644 index 00000000..ff8c5357 --- /dev/null +++ b/xdas/coordinates/dense.py @@ -0,0 +1,118 @@ +import numpy as np +import pandas as pd + +from .core import Coordinate, parse + + 
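`DefaultCoordinate` above is the placeholder used for a dimension that has a known size but no attached values. A short sketch of its behaviour (illustrative; it follows directly from `__array__` returning `np.arange(size)` and `get_indexer` being the identity):

```python
import numpy as np
from xdas.coordinates import DefaultCoordinate

coord = DefaultCoordinate({"size": 5}, dim="time")

len(coord)            # 5
np.asarray(coord)     # array([0, 1, 2, 3, 4])
coord.get_indexer(3)  # 3: values and positions coincide
coord.append(DefaultCoordinate({"size": 2}, dim="time")).data  # {'size': 7}
```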
+class DenseCoordinate(Coordinate, name="dense"): + def __new__(cls, *args, **kwargs): + return object.__new__(cls) + + def __init__(self, data=None, dim=None, dtype=None): + # empty + if data is None: + data = [] + + # parse data + data, dim = parse(data, dim) + if not self.isvalid(data): + raise TypeError("`data` must be array-like") + + # store data + self.data = np.asarray(data, dtype=dtype) + self.dim = dim + + @property + def index(self): + return pd.Index(self.data) + + @staticmethod + def isvalid(data): + data = np.asarray(data) + return (data.dtype != np.dtype(object)) and (data.ndim == 1) + + def isdense(self): + return True + + def equals(self, other): + if isinstance(other, self.__class__): + return ( + np.array_equal(self.data, other.data) + and self.dim == other.dim + and self.dtype == other.dtype + ) + else: + return False + + def get_indexer(self, value, method=None): + if np.isscalar(value): + out = self.index.get_indexer([value], method).item() + else: + out = self.index.get_indexer(value, method) + if np.any(out == -1): + raise KeyError("index not found") + return out + + def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): + slc = self.index.slice_indexer(start, stop, step) + if ( + (not endpoint) + and (stop is not None) + and (self[slc.stop - 1].values == stop) + ): + slc = slice(slc.start, slc.stop - 1, slc.step) + return slc + + def append(self, other): + if not isinstance(other, self.__class__): + raise TypeError(f"cannot append {type(other)} to {self.__class__}") + if not self.dim == other.dim: + raise ValueError("cannot append coordinate with different dimension") + if self.empty: + return other + if other.empty: + return self + if not self.dtype == other.dtype: + raise ValueError("cannot append coordinate with different dtype") + return self.__class__(np.concatenate([self.data, other.data]), self.dim) + + def get_div_points(self, tolerance=None): + deltas = np.diff(self.data) + if tolerance is not None: + div_points = np.nonzero(np.abs(deltas) >= tolerance)[0] + 1 + else: + raise NotImplementedError( + "get_div_points without tolerance is not implemented for DenseCoordinate" + ) + div_points = np.concatenate(([0], div_points, [len(self)])) + return div_points + + def to_dict(self): + if np.issubdtype(self.dtype, np.datetime64): + data = self.data.astype(str).tolist() + else: + data = self.data.tolist() + return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} + + @classmethod + def from_dataset(cls, dataset, name): + return { + name: ( + ( + coord.dims[0], + ( + coord.values.astype("U") + if coord.dtype == np.dtype("O") + else coord.values + ), + ) + if coord.dims + else coord.values + ) + for name, coord in dataset[name].coords.items() + } + + @classmethod + def from_block(cls, start, size, step, dim=None, dtype=None): + data = start + step * np.arange(size) + return cls(data, dim=dim, dtype=dtype) diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py new file mode 100644 index 00000000..741e4b43 --- /dev/null +++ b/xdas/coordinates/interp.py @@ -0,0 +1,395 @@ +import re + +import numpy as np +import pandas as pd +from xinterp import forward, inverse + +from .core import Coordinate, format_datetime, is_strictly_increasing, parse + + +class InterpCoordinate(Coordinate, name="interpolated"): + """ + Array-like object used to represent piecewise evenly spaced coordinates using the + CF convention. 
+ + The coordinate ticks are described by means of tie points that are interpolated + when intermediate values are required. Coordinate objects provide label-based + selection methods. + + Parameters + ---------- + tie_indices : sequence of integers + The indices of the tie points. Must include index 0 and be strictly increasing. + tie_values : sequence of float or datetime64 + The values of the tie points. Must be strictly increasing to enable label-based + selection. The lengths of `tie_indices` and `tie_values` must match. + """ + + def __new__(cls, *args, **kwargs): + return object.__new__(cls) + + def __init__(self, data=None, dim=None, dtype=None): + # empty + if data is None: + data = {"tie_indices": [], "tie_values": []} + + # parse data + data, dim = parse(data, dim) + if not self.__class__.isvalid(data): + raise TypeError("`data` must be dict-like") + if not set(data) == {"tie_indices", "tie_values"}: + raise ValueError( + "both `tie_indices` and `tie_values` key should be provided" + ) + tie_indices = np.asarray(data["tie_indices"]) + tie_values = np.asarray(data["tie_values"], dtype=dtype) + + # check shapes + if not tie_indices.ndim == 1: + raise ValueError("`tie_indices` must be 1D") + if not tie_values.ndim == 1: + raise ValueError("`tie_values` must be 1D") + if not len(tie_indices) == len(tie_values): + raise ValueError("`tie_indices` and `tie_values` must have the same length") + + # check dtypes + if not tie_indices.shape == (0,): + if not np.issubdtype(tie_indices.dtype, np.integer): + raise ValueError("`tie_indices` must be integer-like") + if not tie_indices[0] == 0: + raise ValueError("`tie_indices` must start with a zero") + if not is_strictly_increasing(tie_indices): + raise ValueError("`tie_indices` must be strictly increasing") + if not ( + np.issubdtype(tie_values.dtype, np.number) + or np.issubdtype(tie_values.dtype, np.datetime64) + ): + raise ValueError("`tie_values` must have either numeric or datetime dtype") + + # store data + tie_indices = tie_indices.astype(int) + self.data = dict(tie_indices=tie_indices, tie_values=tie_values) + self.dim = dim + + @property + def tie_indices(self): + return self.data["tie_indices"] + + @property + def tie_values(self): + return self.data["tie_values"] + + @property + def dtype(self): + return self.tie_values.dtype + + @property + def empty(self): + return self.tie_indices.shape == (0,) + + @property + def ndim(self): + return self.tie_values.ndim + + @property + def shape(self): + return (len(self),) + + @property + def indices(self): + if self.empty: + return np.array([], dtype="int") + else: + return np.arange(self.tie_indices[-1] + 1) + + @property + def values(self): + if self.empty: + return np.array([], dtype=self.dtype) + else: + return self.get_value(self.indices) + + @staticmethod + def isvalid(data): + match data: + case {"tie_indices": _, "tie_values": _}: + return True + case _: + return False + + def __len__(self): + if self.empty: + return 0 + else: + return self.tie_indices[-1] - self.tie_indices[0] + 1 + + def __repr__(self): + if len(self) == 0: + return "empty coordinate" + elif len(self) == 1: + return f"{self.tie_values[0]}" + else: + if np.issubdtype(self.dtype, np.floating): + return f"{self.tie_values[0]:.3f} to {self.tie_values[-1]:.3f}" + elif np.issubdtype(self.dtype, np.datetime64): + start = format_datetime(self.tie_values[0]) + end = format_datetime(self.tie_values[-1]) + return f"{start} to {end}" + else: + return f"{self.tie_values[0]} to {self.tie_values[-1]}" + + def 
__getitem__(self, item): + if isinstance(item, slice): + return self.slice_index(item) + elif np.isscalar(item): + return Coordinate(self.get_value(item), None) + else: + return Coordinate(self.get_value(item), self.dim) + + def __add__(self, other): + return self.__class__( + {"tie_indices": self.tie_indices, "tie_values": self.tie_values + other}, + self.dim, + ) + + def __sub__(self, other): + return self.__class__( + {"tie_indices": self.tie_indices, "tie_values": self.tie_values - other}, + self.dim, + ) + + def __array__(self, dtype=None): + out = self.values + if dtype is not None: + out = out.__array__(dtype) + return out + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + raise NotImplementedError + + def __array_function__(self, func, types, args, kwargs): + raise NotImplementedError + + def isinterp(self): + return True + + def get_sampling_interval(self, cast=True): + if len(self) < 2: + return None + num = np.diff(self.tie_values) + den = np.diff(self.tie_indices) + mask = den != 1 + num = num[mask] + den = den[mask] + delta = np.median(num / den) + if cast and np.issubdtype(delta.dtype, np.timedelta64): + delta = delta / np.timedelta64(1, "s") + return delta + + def equals(self, other): + return ( + np.array_equal(self.tie_indices, other.tie_indices) + and np.array_equal(self.tie_values, other.tie_values) + and self.dim == other.dim + and self.dtype == other.dtype + ) + + def get_value(self, index): + index = self.format_index(index) + return forward(index, self.tie_indices, self.tie_values) + + def slice_index(self, index_slice): + start_index, stop_index, step_index = index_slice.indices(len(self)) + if step_index < 0: + raise NotImplementedError("negative slice step is not implemented") + if stop_index - start_index <= 0: + return self.__class__(dict(tie_indices=[], tie_values=[]), dim=self.dim) + elif (stop_index - start_index) <= step_index: + tie_indices = [0] + tie_values = [self.get_value(start_index)] + return self.__class__( + dict(tie_indices=tie_indices, tie_values=tie_values), dim=self.dim + ) + else: + end_index = stop_index - 1 + start_value = self.get_value(start_index) + end_value = self.get_value(end_index) + mask = (start_index < self.tie_indices) & (self.tie_indices < end_index) + tie_indices = np.insert( + self.tie_indices[mask], + (0, self.tie_indices[mask].size), + (start_index, end_index), + ) + tie_values = np.insert( + self.tie_values[mask], + (0, self.tie_values[mask].size), + (start_value, end_value), + ) + tie_indices -= tie_indices[0] + data = {"tie_indices": tie_indices, "tie_values": tie_values} + coord = self.__class__(data, self.dim) + if step_index != 1: + coord = coord.decimate(step_index) + return coord + + def get_indexer(self, value, method=None): + if isinstance(value, str): + value = np.datetime64(value) + else: + value = np.asarray(value) + try: + indexer = inverse(value, self.tie_indices, self.tie_values, method) + except ValueError as e: + if str(e) == "fp must be strictly increasing": + raise ValueError( + "overlaps were found in the coordinate. If this is due to some " + "jitter in the tie values, consider smoothing the coordinate by " + "including some tolerance. This can be done by " + "`da[dim] = da[dim].simplify(tolerance)`, or by specifying a " + "tolerance when opening multiple files." 
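The tie-point representation above keeps a long, evenly sampled axis as a pair of small arrays while still supporting value/index round trips. A sketch (illustrative; assumes `forward`/`inverse` from `xinterp` perform the linear interpolation the docstring describes):

```python
from xdas.coordinates import InterpCoordinate

# 1000 samples described by two tie points.
time = InterpCoordinate(
    {"tie_indices": [0, 999], "tie_values": [0.0, 999.0]}, dim="time"
)

len(time)                          # 1000
time.get_value(10)                 # 10.0, interpolated between the tie points
time.get_indexer(10.0)             # 10, exact inverse lookup
time.slice_index(slice(100, 200))  # a new InterpCoordinate of length 100
```

`simplify(tolerance)` then prunes redundant tie points with the `douglas_peucker` pass defined at the end of this module.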
+ ) + else: + raise e + return indexer + + def append(self, other): + if not isinstance(other, self.__class__): + raise TypeError(f"cannot append {type(other)} to {self.__class__}") + if not self.dim == other.dim: + raise ValueError("cannot append coordinate with different dimension") + if self.empty: + return other + if other.empty: + return self + if not self.dtype == other.dtype: + raise ValueError("cannot append coordinate with different dtype") + coord = self.__class__( + { + "tie_indices": np.append( + self.tie_indices, other.tie_indices + len(self) + ), + "tie_values": np.append(self.tie_values, other.tie_values), + }, + self.dim, + ) + return coord + + def decimate(self, q): + tie_indices = (self.tie_indices // q) * q + for k in range(1, len(tie_indices) - 1): + if tie_indices[k] == tie_indices[k - 1]: + tie_indices[k] += q + tie_values = [self.get_value(idx) for idx in tie_indices] + tie_indices //= q + return self.__class__( + dict(tie_indices=tie_indices, tie_values=tie_values), self.dim + ) + + def simplify(self, tolerance=None): + if tolerance is None: + if np.issubdtype(self.dtype, np.datetime64): + tolerance = np.timedelta64(0, "ns") + else: + tolerance = 0.0 + tie_indices, tie_values = douglas_peucker( + self.tie_indices, self.tie_values, tolerance + ) + return self.__class__( + dict(tie_indices=tie_indices, tie_values=tie_values), self.dim + ) + + def get_split_indices(self, tolerance=None): + (indices,) = np.nonzero(np.diff(self.tie_indices) == 1) + indices += 1 + if tolerance is not None: + deltas = self.tie_values[indices + 1] - self.tie_values[indices] + indices = indices[np.abs(deltas) >= tolerance] + return np.array( + [self.tie_indices[index] for index in indices], dtype=self.tie_indices.dtype + ) + + @classmethod + def from_array(cls, arr, dim=None, tolerance=None): + return cls( + {"tie_indices": np.arange(len(arr)), "tie_values": arr}, dim + ).simplify(tolerance) + + def to_dict(self): + tie_indices = self.data["tie_indices"] + tie_values = self.data["tie_values"] + if np.issubdtype(tie_values.dtype, np.datetime64): + tie_values = tie_values.astype(str) + data = { + "tie_indices": tie_indices.tolist(), + "tie_values": tie_values.tolist(), + } + return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} + + def to_dataset(self, dataset, attrs): + mapping = f"{self.name}: {self.name}_indices {self.name}_values" + if "coordinate_interpolation" in attrs: + attrs["coordinate_interpolation"] += " " + mapping + else: + attrs["coordinate_interpolation"] = mapping + tie_indices = self.tie_indices + tie_values = ( + self.tie_values.astype("M8[ns]") + if np.issubdtype(self.tie_values.dtype, np.datetime64) + else self.tie_values + ) + interp_attrs = { + "interpolation_name": "linear", + "tie_points_mapping": f"{self.name}_points: {self.name}_indices {self.name}_values", + } + dataset.update( + { + f"{self.name}_interpolation": ((), np.nan, interp_attrs), + f"{self.name}_indices": (f"{self.name}_points", tie_indices), + f"{self.name}_values": (f"{self.name}_points", tie_values), + } + ) + return dataset, attrs + + @classmethod + def from_dataset(cls, dataset, name): + coords = {} + mapping = dataset[name].attrs.pop("coordinate_interpolation", None) + if mapping is not None: + matches = re.findall(r"(\w+): (\w+) (\w+)", mapping) + for match in matches: + dim, indices, values = match + data = {"tie_indices": dataset[indices], "tie_values": dataset[values]} + coords[dim] = Coordinate(data, dim) + return coords + + @classmethod + def from_block(cls, start, size, step, 
dim=None, dtype=None): + return cls( + { + "tie_indices": [0, size - 1], + "tie_values": [start, start + step * (size - 1)], + }, + dim=dim, + ) + + +def douglas_peucker(x, y, epsilon): + mask = np.ones(len(x), dtype=bool) + stack = [(0, len(x))] + while stack: + start, stop = stack.pop() + ysimple = forward( + x[start:stop], + x[[start, stop - 1]], + y[[start, stop - 1]], + ) + d = np.abs(y[start:stop] - ysimple) + index = np.argmax(d) + dmax = d[index] + index += start + if dmax > epsilon: + stack.append([start, index + 1]) + stack.append([index, stop]) + else: + mask[start + 1 : stop - 1] = False + return x[mask], y[mask] diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py new file mode 100644 index 00000000..34485eaf --- /dev/null +++ b/xdas/coordinates/sampled.py @@ -0,0 +1,503 @@ +import re + +import numpy as np + +from .core import Coordinate, format_datetime, is_strictly_increasing, parse + +CODE_TO_UNITS = { + "h": "hours", + "m": "minutes", + "s": "seconds", + "ms": "milliseconds", + "us": "microseconds", + "ns": "nanoseconds", +} +UNITS_TO_CODE = {v: k for k, v in CODE_TO_UNITS.items()} + + +class SampledCoordinate(Coordinate, name="sampled"): + """ + A coordinate that is sampled at regular intervals. + + Parameters + ---------- + data : dict-like + The data of the coordinate. + dim : str, optional + The dimension name of the coordinate, by default None. + dtype : str or numpy.dtype, optional + The data type of the coordinate, by default None. + """ + + def __new__(cls, *args, **kwargs): + return object.__new__(cls) + + def __init__(self, data=None, dim=None, dtype=None): + # empty + if data is None: + data = {"tie_values": [], "tie_lengths": [], "sampling_interval": None} + empty = True + else: + empty = False + + # parse data + data, dim = parse(data, dim) + if not self.__class__.isvalid(data): + raise ValueError( + "`data` must be dict-like and contain `tie_values`, `tie_lengths`, and " + "`sampling_interval`" + ) + tie_values = np.asarray(data["tie_values"], dtype=dtype) + tie_lengths = np.asarray(data["tie_lengths"]) + sampling_interval = data["sampling_interval"] + + # check shapes + if not tie_values.ndim == 1: + raise ValueError("`tie_values` must be 1D") + if not tie_lengths.ndim == 1: + raise ValueError("`tie_lengths` must be 1D") + if not len(tie_values) == len(tie_lengths): + raise ValueError("`tie_values` and `tie_lengths` must have the same length") + + # check dtypes and values + if not empty: + # tie_values + if not ( + np.issubdtype(tie_values.dtype, np.number) + or np.issubdtype(tie_values.dtype, np.datetime64) + ): + raise ValueError( + "`tie_values` must have either numeric or datetime dtype" + ) + + # tie_lengths + if not np.issubdtype(tie_lengths.dtype, np.integer): + raise ValueError("`tie_lengths` must be integer-like") + if not np.all(tie_lengths > 0): + raise ValueError("`tie_lengths` must be strictly positive integers") + + # sampling_interval + if not np.isscalar(sampling_interval): + raise ValueError("`sampling_interval` must be a scalar value") + sampling_interval = np.asarray(sampling_interval)[()] # ensure numpy scalar + if np.issubdtype(tie_values.dtype, np.datetime64): + if not np.issubdtype( + np.asarray(sampling_interval).dtype, np.timedelta64 + ): + raise ValueError( + "`sampling_interval` must be timedelta64 for datetime64 `tie_values`" + ) + + # store data + self.data = { + "tie_values": tie_values, + "tie_lengths": tie_lengths, + "sampling_interval": sampling_interval, + } + self.dim = dim + + @property + def 
tie_values(self): + return self.data["tie_values"] + + @property + def tie_lengths(self): + return self.data["tie_lengths"] + + @property + def sampling_interval(self): + return self.data["sampling_interval"] + + @property + def dtype(self): + return self.tie_values.dtype + + @property + def tie_indices(self): + return np.concatenate(([0], np.cumsum(self.tie_lengths[:-1]))) + + @property + def empty(self): + return self.tie_values.shape == (0,) + + @property + def ndim(self): + return self.tie_values.ndim + + @property + def shape(self): + return (len(self),) + + @property + def indices(self): + if self.empty: + return np.array([], dtype="int") + else: + return np.arange(len(self)) + + @property + def values(self): + if self.empty: + return np.array([], dtype=self.dtype) + else: + return self.get_value(self.indices) + + @property + def start(self): + return self.tie_values[0] + + @property + def end(self): + return self.tie_values[-1] + self.sampling_interval * self.tie_lengths[-1] + + @staticmethod + def isvalid(data): + match data: + case { + "tie_values": _, + "tie_lengths": _, + "sampling_interval": _, + }: + return True + case _: + return False + + def __len__(self): + if self.empty: + return 0 + else: + return sum(self.tie_lengths) + + def __repr__(self): + if self.empty: + return "empty coordinate" + elif len(self) == 1: + return f"{self.tie_values[0]}" + else: + if np.issubdtype(self.dtype, np.floating): + return f"{self.start:.3f} to {self.end:.3f}" + elif np.issubdtype(self.dtype, np.datetime64): + start_str = format_datetime(self.start) + end_str = format_datetime(self.end) + return f"{start_str} to {end_str}" + else: + return f"{self.start} to {self.end}" + + def __getitem__(self, item): + if isinstance(item, slice): + return self.slice_index(item) + else: + return Coordinate( + self.get_value(item), None if np.isscalar(item) else self.dim + ) + + def __add__(self, other): + return self.__class__( + { + "tie_values": self.tie_values + other, + "tie_lengths": self.tie_lengths, + "sampling_interval": self.sampling_interval, + }, + self.dim, + ) + + def __sub__(self, other): + return self.__class__( + { + "tie_values": self.tie_values - other, + "tie_lengths": self.tie_lengths, + "sampling_interval": self.sampling_interval, + }, + self.dim, + ) + + def __array__(self, dtype=None): + out = self.values + if dtype is not None: + out = out.__array__(dtype) + return out + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + raise NotImplementedError + + def __array_function__(self, func, types, args, kwargs): + raise NotImplementedError + + def issampled(self): + return True + + def get_sampling_interval(self, cast=True): + delta = self.sampling_interval + if cast and np.issubdtype(delta.dtype, np.timedelta64): + delta = delta / np.timedelta64(1, "s") + return delta + + def equals(self, other): + return ( + np.array_equal(self.tie_values, other.tie_values) + and np.array_equal(self.tie_lengths, other.tie_lengths) + and self.sampling_interval == other.sampling_interval + and self.dim == other.dim + and self.dtype == other.dtype + ) + + def get_value(self, index): + index = self.format_index(index, bounds="raise") + reference = np.searchsorted(self.tie_indices, index, side="right") - 1 + return self.tie_values[reference] + ( + (index - self.tie_indices[reference]) * self.sampling_interval + ) + + def slice_index(self, index_slice): + # normalize slice + start, stop, step = index_slice.indices(len(self)) + + if step < 0: + raise NotImplementedError("negative slice step is not 
implemented") + + # align stop + stop += (start - stop) % step # TODO: check for negative step + + # get relative start and stop within each tie + q, r = np.divmod(start - self.tie_indices, step) + lo = np.maximum(q, 0) * step + r + + q, r = np.divmod(self.tie_indices + self.tie_lengths - stop, step) + hi = self.tie_lengths - np.maximum(q, 0) * step + r + + # filter empty segments + mask = hi > lo + lo = lo[mask] + hi = hi[mask] + + # compute new tie values, tie lengths and sampling interval + tie_values = self.tie_values[mask] + lo * self.sampling_interval + tie_lengths = (hi - lo) // step + sampling_interval = self.sampling_interval * step + + # build new coordinate + data = { + "tie_values": tie_values, + "tie_lengths": tie_lengths, + "sampling_interval": sampling_interval, + } + return self.__class__(data, self.dim) + + def get_indexer(self, value, method=None): + if isinstance(value, str): + value = np.datetime64(value) + else: + value = np.asarray(value) + if not is_strictly_increasing( + self.tie_values + ): # TODO: make it work even in this case + raise ValueError("tie_values must be strictly increasing") + + # find preceeding tie point + reference = np.searchsorted(self.tie_values, value, side="right") - 1 + reference = np.maximum(reference, 0) + + # overlaps + before = np.maximum(reference - 1, 0) + end = ( + self.tie_values[before] + + (self.tie_lengths[before] - 1) * self.sampling_interval + ) + if np.any((reference > 0) & (value <= end)): + raise KeyError("value is in an overlap region") + + # gap + after = np.minimum(reference + 1, len(self.tie_values) - 1) + end = ( + self.tie_values[reference] + + (self.tie_lengths[reference] - 1) * self.sampling_interval + ) + match method: + case "nearest": + mask = (reference < len(self.tie_values) - 1) & ( + value - end >= self.tie_values[after] - value + ) + reference = np.where(mask, after, reference) + case "bfill": + mask = (reference < len(self.tie_values) - 1) & (value >= end) + reference = np.where(mask, after, reference) + case "ffill" | None: + pass + case _: + raise ValueError( + "method must be one of `None`, 'nearest', 'ffill', or 'bfill'" + ) + + offset = (value - self.tie_values[reference]) / self.sampling_interval + + match method: + case None: + if np.any( + (offset % 1 != 0) + | (offset < 0) + | (offset >= self.tie_lengths[reference]) + ): + raise KeyError("index not found") + offset = offset.astype(int) + case "nearest": + offset = np.round(offset).astype(int) + offset = np.clip(offset, 0, self.tie_lengths[reference] - 1) + case "ffill": + offset = np.floor(offset).astype(int) + if np.any(offset < 0): + raise KeyError("index not found") + offset = np.minimum(offset, self.tie_lengths[reference] - 1) + case "bfill": + offset = np.ceil(offset).astype(int) + if np.any(offset > self.tie_lengths[reference] - 1): + raise KeyError("index not found") + offset = np.maximum(offset, 0) + return self.tie_indices[reference] + offset + + def append(self, other): + if not isinstance(other, self.__class__): + raise TypeError(f"cannot append {type(other)} to {self.__class__}") + if not self.dim == other.dim: + raise ValueError("cannot append coordinate with different dimension") + if self.empty: + return other + if other.empty: + return self + if not self.dtype == other.dtype: + raise ValueError("cannot append coordinate with different dtype") + if not self.sampling_interval == other.sampling_interval: + raise ValueError( + "cannot append coordinate with different sampling intervals" + ) + tie_values = np.concatenate([self.tie_values, 
other.tie_values]) + tie_lengths = np.concatenate([self.tie_lengths, other.tie_lengths]) + return self.__class__( + { + "tie_values": tie_values, + "tie_lengths": tie_lengths, + "sampling_interval": self.sampling_interval, + }, + self.dim, + ) + + def decimate(self, q): + return self[::q] + + def simplify(self, tolerance=None): + if tolerance is None: + tolerance = np.array(0, dtype=self.sampling_interval.dtype)[()] + tie_values = [self.tie_values[0]] + tie_lengths = [self.tie_lengths[0]] + for value, length in zip(self.tie_values[1:], self.tie_lengths[1:]): + delta = value - (tie_values[-1] + self.sampling_interval * tie_lengths[-1]) + if np.abs(delta) <= tolerance: + tie_lengths[-1] += length + else: + tie_values.append(value) + tie_lengths.append(length) + return self.__class__( + { + "tie_values": np.array(tie_values), + "tie_lengths": np.array(tie_lengths), + "sampling_interval": self.sampling_interval, + }, + self.dim, + ) + + def get_split_indices(self, tolerance=None): + indices = self.tie_indices[1:] + if tolerance is not None: + deltas = self.tie_values[1:] - ( + self.tie_values[:-1] + self.sampling_interval * self.tie_lengths[:-1] + ) + indices = indices[np.abs(deltas) > tolerance] + return indices + + @classmethod + def from_array(cls, arr, dim=None, sampling_interval=None): + raise NotImplementedError("from_array is not implemented for SampledCoordinate") + + def to_dict(self): + tie_values = self.data["tie_values"] + tie_lengths = self.data["tie_lengths"] + if np.issubdtype(tie_values.dtype, np.datetime64): + tie_values = tie_values.astype(str) + data = { + "tie_values": tie_values.tolist(), + "tie_lengths": tie_lengths.tolist(), + "sampling_interval": self.sampling_interval, + } + return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} + + def to_dataset(self, dataset, attrs): + mapping = f"{self.name}: {self.name}_sampling" + if "coordinate_sampling" in attrs: + attrs["coordinate_sampling"] += " " + mapping + else: + attrs["coordinate_sampling"] = mapping + tie_values = ( + self.tie_values.astype("M8[ns]") + if np.issubdtype(self.tie_values.dtype, np.datetime64) + else self.tie_values + ) + tie_lengths = self.tie_lengths + interp_attrs = { + "tie_point_mapping": f"{self.dim}: {self.name}_values {self.name}_lengths", + } + + # timedelta + if np.issubdtype(self.sampling_interval.dtype, np.timedelta64): + code, count = np.datetime_data(self.sampling_interval.dtype) + interp_attrs["dtype"] = "timedelta64[ns]" + interp_attrs["units"] = CODE_TO_UNITS[code] + sampling_interval = count * self.sampling_interval.astype(int) + else: + sampling_interval = self.sampling_interval + + dataset.update( + { + f"{self.name}_sampling": ((), sampling_interval, interp_attrs), + f"{self.name}_values": (f"{self.name}_points", tie_values), + f"{self.name}_lengths": (f"{self.name}_points", tie_lengths), + } + ) + return dataset, attrs + + @classmethod + def from_dataset(cls, dataset, name): + coords = {} + mapping = dataset[name].attrs.pop("coordinate_sampling", None) + if mapping is not None: + matches = re.findall(r"(\w+): (\w+)", mapping) + for match in matches: + name, sampling = match + dim, values, lengths = re.match( + r"(\w+): (\w+) (\w+)", dataset[sampling].attrs["tie_point_mapping"] + ).groups() + data = { + "tie_values": dataset[values].values, + "tie_lengths": dataset[lengths].values, + "sampling_interval": dataset[sampling].values[()], + } + + # timedelta + if ( + "dtype" in dataset[sampling].attrs + and "units" in dataset[sampling].attrs + ): + data["sampling_interval"] = 
np.timedelta64( + data["sampling_interval"], + UNITS_TO_CODE[dataset[sampling].attrs.pop("units")], + ).astype(dataset[sampling].attrs.pop("dtype")) + + coords[name] = Coordinate(data, dim) + return coords + + @classmethod + def from_block(cls, start, size, step, dim=None, dtype=None): + data = { + "tie_values": [start], + "tie_lengths": [size], + "sampling_interval": step, + } + return cls(data, dim=dim, dtype=dtype) diff --git a/xdas/coordinates/scalar.py b/xdas/coordinates/scalar.py new file mode 100644 index 00000000..8a97da47 --- /dev/null +++ b/xdas/coordinates/scalar.py @@ -0,0 +1,54 @@ +import numpy as np + +from .core import Coordinate, parse + + +class ScalarCoordinate(Coordinate): + def __new__(cls, *args, **kwargs): + return object.__new__(cls) + + def __init__(self, data=None, dim=None, dtype=None): + if data is None: + raise TypeError("scalar coordinate cannot be empty, please provide a value") + data, dim = parse(data, dim) + if dim is not None: + raise ValueError("a scalar coordinate cannot be a dim") + if not self.__class__.isvalid(data): + raise TypeError("`data` must be scalar-like") + self.data = np.asarray(data, dtype=dtype) + + @property + def dim(self): + return None + + @dim.setter + def dim(self, value): + if value is not None: + raise ValueError("A scalar coordinate cannot have a `dim` other than None") + + @staticmethod + def isvalid(data): + data = np.asarray(data) + return (data.dtype != np.dtype(object)) and (data.ndim == 0) + + def isscalar(self): + return True + + def get_sampling_interval(self, cast=True): + return None + + def equals(self, other): + if isinstance(other, self.__class__): + return self.data == other.data + else: + return False + + def to_index(self, item, method=None, endpoint=True): + raise NotImplementedError("cannot get index of scalar coordinate") + + def to_dict(self): + if np.issubdtype(self.dtype, np.datetime64): + data = self.data.astype(str).item() + else: + data = self.data.item() + return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py deleted file mode 100644 index 8f251821..00000000 --- a/xdas/core/coordinates.py +++ /dev/null @@ -1,1104 +0,0 @@ -from copy import copy, deepcopy -from functools import wraps - -import numpy as np -import pandas as pd -from xinterp import forward, inverse - - -def wraps_first_last(func): - @wraps(func) - def wrapper(self, dim, *args, **kwargs): - if dim == "first": - dim = self._dims[0] - if dim == "last": - dim = self._dims[-1] - return func(self, dim, *args, **kwargs) - - return wrapper - - -class Coordinates(dict): - """ - Dictionary like container for coordinates. - - Parameters - ---------- - coords: dict-like, optional - Mapping from coordinate names to any of the followings: - - - Coordinate objects - - tuples (dim, coordinate-like) which can be either dimensional (`dim == name`) - or non-dimensional (`dim != name` or `dim == None`). - - coordinate-like objects (that are passed to the Coordinate constructor) - which are assumed to be a dimensional coordinate with `dim` set to the - related name. - - dims: squence of str, optional - An ordered sequence of dimensions. It is meant to match the dimensionality of - its associated data. If provided, it must at least include all dimensions found - in `coords` (extras dimensions will be considered as empty coordinates). - Otherwise, dimensions will be guessed from `coords`. - - Examples - -------- - >>> import xdas - - >>> coords = { - ... 
"time": {"tie_indices": [0, 999], "tie_values": [0.0, 10.0]}, - ... "distance": [0, 1, 2], - ... "channel": ("distance", ["DAS01", "DAS02", "DAS03"]), - ... "interrogator": (None, "SRN"), - ... } - >>> xdas.Coordinates(coords) - Coordinates: - * time (time): 0.000 to 10.000 - * distance (distance): [0 ... 2] - channel (distance): ['DAS01' ... 'DAS03'] - interrogator: 'SRN' - """ - - def __init__(self, coords=None, dims=None): - super().__init__() - if isinstance(coords, Coordinates): - if dims is None: - dims = coords.dims - coords = dict(coords) - self._dims = () if dims is None else tuple(dims) - self._parent = None - if coords is not None: - for name in coords: - self[name] = coords[name] - - @wraps_first_last - def __getitem__(self, key): - if key in self.dims and key not in self: - raise KeyError(f"dimension {key} has no coordinate") - return super().__getitem__(key) - - @wraps_first_last - def __setitem__(self, key, value): - if not isinstance(key, str): - raise TypeError("dimension names must be of type str") - coord = Coordinate(value) - if coord.dim is None and not coord.isscalar(): - coord.dim = key - if self._parent is None: - if coord.dim is not None and coord.dim not in self.dims: - self._dims = self.dims + (coord.dim,) - else: - if coord.dim is not None: - if coord.dim not in self.dims: - raise KeyError( - f"cannot add new dimension {coord.dim} to an existing DataArray" - ) - size = self._parent.sizes[coord.dim] - if not len(coord) == size: - raise ValueError( - f"conflicting sizes for dimension {coord.dim}: size {len(coord)} " - f"in `coords` and size {size} in `data`" - ) - coord._parent = self - return super().__setitem__(key, coord) - - def __repr__(self): - lines = ["Coordinates:"] - for name, coord in self.items(): - if self.isdim(name): - lines.append(f" * {name} ({coord.dim}): {coord}") - else: - if coord.dim is None: - lines.append(f" {name}: {coord}") - else: - lines.append(f" {name} ({coord.dim}): {coord}") - return "\n".join(lines) - - def __reduce__(self): - return self.__class__, (dict(self), self.dims), {"_parent": self._parent} - - @property - def dims(self): - return self._dims - - def isdim(self, name): - return self[name].dim == name - - def get_query(self, item): - """ - Format a query from one or multiple indexer. - - Parameters - ---------- - item: indexer-like or sequence or mapping - Object to be parsed as a query. If item is indexer-like object, it is - applied on the first dimension. If item is a sequence, positional indexing - is performed. If item is a mapping, labeled indexing is performed. - - Returns - ------- - dict: - A mapping between each dim and a given indexer. If No indexer was found for - a given dim, slice(None) will be used. - """ - query = {dim: slice(None) for dim in self.dims} - if isinstance(item, dict): - if "first" in item: - item[self.dims[0]] = item.pop("first") - if "last" in item: - item[self.dims[-1]] = item.pop("last") - query.update(item) - elif isinstance(item, tuple): - for k in range(len(item)): - query[self.dims[k]] = item[k] - else: - query[self.dims[0]] = item - for dim, item in query.items(): - if isinstance(item, tuple): - msg = f"cannot use tuple {item} to index dim '{dim}'" - if len(item) == 2: - msg += f". Did you mean: {dim}=slice({item[0]}, {item[1]})?" 
- raise TypeError(msg) - return query - - def to_index(self, item, method=None, endpoint=True): - query = self.get_query(item) - return {dim: self[dim].to_index(query[dim], method, endpoint) for dim in query} - - def equals(self, other): - if not isinstance(other, Coordinates): - return False - for name in self: - if not self[name].equals(other[name]): - return False - return True - - def to_dict(self): - """Convert this `Coordinates` object into a pure python dictionnary. - - Examples - -------- - - >>> import xdas - - >>> coords = xdas.Coordinates( - ... { - ... "time": {"tie_indices": [0, 999], "tie_values": [0.0, 10.0]}, - ... "distance": [0, 1, 2], - ... "channel": ("distance", ["DAS01", "DAS02", "DAS03"]), - ... "interrogator": (None, "SRN"), - ... } - ... ) - >>> coords.to_dict() - {'dims': ('time', 'distance'), - 'coords': {'time': {'dim': 'time', - 'data': {'tie_indices': [0, 999], 'tie_values': [0.0, 10.0]}, - 'dtype': 'float64'}, - 'distance': {'dim': 'distance', 'data': [0, 1, 2], 'dtype': 'int64'}, - 'channel': {'dim': 'distance', - 'data': ['DAS01', 'DAS02', 'DAS03'], - 'dtype': '}") - if dtype is not None: - raise ValueError("`dtype` is not supported for DefaultCoordinate") - self.data = data - self.dim = dim - - def __len__(self): - if self.data["size"] is None: - return 0 - else: - return self.data["size"] - - def __getitem__(self, item): - data = self.__array__()[item] - if ScalarCoordinate.isvalid(data): - return ScalarCoordinate(data) - else: - return Coordinate(data, self.dim) - - def __array__(self, dtype=None): - return np.arange(self.data["size"], dtype=dtype) - - @staticmethod - def isvalid(data): - match data: - case {"size": None | int(_)}: - return True - case _: - return False - - @property - def empty(self): - return bool(self.data["size"]) - - @property - def dtype(self): - return np.int64 - - @property - def ndim(self): - return 1 - - @property - def shape(self): - return (len(self),) - - def equals(self, other): - if isinstance(other, self.__class__): - return self.data["size"] == other.data["size"] - - def get_indexer(self, value, method=None): - return value - - def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): - return slice(start, stop, step) - - def append(self, other): - if not isinstance(other, self.__class__): - raise TypeError(f"cannot append {type(other)} to {self.__class__}") - if not self.dim == other.dim: - raise ValueError("cannot append coordinate with different dimension") - return self.__class__({"size": len(self) + len(other)}, self.dim) - - def to_dict(self): - return {"dim": self.dim, "data": self.data.tolist(), "dtype": str(self.dtype)} - - -class DenseCoordinate(Coordinate): - def __new__(cls, *args, **kwargs): - return object.__new__(cls) - - def __init__(self, data=None, dim=None, dtype=None): - if data is None: - data = [] - data, dim = parse(data, dim) - if not self.isvalid(data): - raise TypeError("`data` must be array-like") - self.data = np.asarray(data, dtype=dtype) - self.dim = dim - - @staticmethod - def isvalid(data): - data = np.asarray(data) - return (data.dtype != np.dtype(object)) and (data.ndim == 1) - - @property - def index(self): - return pd.Index(self.data) - - def equals(self, other): - if isinstance(other, self.__class__): - return ( - np.array_equal(self.data, other.data) - and self.dim == other.dim - and self.dtype == other.dtype - ) - else: - return False - - def get_indexer(self, value, method=None): - if np.isscalar(value): - out = self.index.get_indexer([value], method).item() - else: 
-            out = self.index.get_indexer(value, method)
-        if np.any(out == -1):
-            raise KeyError("index not found")
-        return out
-
-    def slice_indexer(self, start=None, stop=None, step=None, endpoint=True):
-        slc = self.index.slice_indexer(start, stop, step)
-        if (
-            (not endpoint)
-            and (stop is not None)
-            and (self[slc.stop - 1].values == stop)
-        ):
-            slc = slice(slc.start, slc.stop - 1, slc.step)
-        return slc
-
-    def append(self, other):
-        if not isinstance(other, self.__class__):
-            raise TypeError(f"cannot append {type(other)} to {self.__class__}")
-        if not self.dim == other.dim:
-            raise ValueError("cannot append coordinate with different dimension")
-        if self.empty:
-            return other
-        if other.empty:
-            return self
-        if not self.dtype == other.dtype:
-            raise ValueError("cannot append coordinate with different dtype")
-        return self.__class__(np.concatenate([self.data, other.data]), self.dim)
-
-    def to_dict(self):
-        if np.issubdtype(self.dtype, np.datetime64):
-            data = self.data.astype(str).tolist()
-        else:
-            data = self.data.tolist()
-        return {"dim": self.dim, "data": data, "dtype": str(self.dtype)}
-
-
-class InterpCoordinate(Coordinate):
-    """
-    Array-like object used to represent piecewise evenly spaced coordinates using the
-    CF convention.
-
-    The coordinate ticks are described by means of tie points that are interpolated
-    when intermediate values are required. Coordinate objects provide label-based
-    selection methods.
-
-    Parameters
-    ----------
-    tie_indices : sequence of integers
-        The indices of the tie points. Must include index 0 and be strictly increasing.
-    tie_values : sequence of float or datetime64
-        The values of the tie points. Must be strictly increasing to enable label-based
-        selection. `tie_indices` and `tie_values` must have the same length.
- """ - - def __new__(cls, *args, **kwargs): - return object.__new__(cls) - - def __init__(self, data=None, dim=None, dtype=None): - if data is None: - data = {"tie_indices": [], "tie_values": []} - data, dim = parse(data, dim) - if not self.__class__.isvalid(data): - raise TypeError("`data` must be dict-like") - if not set(data) == {"tie_indices", "tie_values"}: - raise ValueError( - "both `tie_indices` and `tie_values` key should be provided" - ) - tie_indices = np.asarray(data["tie_indices"]) - tie_values = np.asarray(data["tie_values"], dtype=dtype) - if not tie_indices.ndim == 1: - raise ValueError("`tie_indices` must be 1D") - if not tie_values.ndim == 1: - raise ValueError("`tie_values` must be 1D") - if not len(tie_indices) == len(tie_values): - raise ValueError("`tie_indices` and `tie_values` must have the same length") - if not tie_indices.shape == (0,): - if not np.issubdtype(tie_indices.dtype, np.integer): - raise ValueError("`tie_indices` must be integer-like") - if not tie_indices[0] == 0: - raise ValueError("`tie_indices` must start with a zero") - if not is_strictly_increasing(tie_indices): - raise ValueError("`tie_indices` must be strictly increasing") - if not ( - np.issubdtype(tie_values.dtype, np.number) - or np.issubdtype(tie_values.dtype, np.datetime64) - ): - raise ValueError("`tie_values` must have either numeric or datetime dtype") - tie_indices = tie_indices.astype(int) - self.data = dict(tie_indices=tie_indices, tie_values=tie_values) - self.dim = dim - - @staticmethod - def isvalid(data): - match data: - case {"tie_indices": _, "tie_values": _}: - return True - case _: - return False - - def __len__(self): - if self.empty: - return 0 - else: - return self.tie_indices[-1] - self.tie_indices[0] + 1 - - def __repr__(self): - if len(self) == 0: - return "empty coordinate" - elif len(self) == 1: - return f"{self.tie_values[0]}" - else: - if np.issubdtype(self.dtype, np.floating): - return f"{self.tie_values[0]:.3f} to {self.tie_values[-1]:.3f}" - elif np.issubdtype(self.dtype, np.datetime64): - start = format_datetime(self.tie_values[0]) - end = format_datetime(self.tie_values[-1]) - return f"{start} to {end}" - else: - return f"{self.tie_values[0]} to {self.tie_values[-1]}" - - def __getitem__(self, item): - if isinstance(item, slice): - return self.slice_index(item) - elif np.isscalar(item): - return ScalarCoordinate(self.get_value(item), None) - else: - return DenseCoordinate(self.get_value(item), self.dim) - - def __add__(self, other): - return self.__class__( - {"tie_indices": self.tie_indices, "tie_values": self.tie_values + other}, - self.dim, - ) - - def __sub__(self, other): - return self.__class__( - {"tie_indices": self.tie_indices, "tie_values": self.tie_values - other}, - self.dim, - ) - - def __array__(self, dtype=None): - out = self.values - if dtype is not None: - out = out.__array__(dtype) - return out - - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - raise NotImplementedError - - def __array_function__(self, func, types, args, kwargs): - raise NotImplementedError - - @property - def tie_indices(self): - return self.data["tie_indices"] - - @property - def tie_values(self): - return self.data["tie_values"] - - @property - def empty(self): - return self.tie_indices.shape == (0,) - - @property - def dtype(self): - return self.tie_values.dtype - - @property - def ndim(self): - return self.tie_values.ndim - - @property - def shape(self): - return (len(self),) - - @property - def indices(self): - if self.empty: - return np.array([], 
dtype="int") - else: - return np.arange(self.tie_indices[-1] + 1) - - @property - def values(self): - if self.empty: - return np.array([], dtype=self.dtype) - else: - return self.get_value(self.indices) - - def equals(self, other): - return ( - np.array_equal(self.tie_indices, other.tie_indices) - and np.array_equal(self.tie_values, other.tie_values) - and self.dim == other.dim - and self.dtype == other.dtype - ) - - def get_value(self, index): - index = self.format_index(index) - return forward(index, self.tie_indices, self.tie_values) - - def format_index(self, idx, bounds="raise"): - idx = np.asarray(idx) - if not np.issubdtype(idx.dtype, np.integer): - raise IndexError("only integer are valid index") - idx = idx + (idx < 0) * len(self) - if bounds == "raise": - if np.any(idx < 0) or np.any(idx >= len(self)): - raise IndexError("index is out of bounds") - elif bounds == "clip": - idx = np.clip(idx, 0, len(self)) - return idx - - def slice_index(self, index_slice): - index_slice = self.format_index_slice(index_slice) - start_index, stop_index, step_index = ( - index_slice.start, - index_slice.stop, - index_slice.step, - ) - if stop_index - start_index <= 0: - return self.__class__(dict(tie_indices=[], tie_values=[])) - elif (stop_index - start_index) <= step_index: - tie_indices = [0] - tie_values = [self.get_value(start_index)] - return self.__class__(dict(tie_indices=tie_indices, tie_values=tie_values)) - else: - end_index = stop_index - 1 - start_value = self.get_value(start_index) - end_value = self.get_value(end_index) - mask = (start_index < self.tie_indices) & (self.tie_indices < end_index) - tie_indices = np.insert( - self.tie_indices[mask], - (0, self.tie_indices[mask].size), - (start_index, end_index), - ) - tie_values = np.insert( - self.tie_values[mask], - (0, self.tie_values[mask].size), - (start_value, end_value), - ) - tie_indices -= tie_indices[0] - data = {"tie_indices": tie_indices, "tie_values": tie_values} - coord = self.__class__(data, self.dim) - if step_index != 1: - coord = coord.decimate(step_index) - return coord - - def format_index_slice(self, slc): - start = slc.start - stop = slc.stop - step = slc.step - if start is None: - start = 0 - if stop is None: - stop = len(self) - if step is None: - step = 1 - start = self.format_index(start, bounds="clip") - stop = self.format_index(stop, bounds="clip") - return slice(start, stop, step) - - def get_indexer(self, value, method=None): - if isinstance(value, str): - value = np.datetime64(value) - else: - value = np.asarray(value) - try: - indexer = inverse(value, self.tie_indices, self.tie_values, method) - except ValueError as e: - if str(e) == "fp must be strictly increasing": - raise ValueError( - "overlaps were found in the coordinate. If this is due to some " - "jitter in the tie values, consider smoothing the coordinate by " - "including some tolerance. This can be done by " - "`da[dim] = da[dim].simplify(tolerance)`, or by specifying a " - "tolerance when opening multiple files." 
-                )
-            else:
-                raise e
-        return indexer
-
-    def slice_indexer(self, start=None, stop=None, step=None, endpoint=True):
-        if start is not None:
-            try:
-                start_index = self.get_indexer(start, method="bfill")
-            except KeyError:
-                start_index = len(self)
-        else:
-            start_index = None
-        if stop is not None:
-            try:
-                end_index = self.get_indexer(stop, method="ffill")
-                stop_index = end_index + 1
-            except KeyError:
-                stop_index = 0
-        else:
-            stop_index = None
-        if step is not None:
-            raise NotImplementedError("cannot use step yet")
-        if (
-            (not endpoint)
-            and (stop is not None)
-            and (self[stop_index - 1].values == stop)
-        ):
-            stop_index -= 1
-        return slice(start_index, stop_index)
-
-    def append(self, other):
-        if not isinstance(other, self.__class__):
-            raise TypeError(f"cannot append {type(other)} to {self.__class__}")
-        if not self.dim == other.dim:
-            raise ValueError("cannot append coordinate with different dimension")
-        if self.empty:
-            return other
-        if other.empty:
-            return self
-        if not self.dtype == other.dtype:
-            raise ValueError("cannot append coordinate with different dtype")
-        coord = self.__class__(
-            {
-                "tie_indices": np.append(
-                    self.tie_indices, other.tie_indices + len(self)
-                ),
-                "tie_values": np.append(self.tie_values, other.tie_values),
-            },
-            self.dim,
-        )
-        return coord
-
-    def decimate(self, q):
-        tie_indices = (self.tie_indices // q) * q
-        for k in range(1, len(tie_indices) - 1):
-            if tie_indices[k] == tie_indices[k - 1]:
-                tie_indices[k] += q
-        tie_values = [self.get_value(idx) for idx in tie_indices]
-        tie_indices //= q
-        return self.__class__(
-            dict(tie_indices=tie_indices, tie_values=tie_values), self.dim
-        )
-
-    def simplify(self, tolerance=None):
-        if tolerance is None:
-            if np.issubdtype(self.dtype, np.datetime64):
-                tolerance = np.timedelta64(0, "ns")
-            else:
-                tolerance = 0.0
-        tie_indices, tie_values = douglas_peucker(
-            self.tie_indices, self.tie_values, tolerance
-        )
-        return self.__class__(
-            dict(tie_indices=tie_indices, tie_values=tie_values), self.dim
-        )
-
-    def get_discontinuities(self):
-        """
-        Returns a DataFrame containing information about the discontinuities.
-
-        Returns
-        -------
-        pandas.DataFrame
-            A DataFrame with the following columns:
-
-            - start_index : int
-                The index where the discontinuity starts.
-            - end_index : int
-                The index where the discontinuity ends.
-            - start_value : float
-                The value at the start of the discontinuity.
-            - end_value : float
-                The value at the end of the discontinuity.
-            - delta : float
-                The difference between the end_value and start_value.
-            - type : str
-                The type of the discontinuity, either "gap" or "overlap".
-
-        """
-        (indices,) = np.nonzero(np.diff(self.tie_indices) == 1)
-        records = []
-        for index in indices:
-            start_index = self.tie_indices[index]
-            end_index = self.tie_indices[index + 1]
-            start_value = self.tie_values[index]
-            end_value = self.tie_values[index + 1]
-            record = {
-                "start_index": start_index,
-                "end_index": end_index,
-                "start_value": start_value,
-                "end_value": end_value,
-                "delta": end_value - start_value,
-                "type": ("gap" if end_value > start_value else "overlap"),
-            }
-            records.append(record)
-        return pd.DataFrame.from_records(records)
-
-    def get_availabilities(self):
-        """
-        Returns a DataFrame containing information about the data availability.
-
-        Returns
-        -------
-        pandas.DataFrame
-            A DataFrame with the following columns:
-
-            - start_index : int
-                The index where the data segment starts.
-            - end_index : int
-                The index where the data segment ends.
-            - start_value : float
-                The value at the start of the data segment.
-            - end_value : float
-                The value at the end of the data segment.
-            - delta : float
-                The difference between the end_value and start_value.
-            - type : str
-                The type of the segment, always "data".
-
-        """
-        if self.empty:
-            return pd.DataFrame(
-                columns=[
-                    "start_index",
-                    "end_index",
-                    "start_value",
-                    "end_value",
-                    "delta",
-                    "type",
-                ]
-            )
-        (cut_pos,) = np.nonzero(np.diff(self.tie_indices) == 1)
-        # start each segment after the previous cut (or at 0)
-        starts = np.concatenate(([0], cut_pos + 1))
-        # end each segment at the cut position (or at n-1 for the last)
-        ends = np.concatenate((cut_pos, [len(self.tie_indices) - 1]))
-        records = []
-        for s, e in zip(starts, ends):
-            start_index = self.tie_indices[s]
-            end_index = self.tie_indices[e]
-            start_value = self.tie_values[s]
-            end_value = self.tie_values[e]
-            records.append(
-                {
-                    "start_index": start_index,
-                    "end_index": end_index,
-                    "start_value": start_value,
-                    "end_value": end_value,
-                    "delta": end_value - start_value,
-                    "type": "data",
-                }
-            )
-        return pd.DataFrame.from_records(records)
-
-    @classmethod
-    def from_array(cls, arr, dim=None, tolerance=None):
-        return cls(
-            {"tie_indices": np.arange(len(arr)), "tie_values": arr}, dim
-        ).simplify(tolerance)
-
-    def to_dict(self):
-        tie_indices = self.data["tie_indices"]
-        tie_values = self.data["tie_values"]
-        if np.issubdtype(tie_values.dtype, np.datetime64):
-            tie_values = tie_values.astype(str)
-        data = {
-            "tie_indices": tie_indices.tolist(),
-            "tie_values": tie_values.tolist(),
-        }
-        return {"dim": self.dim, "data": data, "dtype": str(self.dtype)}
-
-
-def parse(data, dim=None):
-    if isinstance(data, tuple):
-        if dim is None:
-            dim, data = data
-        else:
-            _, data = data
-    if isinstance(data, Coordinate):
-        if dim is None:
-            dim = data.dim
-        data = data.data
-    return data, dim
-
-
-def get_sampling_interval(da, dim, cast=True):
-    """
-    Returns the sample spacing along a given dimension.
-
-    Parameters
-    ----------
-    da : DataArray
-        The data from which to extract the sample spacing.
-    dim : str
-        The dimension along which to get the sample spacing.
-    cast : bool, optional
-        Whether to cast datetime64 to seconds, by default True.
-
-    Returns
-    -------
-    float
-        The sample spacing.
- """ - if da.sizes[dim] < 2: - raise ValueError( - "cannot compute sample spacing on a dimension with less than 2 points" - ) - coord = da[dim] - if isinstance(coord, InterpCoordinate): - num = np.diff(coord.tie_values) - den = np.diff(coord.tie_indices) - mask = den != 1 - num = num[mask] - den = den[mask] - d = np.median(num / den) - else: - d = (coord[-1].values - coord[0].values) / (len(coord) - 1) - d = np.asarray(d) - if cast and np.issubdtype(d.dtype, np.timedelta64): - d = d / np.timedelta64(1, "s") - return d - - -def is_strictly_increasing(x): - if np.issubdtype(x.dtype, np.datetime64): - return np.all(np.diff(x) > np.timedelta64(0, "ns")) - else: - return np.all(np.diff(x) > 0) - - -def douglas_peucker(x, y, epsilon): - mask = np.ones(len(x), dtype=bool) - stack = [(0, len(x))] - while stack: - start, stop = stack.pop() - ysimple = forward( - x[start:stop], - x[[start, stop - 1]], - y[[start, stop - 1]], - ) - d = np.abs(y[start:stop] - ysimple) - index = np.argmax(d) - dmax = d[index] - index += start - if dmax > epsilon: - stack.append([start, index + 1]) - stack.append([index, stop]) - else: - mask[start + 1 : stop - 1] = False - return x[mask], y[mask] - - -def format_datetime(x): - string = str(x) - if "." in string: - datetime, digits = string.split(".") - digits = digits[:3] - return ".".join([datetime, digits]) - else: - return string diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index 2dd762a6..5100cee4 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -1,6 +1,4 @@ import copy -import json -import re import warnings from functools import partial @@ -12,9 +10,9 @@ from dask.array import Array as DaskArray from numpy.lib.mixins import NDArrayOperatorsMixin -from ..dask.core import dumps, from_dict, loads, to_dict +from ..coordinates import Coordinates, get_sampling_interval +from ..dask.core import create_variable, from_dict, loads, to_dict from ..virtual import VirtualArray, VirtualSource, _to_human -from .coordinates import Coordinate, Coordinates, get_sampling_interval HANDLED_NUMPY_FUNCTIONS = {} HANDLED_METHODS = {} @@ -131,7 +129,8 @@ def __array__(self, dtype=None): return self.data.__array__(dtype) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - from .routines import broadcast_coords, broadcast_to # TODO: circular import + from .routines import broadcast_coords # TODO: circular import + from .routines import broadcast_to if not method == "__call__": return NotImplemented @@ -874,45 +873,31 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): """ if virtual is None: virtual = isinstance(self.data, (VirtualArray, DaskArray)) - ds = xr.Dataset(attrs={"Conventions": "CF-1.9"}) - mappings = [] - for name, coord in self.coords.items(): - if coord.isinterp(): - mappings.append(f"{name}: {name}_indices {name}_values") - tie_indices = coord.tie_indices - tie_values = ( - coord.tie_values.astype("M8[ns]") - if np.issubdtype(coord.tie_values.dtype, np.datetime64) - else coord.tie_values - ) - attrs = { - "interpolation_name": "linear", - "tie_points_mapping": f"{name}_points: {name}_indices {name}_values", - } - ds.update( - { - f"{name}_interpolation": ((), np.nan, attrs), - f"{name}_indices": (f"{name}_points", tie_indices), - f"{name}_values": (f"{name}_points", tie_values), - } - ) - else: - ds = ds.assign_coords( - {name: (coord.dim, coord.values) if coord.dim else coord.values} - ) - mapping = " ".join(mappings) - attrs = {} if self.attrs is None else self.attrs - attrs |= 
{"coordinate_interpolation": mapping} if mapping else attrs - name = "__values__" if self.name is None else self.name + + # initialize + dataset = xr.Dataset(attrs={"Conventions": "CF-1.9"}) + variable_attrs = {} if self.attrs is None else self.attrs + variable_name = "__values__" if self.name is None else self.name + + # prepare metadata + for coord in self.coords.values(): + dataset, variable_attrs = coord.to_dataset(dataset, variable_attrs) + + # write data with h5netcdf.File(fname, mode=mode) as file: + # group if group is not None and group not in file: file.create_group(group) file = file if group is None else file[group] + + # dims file.dimensions.update(self.sizes) + + # variable if not virtual: encoding = {} if encoding is None else encoding variable = file.create_variable( - name, + variable_name, self.dims, self.dtype, data=self.values, @@ -922,28 +907,24 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): if encoding is not None: raise ValueError("cannot use `encoding` with in virtual mode") if isinstance(self.data, VirtualArray): - self.data.to_dataset(file._h5group, name) - variable = file._variable_cls(file, name, self.dims) - file._variables[name] = variable - variable._attach_dim_scales() - variable._attach_coords() - variable._ensure_dim_id() - elif isinstance(self.data, DaskArray): - variable = file.create_variable( - name, - self.dims, - self.dtype, + variable = self.data.create_variable( + file, variable_name, self.dims, self.dtype ) - variable.attrs.update( - {"__dask_array__": np.frombuffer(dumps(self.data), "uint8")} + elif isinstance(self.data, DaskArray): + variable = create_variable( + self.data, file, variable_name, self.dims, self.dtype ) else: raise ValueError( "can only use `virtual=True` with a virtual array as data" ) - if attrs: - variable.attrs.update(attrs) - ds.to_netcdf(fname, mode="a", group=group, engine="h5netcdf") + + # attrs + if variable_attrs: + variable.attrs.update(variable_attrs) + + # write metadata + dataset.to_netcdf(fname, mode="a", group=group, engine="h5netcdf") @classmethod def from_netcdf(cls, fname, group=None): @@ -962,70 +943,54 @@ def from_netcdf(cls, fname, group=None): DataArray The openend data array. """ - with xr.open_dataset(fname, group=group, engine="h5netcdf") as ds: - if not ("Conventions" in ds.attrs and "CF" in ds.attrs["Conventions"]): + # read metadata + with xr.open_dataset( + fname, group=group, engine="h5netcdf", decode_timedelta=False + ) as dataset: + # check file format + if not ( + "Conventions" in dataset.attrs and "CF" in dataset.attrs["Conventions"] + ): raise TypeError( "file format not recognized. 
please provide the file format " "with the `engine` keyword argument" ) - if len(ds) == 1: - name, da = next(iter(ds.items())) - coords = { - name: ( - ( - coord.dims[0], - ( - coord.values.astype("U") - if coord.dtype == np.dtype("O") - else coord.values - ), - ) - if coord.dims - else coord.values - ) - for name, coord in da.coords.items() - } + + # identify the "main" data array + if len(dataset) == 1: + name = next(iter(dataset.keys())) else: - data_vars = [ - var - for var in ds.values() - if "coordinate_interpolation" in var.attrs - ] + data_vars = { + key: var + for key, var in dataset.items() + if any("coordinate" in attr for attr in var.attrs) + } if len(data_vars) == 1: - da = data_vars[0] + name = next(iter(data_vars.keys())) else: raise ValueError("several possible data arrays detected") - coords = { - name: ( - ( - coord.dims[0], - ( - coord.values.astype("U") - if coord.dtype == np.dtype("O") - else coord.values - ), - ) - if coord.dims - else coord.values - ) - for name, coord in da.coords.items() - } - mapping = da.attrs.pop("coordinate_interpolation") - matches = re.findall(r"(\w+): (\w+) (\w+)", mapping) - for match in matches: - dim, indices, values = match - data = {"tie_indices": ds[indices], "tie_values": ds[values]} - coords[dim] = Coordinate(data, dim) - with h5py.File(fname) as file: - if group: - file = file[group] - name = "__values__" if da.name is None else da.name - variable = file[name] - if "__dask_array__" in variable.attrs: - data = loads(da.attrs.pop("__dask_array__")) - else: - data = VirtualSource(file[name]) - return cls(data, coords, da.dims, da.name, None if da.attrs == {} else da.attrs) + + # read coordinates + coords = Coordinates.from_dataset(dataset, name) + + # read data + if "__dask_array__" in dataset[name].attrs: + data = loads(dataset[name].attrs.pop("__dask_array__")) + else: + with h5py.File(fname) as file: + if group: + file = file[group] + variable = file["__values__" if name is None else name] + data = VirtualSource(variable) + + # pack everything + return cls( + data, + coords, + dataset[name].dims, + name, + None if dataset[name].attrs == {} else dataset[name].attrs, + ) def to_dict(self): """Convert the DataArray to a dictionary.""" diff --git a/xdas/core/routines.py b/xdas/core/routines.py index bc9927f3..c3a75dcb 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -4,6 +4,7 @@ from collections import defaultdict from concurrent.futures import ProcessPoolExecutor, as_completed from glob import glob +from itertools import pairwise import numpy as np import pandas as pd @@ -11,8 +12,8 @@ import xarray as xr from tqdm import tqdm +from ..coordinates.core import Coordinates, get_sampling_interval from ..virtual import VirtualSource, VirtualStack -from .coordinates import Coordinates, InterpCoordinate, get_sampling_interval from .dataarray import DataArray from .datacollection import DataCollection, DataMapping, DataSequence @@ -612,9 +613,9 @@ def initialize(self, da): if self.dim in self.dims else da.coords.drop_coords(self.dim) ) - try: + if self.dim in da.coords: self.delta = get_sampling_interval(da, self.dim) - except (ValueError, KeyError): + else: self.delta = None self.dtype = da.dtype @@ -778,17 +779,9 @@ def split(da, indices_or_sections="discontinuities", dim="first", tolerance=None if isinstance(indices_or_sections, str) and ( indices_or_sections == "discontinuities" ): - if isinstance(da[dim], InterpCoordinate): - coord = da[dim].simplify(tolerance) - (points,) = np.nonzero(np.diff(coord.tie_indices, 
prepend=[0]) == 1) - div_points = [coord.tie_indices[point] for point in points] - div_points = [0] + div_points + [da.sizes[dim]] - else: - raise TypeError( - "discontinuities can only be found on dimension that have as type " - "`InterpCoordinate`." - ) - elif isinstance(indices_or_sections, int): + indices_or_sections = da[dim].get_split_indices(tolerance) + + if isinstance(indices_or_sections, int): nsamples = da.sizes[dim] nchunk = indices_or_sections if nchunk <= 0: @@ -799,12 +792,9 @@ def split(da, indices_or_sections="discontinuities", dim="first", tolerance=None chunks = extras * [chunk_size + 1] + (nchunk - extras) * [chunk_size] div_points = np.cumsum([0] + chunks, dtype=np.int64) else: - div_points = [0] + indices_or_sections + [da.sizes[dim]] + div_points = np.concatenate([[0], indices_or_sections, [da.sizes[dim]]]) return DataCollection( - [ - da.isel({dim: slice(div_points[idx], div_points[idx + 1])}) - for idx in range(len(div_points) - 1) - ] + [da.isel({dim: slice(start, stop)}) for start, stop in pairwise(div_points)] ) diff --git a/xdas/dask/__init__.py b/xdas/dask/__init__.py index 60e4af0a..4f612c92 100644 --- a/xdas/dask/__init__.py +++ b/xdas/dask/__init__.py @@ -1 +1 @@ -from .core import dumps, loads +from .core import create_variable, dumps, loads diff --git a/xdas/dask/core.py b/xdas/dask/core.py index 23972237..911145ad 100644 --- a/xdas/dask/core.py +++ b/xdas/dask/core.py @@ -1,8 +1,15 @@ +import numpy as np from dask.array import Array from . import serial +def create_variable(arr, file, name, dims=None, dtype=None): + variable = file.create_variable(name, dims, dtype) + variable.attrs.update({"__dask_array__": np.frombuffer(dumps(arr), "uint8")}) + return variable + + def dumps(arr): """Serialize a dask array.""" return serial.dumps(to_dict(arr)) diff --git a/xdas/fft.py b/xdas/fft.py index 5d6ca31d..bb47bb89 100644 --- a/xdas/fft.py +++ b/xdas/fft.py @@ -1,7 +1,7 @@ import numpy as np from .atoms.core import atomized -from .core.coordinates import get_sampling_interval +from .coordinates.core import get_sampling_interval from .core.dataarray import DataArray from .parallel import parallelize diff --git a/xdas/io/apsensing.py b/xdas/io/apsensing.py index be03a7f1..fca5089d 100644 --- a/xdas/io/apsensing.py +++ b/xdas/io/apsensing.py @@ -1,11 +1,14 @@ import h5py import numpy as np +from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from ..virtual import VirtualSource +from .core import parse_ctype -def read(fname): +def read(fname, ctype=None): + ctype = parse_ctype(ctype) with h5py.File(fname, "r") as file: t0 = file["Metadata"]["Timestamp"][()].item().decode() fs = file["DAQ"]["RepetitionFrequency"][()].item() @@ -18,6 +21,6 @@ def read(fname): t0 = np.datetime64(t0) dt = np.timedelta64(round(1e9 / fs), "ns") nt, nd = data.shape - t = {"tie_indices": [0, nt - 1], "tie_values": [t0, t0 + (nt - 1) * dt]} - d = {"tie_indices": [0, nd - 1], "tie_values": [0.0, (nd - 1) * dx]} - return DataArray(data, {"time": t, "distance": d}) + time = Coordinate[ctype["time"]].from_block(t0, nt, dt, dim="time") + distance = Coordinate[ctype["distance"]].from_block(0.0, nd, dx, dim="distance") + return DataArray(data, {"time": time, "distance": distance}) diff --git a/xdas/io/asn.py b/xdas/io/asn.py index 7dc42344..5ed5bb52 100644 --- a/xdas/io/asn.py +++ b/xdas/io/asn.py @@ -4,13 +4,14 @@ import numpy as np import zmq -from xdas.core.coordinates import get_sampling_interval - +from ..coordinates.core import Coordinate, get_sampling_interval 
from ..core.dataarray import DataArray from ..virtual import VirtualSource +from .core import parse_ctype -def read(fname): +def read(fname, ctype=None): + ctype = parse_ctype(ctype) with h5py.File(fname, "r") as file: header = file["header"] t0 = np.datetime64(round(header["time"][()] * 1e9), "ns") @@ -18,8 +19,8 @@ def read(fname): dx = header["dx"][()] * np.median(np.diff(header["channels"])) data = VirtualSource(file["data"]) nt, nx = data.shape - time = {"tie_indices": [0, nt - 1], "tie_values": [t0, t0 + (nt - 1) * dt]} - distance = {"tie_indices": [0, nx - 1], "tie_values": [0.0, (nx - 1) * dx]} + time = Coordinate[ctype["time"]].from_block(t0, nt, dt, dim="time") + distance = Coordinate[ctype["distance"]].from_block(0.0, nx, dx, dim="distance") return DataArray(data, {"time": time, "distance": distance}) @@ -106,7 +107,7 @@ def _update_header(self, message): roiTable = header["roiTable"][0] di = (roiTable["roiStart"] // roiTable["roiDec"]) * header["dx"] de = (roiTable["roiEnd"] // roiTable["roiDec"]) * header["dx"] - self.distance = { + self.distance = { # TODO: use from_block "tie_indices": [0, header["nChannels"] - 1], "tie_values": [di, de], } @@ -115,7 +116,7 @@ def _update_header(self, message): def _unpack(self, message): t0 = np.frombuffer(message[:8], "datetime64[ns]").reshape(()) data = np.frombuffer(message[8:], self.dtype).reshape(self.shape) - time = { + time = { # TODO: use from_block "tie_indices": [0, self.shape[0] - 1], "tie_values": [t0, t0 + (self.shape[0] - 1) * self.delta], } diff --git a/xdas/io/core.py b/xdas/io/core.py index a4171a76..806740af 100644 --- a/xdas/io/core.py +++ b/xdas/io/core.py @@ -18,3 +18,26 @@ def get_free_port(): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", 0)) return s.getsockname()[1] + + +def parse_ctype(ctype): + if ctype is None: + ctype = { + "time": "interpolated", + "distance": "interpolated", + } + elif isinstance(ctype, str): + ctype = { + "time": ctype, + "distance": ctype, + } + elif isinstance(ctype, dict): + ctype = { + "time": ctype.get("time", "interpolated"), + "distance": ctype.get("distance", "interpolated"), + } + else: + raise ValueError( + "ctype must be None, str, or dict with 'time' and/or 'distance' keys" + ) + return ctype diff --git a/xdas/io/febus.py b/xdas/io/febus.py index 3488352d..e1c2f84d 100644 --- a/xdas/io/febus.py +++ b/xdas/io/febus.py @@ -3,12 +3,14 @@ import h5py import numpy as np +from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from ..core.routines import concatenate from ..virtual import VirtualSource +from .core import parse_ctype -def read(fname, overlaps=None, offset=None): +def read(fname, overlaps=None, offset=None, ctype=None): """ Open a Febus file into a xdas DataArray object. @@ -40,6 +42,7 @@ def read(fname, overlaps=None, offset=None): A data array containing the data from the Febus file. 
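+
+    Notes
+    -----
+    The ``ctype`` argument selects the coordinate class used to build the
+    ``time`` and ``distance`` coordinates; it is normalized by ``parse_ctype``
+    and defaults to ``"interpolated"`` for both dimensions.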
""" + ctype = parse_ctype(ctype) with h5py.File(fname, "r") as file: (device_name,) = list(file.keys()) source = file[device_name]["Source1"] @@ -87,16 +90,19 @@ def read(fname, overlaps=None, offset=None): dt, dx = delta _, nt, nx = chunks.shape + dt = np.rint(1e6 * dt).astype("m8[us]").astype("m8[ns]") + dc = [] for t0, chunk in zip(times, chunks): - time = { - "tie_indices": [0, nt - 1], - "tie_values": np.rint(1e6 * np.array([t0, t0 + (nt - 1) * dt])) - .astype("M8[us]") - .astype("M8[ns]"), - } - distance = {"tie_indices": [0, nx - 1], "tie_values": [0.0, (nx - 1) * dx]} + + t0 = np.rint(1e6 * t0).astype("M8[us]").astype("M8[ns]") + time = Coordinate[ctype["time"]].from_block(t0, nt, dt, dim="time") + distance = Coordinate[ctype["distance"]].from_block(0.0, nx, dx, dim="distance") da = DataArray(chunk, {"time": time, "distance": distance}, name=name) dc.append(da) return concatenate(dc, "time") + + +def _to_datetime64(data): + return diff --git a/xdas/io/miniseed.py b/xdas/io/miniseed.py index 37e74610..375e25dc 100644 --- a/xdas/io/miniseed.py +++ b/xdas/io/miniseed.py @@ -2,19 +2,19 @@ import numpy as np import obspy -from ..core.coordinates import Coordinate, Coordinates +from ..coordinates.core import Coordinate, Coordinates from ..core.dataarray import DataArray -def read(fname, ignore_last_sample=False): - shape, dtype, coords, method = read_header(fname, ignore_last_sample) +def read(fname, ignore_last_sample=False, ctype="interpolated"): + shape, dtype, coords, method = read_header(fname, ignore_last_sample, ctype) data = dask.array.from_delayed( dask.delayed(read_data)(fname, method, ignore_last_sample), shape, dtype ) return DataArray(data, coords) -def read_header(path, ignore_last_sample): +def read_header(path, ignore_last_sample, ctype): st = obspy.read(path, headonly=True) dtype = uniquifiy(tr.data.dtype for tr in st) @@ -33,16 +33,20 @@ def read_header(path, ignore_last_sample): tmp_st = st.select(channel=channels[0]) for n, tr in enumerate(tmp_st): if n == 0: - time = get_time_coord(tr, ignore_last_sample=False) + time = get_time_coord(tr, ignore_last_sample=False, ctype=ctype) elif n == len(tmp_st) - 1: - time = time.append(get_time_coord(tr, ignore_last_sample)) + time = time.append(get_time_coord(tr, ignore_last_sample, ctype=ctype)) else: - time = time.append(get_time_coord(tr, ignore_last_sample=False)) + time = time.append( + get_time_coord(tr, ignore_last_sample=False, ctype=ctype) + ) else: method = "synchronized" - time = get_time_coord(st[0], ignore_last_sample) + time = get_time_coord(st[0], ignore_last_sample, ctype) - if not all(get_time_coord(tr, ignore_last_sample).equals(time) for tr in st): + if not all( + get_time_coord(tr, ignore_last_sample, ctype).equals(time) for tr in st + ): raise ValueError("All traces must be synchronized") network = uniquifiy(tr.stats.network for tr in st) @@ -85,27 +89,11 @@ def read_data(path, method, ignore_last_sample): return np.array(data) -def get_time_coord(tr, ignore_last_sample): - if ignore_last_sample: - return Coordinate( - { - "tie_indices": [0, tr.stats.npts - 2], - "tie_values": [ - np.datetime64(tr.stats.starttime), - np.datetime64(tr.stats.endtime - tr.stats.delta), - ], - } - ) - else: - return Coordinate( - { - "tie_indices": [0, tr.stats.npts - 1], - "tie_values": [ - np.datetime64(tr.stats.starttime), - np.datetime64(tr.stats.endtime), - ], - } - ) +def get_time_coord(tr, ignore_last_sample, ctype): + t0 = np.datetime64(tr.stats.starttime) + dt = np.rint(1e6 * 
tr.stats.delta).astype("m8[us]").astype("m8[ns]") + nt = tr.stats.npts - int(ignore_last_sample) + return Coordinate[ctype].from_block(t0, nt, dt, dim="time") def uniquifiy(seq): diff --git a/xdas/io/optasense.py b/xdas/io/optasense.py index 97d92084..f4f34bf7 100644 --- a/xdas/io/optasense.py +++ b/xdas/io/optasense.py @@ -1,11 +1,14 @@ import h5py import numpy as np +from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from ..virtual import VirtualSource +from .core import parse_ctype -def read(fname): +def read(fname, ctype=None): + ctype = parse_ctype(ctype) with h5py.File(fname, "r") as file: acquisition = file["Acquisition"] dx = acquisition.attrs["SpatialSamplingInterval"] @@ -14,6 +17,9 @@ def read(fname): tend = np.datetime64(rawdata.attrs["PartEndTime"][:-1]) data = VirtualSource(rawdata) nd, nt = data.shape - time = {"tie_indices": [0, nt - 1], "tie_values": [tstart, tend]} - distance = {"tie_indices": [0, nd - 1], "tie_values": [0.0, (nd - 1) * dx]} + time = { + "tie_indices": [0, nt - 1], + "tie_values": [tstart, tend], + } # TODO: use from_block + distance = Coordinate[ctype["distance"]].from_block(0.0, nd, dx, dim="distance") return DataArray(data, {"distance": distance, "time": time}) diff --git a/xdas/io/silixa.py b/xdas/io/silixa.py index f7955834..2c64e688 100644 --- a/xdas/io/silixa.py +++ b/xdas/io/silixa.py @@ -1,31 +1,31 @@ import dask import numpy as np +from ..coordinates.core import Coordinate from ..core.dataarray import DataArray +from .core import parse_ctype from .tdms import TdmsReader -def read(fname): - shape, dtype, coords = read_header(fname) +def read(fname, ctype=None): + ctype = parse_ctype(ctype) + shape, dtype, coords = read_header(fname, ctype) data = dask.array.from_delayed(dask.delayed(read_data)(fname), shape, dtype) return DataArray(data, coords) -def read_header(fname): +def read_header(fname, ctype): with TdmsReader(fname) as tdms: props = tdms.get_properties() shape = tdms.channel_length, tdms.fileinfo["n_channels"] dtype = tdms._data_type t0 = np.datetime64(props["GPSTimeStamp"]) dt = np.timedelta64(round(1e9 / props["SamplingFrequency[Hz]"]), "ns") - time = { - "tie_indices": [0, shape[0] - 1], - "tie_values": [t0, t0 + dt * (shape[0] - 1)], - } + time = Coordinate[ctype["time"]].from_block(t0, shape[0], dt, dim="time") distance = { "tie_indices": [0, shape[1] - 1], "tie_values": [props["Start Distance (m)"], props["Stop Distance (m)"]], - } + } # TODO: use from_block coords = {"time": time, "distance": distance} return shape, dtype, coords diff --git a/xdas/io/sintela.py b/xdas/io/sintela.py index ba6fc865..60902824 100644 --- a/xdas/io/sintela.py +++ b/xdas/io/sintela.py @@ -1,11 +1,14 @@ import h5py import numpy as np +from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from ..virtual import VirtualSource +from .core import parse_ctype -def read(fname): +def read(fname, ctype=None): + ctype = parse_ctype(ctype) with h5py.File(fname, "r") as file: acquisition = file["Acquisition"] dx = acquisition.attrs["SpatialSamplingInterval"] @@ -14,6 +17,9 @@ def read(fname): tend = np.datetime64(rawdata.attrs["PartEndTime"].decode().split("+")[0]) data = VirtualSource(rawdata) nt, nd = data.shape - time = {"tie_indices": [0, nt - 1], "tie_values": [tstart, tend]} - distance = {"tie_indices": [0, nd - 1], "tie_values": [0.0, (nd - 1) * dx]} + time = { + "tie_indices": [0, nt - 1], + "tie_values": [tstart, tend], + } # TODO: use from_block + distance = 
Coordinate[ctype["distance"]].from_block(0.0, nd, dx, dim="distance") return DataArray(data, {"time": time, "distance": distance}) diff --git a/xdas/io/terra15.py b/xdas/io/terra15.py index 7c969acf..e9a539e4 100644 --- a/xdas/io/terra15.py +++ b/xdas/io/terra15.py @@ -3,11 +3,14 @@ import h5py import numpy as np +from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from ..virtual import VirtualSource +from .core import parse_ctype -def read(fname, tz=timezone.utc): +def read(fname, ctype=None, tz=timezone.utc): + ctype = parse_ctype(ctype) with h5py.File(fname, "r") as file: ti = np.datetime64( datetime.fromtimestamp(file["data_product"]["gps_time"][0], tz=tz) @@ -19,6 +22,6 @@ def read(fname, tz=timezone.utc): dx = file.attrs["dx"] data = VirtualSource(file["data_product"]["data"]) nt, nd = data.shape - t = {"tie_indices": [0, nt - 1], "tie_values": [ti, tf]} - d = {"tie_indices": [0, nd - 1], "tie_values": [d0, d0 + (nd - 1) * dx]} - return DataArray(data, {"time": t, "distance": d}) + time = {"tie_indices": [0, nt - 1], "tie_values": [ti, tf]} # TODO: use from_block + distance = Coordinate[ctype["distance"]].from_block(d0, nd, dx, dim="distance") + return DataArray(data, {"time": time, "distance": distance}) diff --git a/xdas/processing/__init__.py b/xdas/processing/__init__.py index b8ffae40..2cbe1d76 100644 --- a/xdas/processing/__init__.py +++ b/xdas/processing/__init__.py @@ -1,4 +1,3 @@ -from . import monitor from .core import ( DataArrayLoader, DataArrayWriter, diff --git a/xdas/signal.py b/xdas/signal.py index 5d155527..fefb4d53 100644 --- a/xdas/signal.py +++ b/xdas/signal.py @@ -2,7 +2,7 @@ import scipy.signal as sp from .atoms.core import atomized -from .core.coordinates import Coordinate, get_sampling_interval +from .coordinates.core import Coordinate, get_sampling_interval from .core.dataarray import DataArray from .parallel import parallelize from .spectral import stft diff --git a/xdas/spectral.py b/xdas/spectral.py index 44485ad3..c4a6e279 100644 --- a/xdas/spectral.py +++ b/xdas/spectral.py @@ -2,7 +2,7 @@ from scipy.fft import fft, fftfreq, fftshift, rfft, rfftfreq from scipy.signal import get_window -from .core.coordinates import get_sampling_interval +from .coordinates.core import get_sampling_interval from .core.dataarray import DataArray from .parallel import parallelize diff --git a/xdas/trigger.py b/xdas/trigger.py index b0a62eed..02142b6d 100644 --- a/xdas/trigger.py +++ b/xdas/trigger.py @@ -3,7 +3,7 @@ from numba import njit from .atoms.core import Atom, State, atomized -from .core.coordinates import Coordinate +from .coordinates.core import Coordinate class Trigger(Atom): diff --git a/xdas/virtual.py b/xdas/virtual.py index e5ce9aa2..92aba6b4 100644 --- a/xdas/virtual.py +++ b/xdas/virtual.py @@ -49,6 +49,15 @@ def nbytes(self): else: return 0 + def create_variable(self, file, name, dims=None, dtype=None): + self.to_dataset(file._h5group, name) + variable = file._variable_cls(file, name, dims) + file._variables[name] = variable + variable._attach_dim_scales() + variable._attach_coords() + variable._ensure_dim_id() + return variable + class VirtualStack(VirtualArray): def __init__(self, sources=[], axis=0):