Xarray

Cubed can work with Xarray datasets via the cubed-xarray package.

Install by running the following:

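pip install cubed cubed-xarray xarray pooch netCDF4 rich

Note that pooch and netCDF4 are needed to access the Xarray tutorial datasets that we use in the example below, and rich is needed by Cubed's diagnostics module, which is imported whenever a Cubed array is rendered as HTML in a notebook.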

Open dataset

Start by importing Xarray - note that we don’t need to import Cubed or cubed-xarray, since they will be picked up automatically.

import xarray as xr

xr.set_options(display_expand_attrs=False, display_expand_data=True);

We open an Xarray dataset (in netCDF format) using the usual open_dataset function. By specifying chunks={} we ensure that the dataset is chunked using the on-disk chunking (here it is the netCDF file chunking). The chunked_array_type argument specifies which chunked array type to use - Cubed in this case.

ds = xr.tutorial.open_dataset(
    "air_temperature", chunked_array_type="cubed", chunks={}
)
ds
/opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/IPython/core/formatters.py:406, in BaseFormatter.__call__(self, obj)
    404     method = get_real_method(obj, self.print_method)
    405     if method is not None:
--> 406         return method()
    407     return None
    408 else:

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/dataset.py:2417, in Dataset._repr_html_(self)
   2415 if OPTIONS["display_style"] == "text":
   2416     return f"<pre>{escape(repr(self))}</pre>"
-> 2417 return formatting_html.dataset_repr(self)

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:379, in dataset_repr(ds)
    376 if ds.coords:
    377     sections.append(coord_section(ds.coords))
--> 379 sections.append(datavar_section(ds.data_vars))
    381 display_default_indexes = _get_boolean_with_default(
    382     "display_default_indexes", False
    383 )
    384 xindexes = filter_nondefault_indexes(
    385     _get_indexes_dict(ds.xindexes), not display_default_indexes
    386 )

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:225, in _mapping_section(mapping, name, details_func, max_items_collapse, expand_option_name, enabled, **kwargs)
    218 expanded = max_items_collapse is None or _get_boolean_with_default(
    219     expand_option_name, n_items < max_items_collapse
    220 )
    221 collapsed = not expanded
    223 return collapsible_section(
    224     f"{name}:",
--> 225     details=details_func(mapping, **kwargs),
    226     n_items=n_items,
    227     enabled=enabled,
    228     collapsed=collapsed,
    229 )

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:136, in summarize_vars(variables)
    135 def summarize_vars(variables) -> str:
--> 136     vars_li = "".join(
    137         f"<li class='xr-var-item'>{summarize_variable(k, v)}</li>"
    138         for k, v in variables.items()
    139     )
    141     return f"<ul class='xr-var-list'>{vars_li}</ul>"

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:137, in <genexpr>(.0)
    135 def summarize_vars(variables) -> str:
    136     vars_li = "".join(
--> 137         f"<li class='xr-var-item'>{summarize_variable(k, v)}</li>"
    138         for k, v in variables.items()
    139     )
    141     return f"<ul class='xr-var-list'>{vars_li}</ul>"

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:98, in summarize_variable(name, var, is_index, dtype)
     96 preview = escape(inline_variable_array_repr(variable, 35))
     97 attrs_ul = summarize_attrs(var.attrs)
---> 98 data_repr = short_data_repr_html(variable)
    100 attrs_icon = _icon("icon-file-text2")
    101 data_icon = _icon("icon-database")

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:45, in short_data_repr_html(array)
     43 internal_data = getattr(array, "variable", array)._data
     44 if hasattr(internal_data, "_repr_html_"):
---> 45     return internal_data._repr_html_()
     46 text = escape(short_data_repr(array))
     47 return f"<pre>{text}</pre>"

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/cubed/array_api/array_object.py:50, in Array._repr_html_(self)
     49 def _repr_html_(self):
---> 50     from cubed.diagnostics.widgets import get_template
     52     try:
     53         grid = self.to_svg(size=ARRAY_SVG_SIZE)

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/cubed/diagnostics/__init__.py:1
----> 1 from .rich import RichProgressBar as ProgressBar
      3 __all__ = ["ProgressBar"]

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/cubed/diagnostics/rich.py:6
      3 import time
      4 from contextlib import contextmanager
----> 6 from rich.console import RenderableType
      7 from rich.progress import (
      8     BarColumn,
      9     MofNCompleteColumn,
   (...)     15     TimeElapsedColumn,
     16 )
     17 from rich.text import Text

ModuleNotFoundError: No module named 'rich'
<xarray.Dataset> Size: 31MB
Dimensions:  (time: 2920, lat: 25, lon: 53)
Coordinates:
  * time     (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00
  * lat      (lat) float32 100B 75.0 72.5 70.0 67.5 65.0 ... 22.5 20.0 17.5 15.0
  * lon      (lon) float32 212B 200.0 202.5 205.0 207.5 ... 325.0 327.5 330.0
Data variables:
    air      (time, lat, lon) float64 31MB cubed.Array<chunksize=(2920, 25, 53)>
Attributes: (5)

Notice that the air data variable is a cubed.Array. Since Cubed has a lazy computation model, this array is not loaded from disk until a computation is run.

Convert to Zarr

We can use Cubed to convert the dataset to Zarr format by calling to_zarr on the dataset:

ds.to_zarr("air_temperature_cubed.zarr", mode="w", consolidated=True);
/tmp/ipykernel_2498/172254778.py:1: SerializationWarning: saving variable None with floating point data as an integer dtype without any _FillValue to use for NaNs
  ds.to_zarr("air_temperature_cubed.zarr", mode="w", consolidated=True);
/opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/zarr/api/asynchronous.py:247: ZarrUserWarning: Consolidated metadata is currently not part in the Zarr format 3 specification. It may not be supported by other zarr implementations and may change in the future.
  warnings.warn(

This will run a computation that loads the input data and writes it out to a Zarr store on the local filesystem.

Compute the mean

We can also use Xarray’s API to run computations on the dataset using Cubed. Here we find the mean air temperature over time, for each location:

mean = ds.air.mean("time")
mean
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/IPython/core/formatters.py:406, in BaseFormatter.__call__(self, obj)
    404     method = get_real_method(obj, self.print_method)
    405     if method is not None:
--> 406         return method()
    407     return None
    408 else:

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/common.py:189, in AbstractArray._repr_html_(self)
    187 if OPTIONS["display_style"] == "text":
    188     return f"<pre>{escape(repr(self))}</pre>"
--> 189 return formatting_html.array_repr(self)

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:344, in array_repr(arr)
    336 arr_name = escape(repr(arr.name)) if getattr(arr, "name", None) else ""
    338 header_components = [
    339     f"<div class='xr-obj-type'>{obj_type}</div>",
    340     f"<div class='xr-obj-name'>{arr_name}</div>",
    341     format_dims(dims, indexed_dims),
    342 ]
--> 344 sections = [array_section(arr)]
    346 if hasattr(arr, "coords"):
    347     if arr.coords:

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:250, in array_section(obj)
    248 variable = getattr(obj, "variable", obj)
    249 preview = escape(inline_variable_array_repr(variable, max_width=70))
--> 250 data_repr = short_data_repr_html(obj)
    251 data_icon = _icon("icon-database")
    253 return (
    254     "<div class='xr-array-wrap'>"
    255     f"<input id='{data_id}' class='xr-array-in' type='checkbox' {collapsed}>"
   (...)    259     "</div>"
    260 )

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:45, in short_data_repr_html(array)
     43 internal_data = getattr(array, "variable", array)._data
     44 if hasattr(internal_data, "_repr_html_"):
---> 45     return internal_data._repr_html_()
     46 text = escape(short_data_repr(array))
     47 return f"<pre>{text}</pre>"

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/cubed/array_api/array_object.py:50, in Array._repr_html_(self)
     49 def _repr_html_(self):
---> 50     from cubed.diagnostics.widgets import get_template
     52     try:
     53         grid = self.to_svg(size=ARRAY_SVG_SIZE)

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/cubed/diagnostics/__init__.py:1
----> 1 from .rich import RichProgressBar as ProgressBar
      3 __all__ = ["ProgressBar"]

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/cubed/diagnostics/rich.py:6
      3 import time
      4 from contextlib import contextmanager
----> 6 from rich.console import RenderableType
      7 from rich.progress import (
      8     BarColumn,
      9     MofNCompleteColumn,
   (...)     15     TimeElapsedColumn,
     16 )
     17 from rich.text import Text

ModuleNotFoundError: No module named 'rich'
<xarray.DataArray 'air' (lat: 25, lon: 53)> Size: 11kB
cubed.Array<array-012, shape=(25, 53), dtype=float64, chunks=((25,), (53,))>
Coordinates:
  * lat      (lat) float32 100B 75.0 72.5 70.0 67.5 65.0 ... 22.5 20.0 17.5 15.0
  * lon      (lon) float32 212B 200.0 202.5 205.0 207.5 ... 325.0 327.5 330.0
Attributes: (11)

To run the computation we need to call compute:

mean.compute()
<xarray.DataArray 'air' (lat: 25, lon: 53)> Size: 11kB
array([[260.37644178, 260.18305137, 259.88662671, ..., 250.81590068,
        251.93811644, 253.43804795],
       [262.73439384, 262.79397603, 262.74933904, ..., 249.75590411,
        251.58575685, 254.35926027],
       [264.7687637 , 264.32730822, 264.06169521, ..., 250.60789041,
        253.58351027, 257.71559932],
       ...,
       [297.64986301, 296.95333219, 296.62931507, ..., 296.81092466,
        296.28796233, 295.81645548],
       [298.12920205, 297.93700685, 297.47039384, ..., 296.85954795,
        296.7770274 , 296.44383562],
       [298.36615068, 298.38573973, 298.11414384, ..., 297.33820548,
        297.28144521, 297.30510274]], shape=(25, 53))
Coordinates:
  * lat      (lat) float32 100B 75.0 72.5 70.0 67.5 65.0 ... 22.5 20.0 17.5 15.0
  * lon      (lon) float32 212B 200.0 202.5 205.0 207.5 ... 325.0 327.5 330.0
Attributes: (11)

This is fine for outputs that fit in memory, like the example here, but sometimes we want to write the output of the computation to Zarr, which we do by calling to_zarr on the DataArray instead of compute:

mean.to_zarr("mean_air_temperature.zarr", mode="w", consolidated=True);
/opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/zarr/api/asynchronous.py:247: ZarrUserWarning: Consolidated metadata is currently not part in the Zarr format 3 specification. It may not be supported by other zarr implementations and may change in the future.
  warnings.warn(

We can check that the Zarr store was created by loading it from disk using xarray.open_dataset:

xr.open_dataset(
    "mean_air_temperature.zarr", chunked_array_type="cubed", chunks={}
)
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/IPython/core/formatters.py:406, in BaseFormatter.__call__(self, obj)
    404     method = get_real_method(obj, self.print_method)
    405     if method is not None:
--> 406         return method()
    407     return None
    408 else:

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/dataset.py:2417, in Dataset._repr_html_(self)
   2415 if OPTIONS["display_style"] == "text":
   2416     return f"<pre>{escape(repr(self))}</pre>"
-> 2417 return formatting_html.dataset_repr(self)

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:379, in dataset_repr(ds)
    376 if ds.coords:
    377     sections.append(coord_section(ds.coords))
--> 379 sections.append(datavar_section(ds.data_vars))
    381 display_default_indexes = _get_boolean_with_default(
    382     "display_default_indexes", False
    383 )
    384 xindexes = filter_nondefault_indexes(
    385     _get_indexes_dict(ds.xindexes), not display_default_indexes
    386 )

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:225, in _mapping_section(mapping, name, details_func, max_items_collapse, expand_option_name, enabled, **kwargs)
    218 expanded = max_items_collapse is None or _get_boolean_with_default(
    219     expand_option_name, n_items < max_items_collapse
    220 )
    221 collapsed = not expanded
    223 return collapsible_section(
    224     f"{name}:",
--> 225     details=details_func(mapping, **kwargs),
    226     n_items=n_items,
    227     enabled=enabled,
    228     collapsed=collapsed,
    229 )

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:136, in summarize_vars(variables)
    135 def summarize_vars(variables) -> str:
--> 136     vars_li = "".join(
    137         f"<li class='xr-var-item'>{summarize_variable(k, v)}</li>"
    138         for k, v in variables.items()
    139     )
    141     return f"<ul class='xr-var-list'>{vars_li}</ul>"

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:137, in <genexpr>(.0)
    135 def summarize_vars(variables) -> str:
    136     vars_li = "".join(
--> 137         f"<li class='xr-var-item'>{summarize_variable(k, v)}</li>"
    138         for k, v in variables.items()
    139     )
    141     return f"<ul class='xr-var-list'>{vars_li}</ul>"

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:98, in summarize_variable(name, var, is_index, dtype)
     96 preview = escape(inline_variable_array_repr(variable, 35))
     97 attrs_ul = summarize_attrs(var.attrs)
---> 98 data_repr = short_data_repr_html(variable)
    100 attrs_icon = _icon("icon-file-text2")
    101 data_icon = _icon("icon-database")

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/xarray/core/formatting_html.py:45, in short_data_repr_html(array)
     43 internal_data = getattr(array, "variable", array)._data
     44 if hasattr(internal_data, "_repr_html_"):
---> 45     return internal_data._repr_html_()
     46 text = escape(short_data_repr(array))
     47 return f"<pre>{text}</pre>"

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/cubed/array_api/array_object.py:50, in Array._repr_html_(self)
     49 def _repr_html_(self):
---> 50     from cubed.diagnostics.widgets import get_template
     52     try:
     53         grid = self.to_svg(size=ARRAY_SVG_SIZE)

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/cubed/diagnostics/__init__.py:1
----> 1 from .rich import RichProgressBar as ProgressBar
      3 __all__ = ["ProgressBar"]

File /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages/cubed/diagnostics/rich.py:6
      3 import time
      4 from contextlib import contextmanager
----> 6 from rich.console import RenderableType
      7 from rich.progress import (
      8     BarColumn,
      9     MofNCompleteColumn,
   (...)     15     TimeElapsedColumn,
     16 )
     17 from rich.text import Text

ModuleNotFoundError: No module named 'rich'
<xarray.Dataset> Size: 11kB
Dimensions:  (lat: 25, lon: 53)
Coordinates:
  * lat      (lat) float32 100B 75.0 72.5 70.0 67.5 65.0 ... 22.5 20.0 17.5 15.0
  * lon      (lon) float32 212B 200.0 202.5 205.0 207.5 ... 325.0 327.5 330.0
Data variables:
    air      (lat, lon) float64 11kB cubed.Array<chunksize=(25, 53)>