VirtualZarr with Kerchunk for RiOMar data#

Context#

Purpose#

The goal is to create a virtualzarr for all RiOMar data using Kerchunk, since Icechunk does not currently work on Pangeo-EOSC or for data hosted on Datarmor (HTTPS access).

Description#

In this notebook, we will:

  • List all the RiOMar data available online on Datarmor

  • Create a virtualzarr of the RiOMar data

  • Save it as Kerchunk references in Parquet format

Contributions#

Notebook#

  • Justus Magin (author), CNRS-LOPS (France), @keewis

Bibliography and other interesting resources#

from functools import partial

import fsspec
import virtualizarr
import xarray as xr

# Root directory of the RiOMar GAMAR NetCDF files on the HTTP data server.
inroot = "https://data-fair2adapt.ifremer.fr/riomar/GAMAR"

# Glob over plain HTTP to collect the URL of every `.nc` file in that directory.
fs = fsspec.filesystem("http")
urls = fs.glob(f"{inroot}/*.nc")
import distributed

# Start a local Dask cluster with 24 workers and obtain a client connected to
# it; the client is used further down to open the input files in parallel.
cluster = distributed.LocalCluster(n_workers=24)
client = cluster.get_client()
# Bare expression: shows the client summary when run as a notebook cell.
client
# Variables passed as `loadable_variables` for every input file: the time
# axes, the horizontal coordinate vectors and the bounds axis.
eager_variables = [
    "time_counter",
    "time_instant",
    "x_rho",
    "y_rho",
    "x_u",
    "x_v",
    "y_u",
    "y_v",
    "axis_nbounds",
]

# Pre-bind every option shared by all files so the opener only needs the URL
# as its single positional argument and can be mapped over `urls` directly.
# `indexes={}` requests that no in-memory indexes be built.
func = partial(
    virtualizarr.open_virtual_dataset,
    loadable_variables=eager_variables,
    indexes={},
    decode_times=True,
    backend=virtualizarr.readers.hdf.HDFVirtualBackend,
)

# Submit one task per URL to the Dask cluster, then collect the resulting
# virtual datasets (in the same order as `urls`).
futures = client.map(func, urls)
dss = client.gather(futures)
# The grid description is stored in its own file; open it virtually and
# request only the rho-point longitude/latitude as loadable variables.
grid_url = "https://data-fair2adapt.ifremer.fr/riomar/misc/croco_grd_hdf5.nc"
grid = virtualizarr.open_virtual_dataset(
    grid_url,
    filetype="netcdf4",
    loadable_variables=["lon_rho", "lat_rho"],
    indexes={},
)
# Bare expression: displays the dataset when run as a notebook cell.
grid
# Step 1: stack the per-file virtual datasets along the time axis.
# `compat="override"` / `coords="minimal"` avoid comparing variables across
# files, and attributes that conflict between files are dropped.
stacked = xr.concat(
    dss,
    dim="time_counter",
    compat="override",
    coords="minimal",
    combine_attrs="drop_conflicts",
)

# Step 2: mark the time-bounds variables as coordinates rather than data.
with_bounds = stacked.set_coords(["time_counter_bounds", "time_instant_bounds"])

# Step 3: keep the existing nav_lon_rho / nav_lat_rho variables (dims, attrs)
# but swap their values for the lon_rho / lat_rho arrays of the grid file.
ds = with_bounds.assign_coords(
    nav_lon_rho=with_bounds["nav_lon_rho"].copy(data=grid["lon_rho"].data),
    nav_lat_rho=with_bounds["nav_lat_rho"].copy(data=grid["lat_rho"].data),
)
# Bare expression: displays the combined dataset when run as a notebook cell.
ds
# Write the combined virtual dataset to "riomar.parquet" as Kerchunk
# references in Parquet format.
ds.virtualize.to_kerchunk("riomar.parquet", format="parquet")
# Bare expression: displays the accessor object when run as a notebook cell.
ds.virtualize
# Round-trip check: reopen the reference store through xarray's "kerchunk"
# engine; `chunks={}` asks for dask-backed (lazy) arrays.
reopened = xr.open_dataset("riomar.parquet", engine="kerchunk", chunks={})
reopened
# Count how many nav_lat_rho cells equal -1 in the reopened data.
# NOTE(review): presumably -1 is a placeholder value in the original files and
# a count of 0 confirms the grid latitudes were substituted — confirm.
(reopened["nav_lat_rho"] == -1).sum().compute()
# Variables requested as `loadable_variables` when re-opening the saved
# references (same set as used for the original NetCDF files).
roundtrip_loadable = [
    "time_counter",
    "time_instant",
    "x_rho",
    "y_rho",
    "x_u",
    "x_v",
    "y_u",
    "y_v",
    "axis_nbounds",
]

# Open the Kerchunk Parquet store as a virtual dataset again, this time via
# VirtualiZarr's own kerchunk reader; left as the final expression so the
# result is displayed when run as a notebook cell.
virtualizarr.open_virtual_dataset(
    "riomar.parquet",
    filetype="kerchunk",
    loadable_variables=roundtrip_loadable,
    indexes={},
)