VirtualiZarr with Kerchunk for RiOMar data#
Context#
Purpose#
The goal is to create a virtual Zarr dataset (with VirtualiZarr) for all RiOMar data using Kerchunk, since Icechunk does not currently work on Pangeo-EOSC or for data on Datarmor accessed over HTTPS.
Description#
In this notebook, we will:
List all the RiOMar data available online on Datarmor
Create a virtual Zarr dataset of the RiOMar data with VirtualiZarr
Save it as Kerchunk references in Parquet format
Contributions#
Notebook#
Justus Magin (author), CNRS-LOPS (France), @keewis
Bibliography and other interesting resources#
from functools import partial
import fsspec
import virtualizarr
import xarray as xr
# Root URL under which the RiOMar GAMAR netCDF files are published.
inroot = "https://data-fair2adapt.ifremer.fr/riomar/GAMAR"

# fsspec's HTTP filesystem is used to list the remote directory.
fs = fsspec.filesystem("http")

# Every ``*.nc`` entry directly under the root becomes one input URL.
urls = fs.glob(f"{inroot}/*.nc")
import distributed
# Start a local dask cluster with 24 worker processes; the per-file opening
# further down is fanned out over these workers.
cluster = distributed.LocalCluster(n_workers=24)
# Client connected to the cluster above; used later to submit and gather tasks.
client = cluster.get_client()
# Bare expression: in a notebook cell this displays the client summary.
client
# Variables passed as ``loadable_variables`` when opening each file.
eager_variables = [
    "time_counter",
    "time_instant",
    "x_rho",
    "y_rho",
    "x_u",
    "x_v",
    "y_u",
    "y_v",
    "axis_nbounds",
]

# Keyword arguments shared by every per-file ``open_virtual_dataset`` call.
open_options = {
    "backend": virtualizarr.readers.hdf.HDFVirtualBackend,
    "indexes": {},
    "loadable_variables": eager_variables,
    "decode_times": True,
}

# Single-argument callable (takes the file URL) so it can be mapped over the
# list of URLs.
func = partial(virtualizarr.open_virtual_dataset, **open_options)
# Submit one ``func(url)`` task per input URL to the dask cluster.
futures = client.map(func, urls)
# Block until all tasks finish and collect the per-file results, in the same
# order as ``urls``.
dss = client.gather(futures)
# Separate grid file; only its longitude/latitude arrays are loaded here.
grid_url = "https://data-fair2adapt.ifremer.fr/riomar/misc/croco_grd_hdf5.nc"
grid_variables = ["lon_rho", "lat_rho"]

grid = virtualizarr.open_virtual_dataset(
    grid_url,
    filetype="netcdf4",
    indexes={},
    loadable_variables=grid_variables,
)
# Bare expression: in a notebook cell this displays the grid dataset.
grid
# Step 1: stitch the per-file virtual datasets together along time.
concatenated = xr.concat(
    dss,
    dim="time_counter",
    compat="override",
    coords="minimal",
    combine_attrs="drop_conflicts",
)

# Step 2: promote the time-bounds variables from data variables to coordinates.
with_bounds = concatenated.set_coords(["time_counter_bounds", "time_instant_bounds"])

# Step 3: replace the values of the navigation lon/lat variables with the
# arrays from the grid file while keeping each variable's own dims and
# attributes (``copy(data=...)``), and register them as coordinates.
ds = with_bounds.assign_coords(
    nav_lon_rho=lambda merged: merged["nav_lon_rho"].copy(data=grid["lon_rho"].data),
    nav_lat_rho=lambda merged: merged["nav_lat_rho"].copy(data=grid["lat_rho"].data),
)
# Bare expression: in a notebook cell this displays the combined dataset.
ds
# Write the combined virtual dataset as kerchunk references in parquet format
# to ``riomar.parquet`` (relative to the current working directory).
ds.virtualize.to_kerchunk("riomar.parquet", format="parquet")
# Bare expression: in a notebook cell this only displays the accessor object.
ds.virtualize
# Re-open the reference store just written through xarray's kerchunk engine;
# ``chunks={}`` makes the variables dask-backed (lazy).
reopened = xr.open_dataset("riomar.parquet", engine="kerchunk", chunks={})
# Bare expression: in a notebook cell this displays the reopened dataset.
reopened
# Sanity check: count how many ``nav_lat_rho`` values are exactly -1.
# NOTE(review): presumably -1 is a placeholder value in the original files, so
# a count of 0 would confirm the grid latitudes were written — confirm.
(reopened["nav_lat_rho"] == -1).sum().compute()
# Variables passed as ``loadable_variables`` when re-reading the references.
reloaded_variables = [
    "time_counter",
    "time_instant",
    "x_rho",
    "y_rho",
    "x_u",
    "x_v",
    "y_u",
    "y_v",
    "axis_nbounds",
]

# Read the kerchunk parquet references back as a virtual dataset. The result
# is not bound to a name; as the last expression of a notebook cell it is
# only displayed.
virtualizarr.open_virtual_dataset(
    "riomar.parquet",
    filetype="kerchunk",
    indexes={},
    loadable_variables=reloaded_variables,
)