Create RO-Crate from NIRD dataset#
Context#
Purpose#
We are demonstrating how to create a RO-Crate for a dataset by combining data stored in the NIRD archive (Norwegian Research Data Archive) with metadata derived from the RiOMar dataset. This example uses the rocrate Python library and adheres to the RO-Crate v1.1 specification, though it does not include a specific RO-Crate profile.
Requirements#
To run this notebook:
NIRD Archive Account: You will need an account on the NIRD archive.
Development Status: The NIRD archive is currently under development. The notebook will be updated once the archive becomes fully operational.
Access Limitations: At this stage, only members of the development or testing team of the new NIRD archive can execute this notebook.
We will update this notebook as soon as the NIRD archive is fully functional and publicly available.
Contributions#
Notebook#
Anne Fouilloux (author), Simula Research Laboratory (Norway), @annefou
XX (reviewer)
Bibliography and other interesting resources#
rocrate Python package
pip install rocrate
import requests
import json
from rocrate.rocrate import ROCrate
from rocrate.model.person import Person
import pandas as pd
from datetime import datetime
import geopandas
import shapely
import xarray as xr
import numpy as np
import s3fs
Open RiOMar data to get metadata#
# Open the RiOMar demo dataset, served as a Zarr store over HTTPS.
# The metadata extracted below (bounding box, time range) comes from it.
url = "https://data-fair2adapt.ifremer.fr/riomar/small.zarr"
ds = xr.open_zarr(url)
ds
Get metadata from RiOMAR#
Get bounding box in WKT#
Latitude values equal to -1 are fill values and are masked to NaN
# Bounding box of the model grid in WKT. Latitudes equal to -1 are fill
# values, so mask them to NaN before taking the minimum.
masked_lat = ds.nav_lat_rho.where(ds.nav_lat_rho > -1, np.nan)
minlat = masked_lat.min().values
maxlat = ds.nav_lat_rho.max().values
minlon = ds.nav_lon_rho.min().values
maxlon = ds.nav_lon_rho.max().values
print(minlat, maxlat, minlon, maxlon)
# NOTE(review): longitudes are not masked the same way — confirm that -1
# is not also a fill value for nav_lon_rho.
geometry_wkt = shapely.geometry.box(minlon, minlat, maxlon, maxlat).wkt
geometry_wkt
time range
# Temporal extent of the dataset, formatted as YYYY.MM.DD strings.
earliest = ds.time_counter.min().values
latest = ds.time_counter.max().values
ts = pd.to_datetime(str(earliest))
te = pd.to_datetime(str(latest))
date_start = ts.strftime('%Y.%m.%d')
date_end = te.strftime('%Y.%m.%d')
date_start, date_end
Get information for the provenance#
# Provenance of the data: it was produced with the CROCO ocean model;
# the DOI resolves to the archived software release.
# NOTE(review): `prov` is defined but never added to the crate below —
# confirm whether it should be attached (e.g. as "instrument"/"agent").
prov = {
"@id": "https://doi.org/10.5281/zenodo.13898339",
"@type": "SoftwareApplication",
"url": "https://www.croco-ocean.org",
"name": "CROCO, Coastal and Regional Ocean COmmunity",
"version": "CROCO GAMA model v2.0.1 https://doi.org/10.5281/zenodo.13898339"
}
Get Institutions#
# Load the institutions lookup table (name -> ROR id) from an S3 bucket
# behind the Pangeo-EOSC MinIO endpoint; anonymous access is sufficient.
client_kwargs={'endpoint_url': 'https://pangeo-eosc-minioapi.vm.fedcloud.eu/'}
s3 = s3fs.S3FileSystem(anon=True, client_kwargs=client_kwargs)
filename = "afouilloux-fair2adapt/institution-v3.json"
with s3.open(filename, 'r') as file:
    # Parse directly from the open stream instead of read() + json.loads().
    institutions = json.loads(file.read()) if False else json.load(file)
institutions = pd.DataFrame.from_dict(institutions)
institutions.head()
Split name in several columns#
# Expand the nested "name" dict into one column per language/key, and
# prefix the new columns with "name-" so they do not clash with existing
# columns. The expansion is computed once (the original evaluated
# `institutions["name"].apply(pd.Series)` twice).
name_frame = institutions["name"].apply(pd.Series)
rename_cols = {name: "name-" + name for name in name_frame.columns}
rename_cols
institutions = institutions.join(name_frame)
institutions = institutions.rename(columns=rename_cols)
institutions.head()
Access the NIRD Archive via its API#
To be able to access the NIRD Archive via its API, you first need to register to the NIRD Archive and request a token.
Please note that the NIRD archive is still under development and registration to
https://admin.ckan-archive-test.sigma2.no is reserved to administrators. However, once the NIRD archive opens and you obtain your token, you can create a file called
nird.json that will look like this:
{
"token": "xxxx"
}
# Read the NIRD API token from ./nird.json and list all archive records.
try:
    with open("./nird.json") as config:
        token = json.load(config)["token"]
except FileNotFoundError:
    # Fixed: the message referred to the wrong filename ("config.json").
    # NOTE(review): `token` stays unbound in this case and the request
    # below raises NameError — consider aborting here instead.
    print("nird.json not found!")
url = "https://admin.ckan-archive-test.sigma2.no/api/3/action/package_search"
response = requests.get(url, headers={ "Authorization": token})
list_records = response.json()["result"]["results"]
list_records
DOI to find in the NIRD archive#
# Select the archive record matching the dataset DOI.
# If several records share the DOI, the last one wins; if none matches,
# `rec` stays unbound (same as the original generator-and-pass idiom).
doi = "10.82969/2025.hkfi3bn9.v2"
for record in list_records:
    if record["doi"] == doi:
        rec = record
print(rec)
rec
Create a new RO-Crate#
# Start from an empty RO-Crate; entities are attached step by step below.
crate = ROCrate()
Add the license for the RO-Crate#
# Point the root dataset at the record's licence URL, then describe the
# licence itself as a contextual CreativeWork entity.
crate.update_jsonld(
    {
        "@id": "./",
        "license": { "@id": rec["license_url"]},
    })
license_entity = {
    "@id": rec["license_url"],
    "@type": "CreativeWork",
    "name": rec["license_id"],
    "description": rec["license_title"],
}
crate.add_jsonld(license_entity)
Add creators and their Organizations#
We also need the ROR to create an affiliation for each organization found
def get_ror(institutions, org, name_cols=None):
    """Build a JSON-LD Organization entry for *org* from the ROR table.

    Parameters:
        institutions: DataFrame with one or more "name-*" columns and a
            "ror" column holding each organisation's ROR URL.
        org: organisation name to look up.
        name_cols: iterable of column names to search. Defaults to the
            globally computed ``rename_cols.values()`` (kept for backward
            compatibility); passing it explicitly removes the hidden
            dependency on that module-level global.

    Returns:
        The Organization dict (last matching column wins, as before),
        or {} when no column contains *org*.
    """
    if name_cols is None:
        name_cols = rename_cols.values()
    ror = {}
    for col in name_cols:
        # Hoisted: the original evaluated this boolean mask twice per column.
        matches = institutions.loc[institutions[col].isin([org])]
        if not matches.empty:
            ror_url = matches["ror"].values[0]
            ror = {
                "@id": ror_url,
                "@type": "Organization",
                "name": org,
                "url": ror_url
            }
    return ror
# Attach each creator of the record as a schema.org Person, together
# with the ROR entry of their organisation.
list_authors = []
list_orcids = []
for creator in rec["creators"]:
    # Rename the archive's field names to their schema.org equivalents.
    creator['givenName'] = creator.pop('first_name')
    creator['familyName'] = creator.pop('last_name')
    full_name = creator['givenName'] + " " + creator['familyName']
    list_authors.append(full_name)
    ror = get_ror(institutions, creator.pop("organisation"))
    creator["affiliation"] = {"@id": ror["url"]}
    orcid_url = "https://orcid.org/" + creator.pop("orcid")
    print(creator)
    crate.add(Person(crate, orcid_url, properties=creator))
    list_orcids.append({ "@id": orcid_url })
    crate.add_jsonld(ror)
# Add the list of authors
crate.update_jsonld({
    "@id": "./",
    "author": list_orcids,
})
rec.pop("creators")
Add data#
# The "table_of_contents" resource lists every data file in the record.
# Keep the last matching resource (there is normally exactly one); if none
# matches, `toc` stays unbound, as in the original.
for entry in rec["resources"]:
    if entry["name"] == "table_of_contents":
        toc = entry
toc["url"]
# Show all resource names for reference.
for entry in rec["resources"]:
    print(entry["name"])
# Pipe-separated table of files; the first row is dropped.
# NOTE(review): `[0, 0]` repeats the same label — confirm whether `[0, 1]`
# (drop the first two rows) was intended.
contents = pd.read_csv(toc["url"], sep="|")
df = contents.drop([0, 0])
df.head()
df.columns
rec
Prepare Temporal coverage if available#
# ISO-8601 "start/end" interval when the record has temporal metadata.
temporal_coverage = None
if "temporal" in rec:
    start = rec["temporal"]["start"]
    end = rec["temporal"]["end"]
    temporal_coverage = start + "/" + end
Prepare Spatial coverage if available#
def get_geoshape(spatial):
    """Convert the record's spatial metadata to a schema.org GeoShape.

    Only WKT input is supported; any other spatial_type yields None.
    """
    if spatial["spatial_type"] != "wkt":
        # Not implemented yet
        return None
    geo = shapely.wkt.loads(spatial["value"])
    if hasattr(geo, 'geoms'):
        # Multi-geometry: take the first one
        geo = geo.geoms[0]
    # Strip the WKT wrapper so only the coordinate list remains.
    coords = geo.wkt.replace("POLYGON", "").replace("(", "").replace(")", "").strip()
    return {"@type": "GeoShape", "@id": coords, "polygon": coords}
# Compute the spatial coverage once; default to None so the data loop
# below can run even when the record carries no spatial metadata
# (previously `geolocation` was left undefined in that case, and the
# unconditional `rec["spatial"]` display raised KeyError).
geolocation = None
if "spatial" in rec:
    geolocation = get_geoshape(rec["spatial"])
rec.get("spatial")
Go through each data and add it in the RO-Crate#
# "https://admin.ckan-archive-test.sigma2.no" +
for row in df.itertuples(index=False, name=None):
resource2add = {df.columns[i].strip() : row[i] for i, _ in enumerate(row)}
url = resource2add.pop("http_url").strip()
resource2add["name"] = resource2add["filename"].strip()
resource2add["s3_url"] = "s3:/" + resource2add["s3_url"].strip()
resource2add["sdDatePublished"] = rec["cron"]["completed_date"] # Is it correct?
resource2add["dateCreated"] = rec["metadata_created"]
resource2add["dateModified"] = rec["metadata_modified"]
resource2add["contentSize"] = resource2add.pop("size")
resource2add["encodingFormat"] = resource2add.pop("format")
if geolocation is not None:
resource2add["location"] = geolocation
if temporal_coverage is not None:
resource2add["temporalCoverage"] = temporal_coverage
## properties we remove because we do not know where to fit them
resource2add.pop("filename") # was put in "name"
resource2add.pop("fixity") # Checksum?
resource2add.pop("s3_url") # may be ut in different distribution but only valid for datasets and not files
print("properties = ", resource2add)
resource = crate.add_file(url, fetch_remote = False, properties=resource2add)
Add metadata to RO#
Add the title and description#
# Describe the root dataset with the record's title and abstract.
title = rec.pop("title")
description = rec.pop("notes")
root_props = {
    "@id": "./",
    "description": description,
    "title": title,
    "name": title,
}
crate.update_jsonld(root_props)
Add the publisher and creator#
# The record's "publisher" value is unusable, so it is popped and then
# overridden with Sigma2 AS, identified by its Wikidata entry.
publisher_name = rec.pop("publisher") ## WEIRD!
publisher_name = "Sigma2 AS"
publisher_url = "https://www.wikidata.org/wiki/Q12008197"
publisher = {
    "@id": publisher_url,
    "@type": "Organization",
    "name": publisher_name,
    "url": publisher_url,
}
crate.add_jsonld(publisher)
crate.update_jsonld({"@id": "./", "publisher": {"@id": publisher_url}})
Add the creator of the RO-Crate#
# Credit the publisher as creator of the RO-Crate metadata file itself.
crate.update_jsonld({"@id": "ro-crate-metadata.json", "creator": {"@id": publisher_url}})
Add Publication date#
# Parse the release date. The archive usually emits fractional seconds,
# but the strict '%f' format fails without them, so fall back to plain
# ISO-8601 parsing in that case.
release_raw = rec.pop("release_date")
try:
    date_published = datetime.strptime(release_raw, '%Y-%m-%dT%H:%M:%S.%f')
except ValueError:
    date_published = datetime.fromisoformat(release_raw)
crate.update_jsonld({
    "@id": "./",
    "datePublished": date_published.strftime("%Y-%m-%d") ,
})
Add citation#
# Build a human-readable citation string and register the DOI as both
# identifier and landing URL of the dataset.
doi = "https://doi.org/" + rec.pop("doi")
authors = " and ".join(list_authors)
year = date_published.strftime("%Y")
cite_as = f"{authors}, {title}, {publisher_name}, {year}. {doi}."
crate.update_jsonld({
    "@id": "./",
    "identifier": doi,
    "url": doi,
    "cite-as": cite_as,
})
Add studySubject, keywords, etc.#
def find_subject(subject, groups):
    """Return the first group whose "id" equals *subject*, or None."""
    return next((group for group in groups if group["id"] == subject), None)
# Map each subject id onto its INSPIRE topic-category URI via the
# record's group definitions, and flatten the tags into one
# comma-separated keyword string.
study_subjects = []
for subject in rec["subject"]:
    group = find_subject(subject, rec["groups"])
    topic_uri = "http://inspire.ec.europa.eu/metadata-codelist/TopicCategory/" + group["name"]
    study_subjects.append({"@id": topic_uri})
keywords = ", ".join(tag["display_name"] for tag in rec["tags"])
crate.update_jsonld({
    "@id": "./",
    "about": study_subjects,
    "keywords": keywords,
})
Add version#
# Propagate the archive record's version string to the root entity.
crate.update_jsonld({
    "@id": "./",
    "version": rec["version"],
})
Add Language#
# TODO: declare the dataset language once the record exposes it; the
# template below is kept for reference.
#crate.update_jsonld({
#    "@id": ,
#    "@type": "Language",
#})
# Inspect the record fields that have not been mapped into the crate yet.
rec.keys()
rec["theme"]
Write to disk#
# Serialise the crate (data entities plus ro-crate-metadata.json) to the
# "ro-crate" directory on disk.
crate.write("ro-crate")
# Validate the generated crate with the third-party rocrateValidator.
from rocrateValidator import validate as validate
v = validate.validate("ro-crate")
v.validator()