Create RO-Crate from NIRD dataset#
Context#
Purpose#
We are demonstrating how to create a RO-Crate for a dataset by combining data stored in the NIRD archive (Norwegian Research Data Archive) with metadata derived from the RiOMar dataset. This example uses the rocrate Python library and adheres to the RO-Crate v1.1 specification, though it does not include a specific RO-Crate profile.
Requirements#
To run this notebook:
NIRD Archive Account: You will need an account on the NIRD archive.
Development Status: The NIRD archive is currently under development. The notebook will be updated once the archive becomes fully operational.
Access Limitations: At this stage, only members of the development or testing team of the new NIRD archive can execute this notebook.
We will update this notebook as soon as the NIRD archive is fully functional and publicly available.
Contributions#
Notebook#
Anne Fouilloux (author), Simula Research Laboratory (Norway), @annefou
XX (reviewer)
Bibliography and other interesting resources#
rocrate Python package
pip install rocrate
import requests
import json
from rocrate.rocrate import ROCrate
from rocrate.model.person import Person
import pandas as pd
from datetime import datetime
import geopandas
import shapely
import xarray as xr
import numpy as np
import s3fs
Open RiOMar data to get metadata#
# Open the RiOMar demo dataset, served as a Zarr store over HTTPS.
# The metadata extracted below (bounding box, time range) comes from it.
url = "https://data-fair2adapt.ifremer.fr/riomar/small.zarr"
ds = xr.open_zarr(url)
ds
Get metadata from RiOMAR#
Get bounding box in WKT#
Latitude values equal to -1 are fill values and are masked to NaN
# Bounding box of the model grid in WKT. Latitudes equal to -1 are fill
# values, so mask them to NaN before taking the minimum.
masked_lat = ds.nav_lat_rho.where(ds.nav_lat_rho > -1, np.nan)
minlat = masked_lat.min().values
maxlat = ds.nav_lat_rho.max().values
minlon = ds.nav_lon_rho.min().values
maxlon = ds.nav_lon_rho.max().values
print(minlat, maxlat, minlon, maxlon)
# NOTE(review): longitudes are not masked the same way — confirm that -1
# is not also a fill value for nav_lon_rho.
geometry_wkt = shapely.geometry.box(minlon, minlat, maxlon, maxlat).wkt
geometry_wkt
time range
# Temporal extent of the dataset, formatted as YYYY.MM.DD strings.
earliest = ds.time_counter.min().values
latest = ds.time_counter.max().values
ts = pd.to_datetime(str(earliest))
te = pd.to_datetime(str(latest))
date_start = ts.strftime('%Y.%m.%d')
date_end = te.strftime('%Y.%m.%d')
date_start, date_end
Get information for the provenance#
# Provenance of the data: it was produced with the CROCO ocean model;
# the DOI resolves to the archived software release.
# NOTE(review): `prov` is defined but never added to the crate below —
# confirm whether it should be attached (e.g. as "instrument"/"agent").
prov = {
"@id": "https://doi.org/10.5281/zenodo.13898339",
"@type": "SoftwareApplication",
"url": "https://www.croco-ocean.org",
"name": "CROCO, Coastal and Regional Ocean COmmunity",
"version": "CROCO GAMA model v2.0.1 https://doi.org/10.5281/zenodo.13898339"
}
Get Institutions#
# Load the institutions lookup table (name -> ROR id) from an S3 bucket
# behind the Pangeo-EOSC MinIO endpoint; anonymous access is sufficient.
client_kwargs={'endpoint_url': 'https://pangeo-eosc-minioapi.vm.fedcloud.eu/'}
s3 = s3fs.S3FileSystem(anon=True, client_kwargs=client_kwargs)
filename = "afouilloux-fair2adapt/institution-v3.json"
with s3.open(filename, 'r') as file:
    # Parse directly from the open stream instead of read() + json.loads().
    institutions = json.loads(file.read()) if False else json.load(file)
institutions = pd.DataFrame.from_dict(institutions)
institutions.head()
Split name in several columns#
# Expand the nested "name" dict into one column per language/key, and
# prefix the new columns with "name-" so they do not clash with existing
# columns. The expansion is computed once (the original evaluated
# `institutions["name"].apply(pd.Series)` twice).
name_frame = institutions["name"].apply(pd.Series)
rename_cols = {name: "name-" + name for name in name_frame.columns}
rename_cols
institutions = institutions.join(name_frame)
institutions = institutions.rename(columns=rename_cols)
institutions.head()
Access the NIRD Archive via its API#
To be able to access the NIRD Archive via its API, you first need to register to the NIRD Archive and request a token.
Please note that the NIRD archive is still under development and registration to
https://admin.ckan-archive-test.sigma2.no is reserved to administrators. However, once the NIRD archive opens and you obtain your token, you can create a file called
nird.json that will look like this:
{
"token": "xxxx"
}
# Read the NIRD API token from ./nird.json and list all archive records.
try:
    with open("./nird.json") as config:
        token = json.load(config)["token"]
except FileNotFoundError:
    # Fixed: the message referred to the wrong filename ("config.json").
    # NOTE(review): `token` stays unbound in this case and the request
    # below raises NameError — consider aborting here instead.
    print("nird.json not found!")
url = "https://admin.ckan-archive-test.sigma2.no/api/3/action/package_search"
response = requests.get(url, headers={ "Authorization": token})
list_records = response.json()["result"]["results"]
list_records
DOI to find in the NIRD archive#
# Select the archive record matching the dataset DOI.
# If several records share the DOI, the last one wins; if none matches,
# `rec` stays unbound (same as the original generator-and-pass idiom).
doi = "10.82969/2025.hkfi3bn9.v2"
for record in list_records:
    if record["doi"] == doi:
        rec = record
print(rec)
rec
Create a new RO-Crate#
# Start from an empty RO-Crate; entities are attached step by step below.
crate = ROCrate()
Add the license for the RO-Crate#
# Point the root dataset at the record's licence URL, then describe the
# licence itself as a contextual CreativeWork entity.
crate.update_jsonld(
    {
        "@id": "./",
        "license": { "@id": rec["license_url"]},
    })
license_entity = {
    "@id": rec["license_url"],
    "@type": "CreativeWork",
    "name": rec["license_id"],
    "description": rec["license_title"],
}
crate.add_jsonld(license_entity)
Add creators and their Organizations#
We also need the ROR to create an affiliation for each organization found
def get_ror(institutions, org, name_cols=None):
    """Build a JSON-LD Organization entry for *org* from the ROR table.

    Parameters:
        institutions: DataFrame with one or more "name-*" columns and a
            "ror" column holding each organisation's ROR URL.
        org: organisation name to look up.
        name_cols: iterable of column names to search. Defaults to the
            globally computed ``rename_cols.values()`` (kept for backward
            compatibility); passing it explicitly removes the hidden
            dependency on that module-level global.

    Returns:
        The Organization dict (last matching column wins, as before),
        or {} when no column contains *org*.
    """
    if name_cols is None:
        name_cols = rename_cols.values()
    ror = {}
    for col in name_cols:
        # Hoisted: the original evaluated this boolean mask twice per column.
        matches = institutions.loc[institutions[col].isin([org])]
        if not matches.empty:
            ror_url = matches["ror"].values[0]
            ror = {
                "@id": ror_url,
                "@type": "Organization",
                "name": org,
                "url": ror_url
            }
    return ror
# Attach each creator of the record as a schema.org Person, together
# with the ROR entry of their organisation.
list_authors = []
list_orcids = []
for creator in rec["creators"]:
    # Rename the archive's field names to their schema.org equivalents.
    creator['givenName'] = creator.pop('first_name')
    creator['familyName'] = creator.pop('last_name')
    full_name = creator['givenName'] + " " + creator['familyName']
    list_authors.append(full_name)
    ror = get_ror(institutions, creator.pop("organisation"))
    creator["affiliation"] = {"@id": ror["url"]}
    orcid_url = "https://orcid.org/" + creator.pop("orcid")
    print(creator)
    crate.add(Person(crate, orcid_url, properties=creator))
    list_orcids.append({ "@id": orcid_url })
    crate.add_jsonld(ror)
# Add the list of authors
crate.update_jsonld({
    "@id": "./",
    "author": list_orcids,
})
rec.pop("creators")
Add data#
# The "table_of_contents" resource lists every data file in the record.
# Keep the last matching resource (there is normally exactly one); if none
# matches, `toc` stays unbound, as in the original.
for entry in rec["resources"]:
    if entry["name"] == "table_of_contents":
        toc = entry
toc["url"]
# Show all resource names for reference.
for entry in rec["resources"]:
    print(entry["name"])
# Pipe-separated table of files; the first row is dropped.
# NOTE(review): `[0, 0]` repeats the same label — confirm whether `[0, 1]`
# (drop the first two rows) was intended.
contents = pd.read_csv(toc["url"], sep="|")
df = contents.drop([0, 0])
df.head()
df.columns
rec
Prepare Temporal coverage if available#
# ISO-8601 "start/end" interval when the record has temporal metadata.
temporal_coverage = None
if "temporal" in rec:
    start = rec["temporal"]["start"]
    end = rec["temporal"]["end"]
    temporal_coverage = start + "/" + end
Prepare Spatial coverage if available#
def get_geoshape(spatial):
    """Convert the record's spatial metadata to a schema.org GeoShape.

    Only WKT input is supported; any other spatial_type yields None.
    """
    if spatial["spatial_type"] != "wkt":
        # Not implemented yet
        return None
    geo = shapely.wkt.loads(spatial["value"])
    if hasattr(geo, 'geoms'):
        # Multi-geometry: take the first one
        geo = geo.geoms[0]
    # Strip the WKT wrapper so only the coordinate list remains.
    coords = geo.wkt.replace("POLYGON", "").replace("(", "").replace(")", "").strip()
    return {"@type": "GeoShape", "@id": coords, "polygon": coords}
# Compute the spatial coverage once; default to None so the data loop
# below can run even when the record carries no spatial metadata
# (previously `geolocation` was left undefined in that case, and the
# unconditional `rec["spatial"]` display raised KeyError).
geolocation = None
if "spatial" in rec:
    geolocation = get_geoshape(rec["spatial"])
rec.get("spatial")
Go through each data and add it in the RO-Crate#
# "https://admin.ckan-archive-test.sigma2.no" +
for row in df.itertuples(index=False, name=None):
resource2add = {df.columns[i].strip() : row[i] for i, _ in enumerate(row)}
url = resource2add.pop("http_url").strip()
resource2add["name"] = resource2add["filename"].strip()
resource2add["s3_url"] = "s3:/" + resource2add["s3_url"].strip()
resource2add["sdDatePublished"] = rec["cron"]["completed_date"] # Is it correct?
resource2add["dateCreated"] = rec["metadata_created"]
resource2add["dateModified"] = rec["metadata_modified"]
resource2add["contentSize"] = resource2add.pop("size")
resource2add["encodingFormat"] = resource2add.pop("format")
if geolocation is not None:
resource2add["location"] = geolocation
if temporal_coverage is not None:
resource2add["temporalCoverage"] = temporal_coverage
## properties we remove because we do not know where to fit them
resource2add.pop("filename") # was put in "name"
resource2add.pop("fixity") # Checksum?
resource2add.pop("s3_url") # may be ut in different distribution but only valid for datasets and not files
print("properties = ", resource2add)
resource = crate.add_file(url, fetch_remote = False, properties=resource2add)
Add metadata to RO#
Add the title and description#
# Describe the root dataset with the record's title and abstract.
title = rec.pop("title")
description = rec.pop("notes")
root_props = {
    "@id": "./",
    "description": description,
    "title": title,
    "name": title,
}
crate.update_jsonld(root_props)
Add the publisher and creator#
# The record's "publisher" value is unusable, so it is popped and then
# overridden with Sigma2 AS, identified by its Wikidata entry.
publisher_name = rec.pop("publisher") ## WEIRD!
publisher_name = "Sigma2 AS"
publisher_url = "https://www.wikidata.org/wiki/Q12008197"
publisher = {
    "@id": publisher_url,
    "@type": "Organization",
    "name": publisher_name,
    "url": publisher_url,
}
crate.add_jsonld(publisher)
crate.update_jsonld({"@id": "./", "publisher": {"@id": publisher_url}})
Add the creator of the RO-Crate#
# Credit the publisher as creator of the RO-Crate metadata file itself.
crate.update_jsonld({"@id": "ro-crate-metadata.json", "creator": {"@id": publisher_url}})
Add Publication date#
# Parse the release date. The archive usually emits fractional seconds,
# but the strict '%f' format fails without them, so fall back to plain
# ISO-8601 parsing in that case.
release_raw = rec.pop("release_date")
try:
    date_published = datetime.strptime(release_raw, '%Y-%m-%dT%H:%M:%S.%f')
except ValueError:
    date_published = datetime.fromisoformat(release_raw)
crate.update_jsonld({
    "@id": "./",
    "datePublished": date_published.strftime("%Y-%m-%d") ,
})
Add citation#
# Build a human-readable citation string and register the DOI as both
# identifier and landing URL of the dataset.
doi = "https://doi.org/" + rec.pop("doi")
authors = " and ".join(list_authors)
year = date_published.strftime("%Y")
cite_as = f"{authors}, {title}, {publisher_name}, {year}. {doi}."
crate.update_jsonld({
    "@id": "./",
    "identifier": doi,
    "url": doi,
    "cite-as": cite_as,
})
Add studySubject, keywords, etc.#
def find_subject(subject, groups):
    """Return the first group whose "id" equals *subject*, or None."""
    return next((group for group in groups if group["id"] == subject), None)
# Map each subject id onto its INSPIRE topic-category URI via the
# record's group definitions, and flatten the tags into one
# comma-separated keyword string.
study_subjects = []
for subject in rec["subject"]:
    group = find_subject(subject, rec["groups"])
    topic_uri = "http://inspire.ec.europa.eu/metadata-codelist/TopicCategory/" + group["name"]
    study_subjects.append({"@id": topic_uri})
keywords = ", ".join(tag["display_name"] for tag in rec["tags"])
crate.update_jsonld({
    "@id": "./",
    "about": study_subjects,
    "keywords": keywords,
})
Add version#
# Propagate the archive record's version string to the root entity.
crate.update_jsonld({
    "@id": "./",
    "version": rec["version"],
})
Add Language#
# TODO: declare the dataset language once the record exposes it; the
# template below is kept for reference.
#crate.update_jsonld({
#    "@id": ,
#    "@type": "Language",
#})
# Inspect the record fields that have not been mapped into the crate yet.
rec.keys()
rec["theme"]
Write to disk#
# Serialise the crate (data entities plus ro-crate-metadata.json) to the
# "ro-crate" directory on disk.
crate.write("ro-crate")
# Validate the generated crate with the third-party rocrateValidator.
from rocrateValidator import validate as validate
v = validate.validate("ro-crate")
v.validator()