Skip to content

Datapackage from Dataverse

ODAM: Datapackage from a Dataverse repository


Example of an R session showing how to retrieve an ODAM datapackage by keyword from Data INRAE [1], a Dataverse repository.

[1] Institut National de Recherche pour l'Agriculture, l'Alimentation et l'Environnement. (2018). Data INRAE. https://doi.org/10.14758/9T8G-WJ20


Introduction to Dataverse with R

# Packages required by this session; any that are not installed yet are
# installed from CRAN before being loaded.
packages <- c('httr', 'jsonlite', 'jsonvalidate', 'dataverse')

# Work out the missing packages once and reuse the result for both the test
# and the installation call.
missing_packages <- setdiff(packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  # Use HTTPS so that the packages are not downloaded over a plain-text link.
  install.packages(missing_packages, repos = "https://cran.rstudio.com")
}

library(dataverse)
library(httr)
library(jsonvalidate)
library(jsonlite)

# User Parameters
# Host name of the Dataverse server to query.
dataverse_server <- "data.inra.fr"
# Keyword used to search for datasets.
dataset_keyword <- "tomato"

# URL of the JSON schema used further down to validate the datapackage.
url_schema  <- 'https://inrae.github.io/ODAM/json-schema/odam-data-package.json'

# Define the dataverse server as an env. variable
Sys.setenv("DATAVERSE_SERVER" = dataverse_server)

# Data Search
# `type` has to be given by name: dataverse_search() takes `...` as its first
# formal, so an unnamed "dataset" would be absorbed into the query terms
# instead of restricting the type of the results.
ds <- dataverse::dataverse_search(dataset_keyword, type = "dataset")

# Stop when the search returned nothing. NROW() also copes with a result
# that has no dimensions (e.g. an empty list).
if (is.null(ds) || NROW(ds) == 0) {
  stop("No dataset found")
}

# Check that at least one result has the right type, then keep only those
# rows. %in% never yields NA, so rows with a missing type are simply dropped.
is_dataset <- ds$type %in% "dataset"
if (!any(is_dataset)) {
  stop("No dataset found with the right type")
}
ds <- ds[is_dataset, , drop = FALSE]

# search for 'datapackage.json'
# For each dataset found, fetch its file list once and test whether one of the
# files is labelled "datapackage.json". The labels are read directly from the
# file entries, so datasets whose entries do not all carry the same fields are
# handled as well.
has_datapackage <- vapply(
  ds$global_id,
  function(global_id) {
    files <- dataverse::dataset_files(global_id)
    if (length(files) == 0) {
      return(FALSE)
    }
    labels <- unlist(lapply(files, function(f) f[["label"]]))
    any(labels %in% "datapackage.json")
  },
  logical(1),
  USE.NAMES = FALSE
)

# Global ids of the datasets that contain a datapackage.json file
ids <- ds$global_id[has_datapackage]

# check if successful search
if (length(ids) == 0) {
  stop("No datapackage found")
}

# if many, take the first
id <- ids[1]
cat(" Dataset found : ",id,"\n")

# One row per file of the selected dataset, one column per metadata field
dflist <-  as.data.frame(t(simplify2array(dataverse::dataset_files(id))))

# Get the datapackage.json content file
# Row index of the file labelled "datapackage.json". The labels must be on the
# left-hand side of %in%: the reverse test gives a single TRUE and would
# therefore always select the first file of the dataset.
idx <- which(dflist$label %in% "datapackage.json")[1]
if (is.na(idx)) {
  stop("No datapackage found")
}
file_id <- dflist$dataFile[[idx]]$id

# Check the downloaded content before converting it: rawToChar() fails on
# NULL, so the check has to come first to be of any use.
dp_raw <- dataverse::get_file(file_id)
if (is.null(dp_raw) || length(dp_raw) == 0) {
  stop("the datapackage returned as null")
}
dp_json <- rawToChar(dp_raw)

# Get the ODAM data package schema
# NOTE(review): sslversion = 6 looks like libcurl's TLS 1.2 constant - confirm
# it is still needed; ssl_verifypeer = 1 keeps certificate verification on.
response <- httr::GET(
  url_schema,
  httr::config(sslversion = 6, ssl_verifypeer = 1)
)
if (httr::status_code(response) != 200) {
  stop("Error while getting the ODAM data package schema")
}

# Decode the body explicitly as UTF-8 rather than letting httr guess the
# encoding (and emit a message when none is declared).
schema <- httr::content(response, as = "text", encoding = "UTF-8")

# Validate the JSON against the ODAM data package schema
# isTRUE() makes sure anything other than a single TRUE is treated as invalid.
if (!isTRUE(jsonvalidate::json_validate(dp_json, schema))) {
  stop("the returned datapackage is not a valid ODAM datapackage")
}

# Convert the validated JSON text into R objects
dp <- jsonlite::fromJSON(dp_json)

# View metadata
# Print, field by field, the entry of the file list found at position idx
metadata_fields <- ls(dflist)
sapply(metadata_fields, function(field) dflist[[field]][[idx]])

# Do something with dp ...
# Here: show a few descriptive columns of the resources table
resource_columns <- c("name", "title", "cv_term")
dp$resources[resource_columns]
$`categories`
[1] "data"        "datapackage" "odam"        "TSV"        

$dataFile
$dataFile$`id`
[1] 100141

$dataFile$persistentId
[1] "doi:10.15454/95JUTK/SLKZUA"

$dataFile$pidURL
[1] "https://doi.org/10.15454/95JUTK/SLKZUA"

$dataFile$filename
[1] "datapackage.json"

$dataFile$contentType
[1] "application/json"

$dataFile$filesize
[1] 46674

$dataFile$description
[1] "ODAM datapackage based on JSON Schema"

$dataFile$storageIdentifier
[1] "s3://prod-datainra:17352703ae5-a43a037ccfaa"

$dataFile$rootDataFileId
[1] -1

$dataFile$md5
[1] "16fc7594899b640028b3d9c634e85d1e"

$dataFile$checksum
$dataFile$checksum$`type`
[1] "MD5"

$dataFile$checksum$value
[1] "16fc7594899b640028b3d9c634e85d1e"



$datasetVersionId
[1] 261953

$description
[1] "ODAM datapackage based on JSON Schema"

$label
[1] "datapackage.json"

$restricted
[1] FALSE

$version
[1] 1

                name                                   title    cv_term.label                               cv_term.path
1             plants                          Plant features      whole plant  http://purl.obolibrary.org/obo/PO_0000003
2            samples                         Sample features organ harvesting http://purl.obolibrary.org/obo/OBI_1110046
3           aliquots                       Aliquots features organ harvesting http://purl.obolibrary.org/obo/OBI_1110046
4    cellwall_metabo      Cell wall Compound quantifications  chemical entity http://purl.obolibrary.org/obo/CHEBI_24431
5  cellwall_metaboFW Cell Wall Compound quantifications (FW)  chemical entity http://purl.obolibrary.org/obo/CHEBI_24431
6           activome                       Activome Features  chemical entity http://purl.obolibrary.org/obo/CHEBI_24431
7              pools                Pools of remaining pools organ harvesting http://purl.obolibrary.org/obo/OBI_1110046
8         qMS_metabo             MS Compounds quantification  chemical entity http://purl.obolibrary.org/obo/CHEBI_24431
9        qNMR_metabo            NMR Compounds quantification  chemical entity http://purl.obolibrary.org/obo/CHEBI_24431
10    plato_hexosesP                       Hexoses Phosphate  chemical entity http://purl.obolibrary.org/obo/CHEBI_24431
11         lipids_AG                               Lipids AG  chemical entity http://purl.obolibrary.org/obo/CHEBI_24431
12         AminoAcid                             Amino Acids  chemical entity http://purl.obolibrary.org/obo/CHEBI_24431