R and Python session examples
Session example with R using a datapackage¶
In this example, the datapackage is directly generated from an ODAM repository.
But it is also possible to retrieve it from a Dataverse repository from a keyword. See an Example
json_validate: Validate a json file
packages <- c('httr', 'jsonlite', 'jsonvalidate' )
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
install.packages(setdiff(packages, rownames(installed.packages())),
repos='http://cran.rstudio.com')
}
library(httr)
library(jsonvalidate)
library(jsonlite)
options(width=256)
# URL of the ODAM data repository
odam_url <- 'https://pmb-bordeaux.fr/getdata'
# ID of the dataset
dataset <- 'frim1'
# Get the ODAM data package schema
URL <- 'https://inrae.github.io/ODAM/json-schema/odam-data-package.json'
response <- httr::GET(URL, config(sslversion=6,ssl_verifypeer=1))
schema <- httr::content(response, as ='text')
# Get structural metadata information in datapackage format (json) for a dataset
# directly from its ODAM repository. As the option links is set to 1, we will have
# the absolute reference for data files (see 'path' below)
URL <- sprintf('%s/query/%s/datapackage?links=1', odam_url, dataset)
response <- httr::GET(URL, config(sslversion=6,ssl_verifypeer=1))
dp_json <- httr::content(response, as ='text')
# Validate the JSON against the ODAM data package schema
jsonvalidate::json_validate(dp_json, schema)
[1] TRUE
# Parse the JSON object to a data.frame
dp <- jsonlite::fromJSON(dp_json)
# View licenses
print(dp$licenses, right=F)
name path title
1 ODC-BY-1.0 https://opendatacommons.org/licenses/by/ Open Data Commons Attribution License 1.0
resources <- dp$resources
# List some metadata about the dataset
resources[ c("name", "title", "identifier", "obtainedFrom", "joinkey", "path") ]
name title identifier obtainedFrom joinkey path
1 plants Plant features PlantID <NA> <NA> https://pmb-bordeaux.fr/getdata/tsv/frim1/plants
2 samples Sample features SampleID plants PlantID https://pmb-bordeaux.fr/getdata/tsv/frim1/samples
3 aliquots Aliquots features AliquotID samples SampleID https://pmb-bordeaux.fr/getdata/tsv/frim1/aliquots
4 cellwall_metabo Cell wall Compound quantifications AliquotID aliquots AliquotID https://pmb-bordeaux.fr/getdata/tsv/frim1/cellwall_metabo
5 cellwall_metaboFW Cell Wall Compound quantifications (FW) AliquotID aliquots AliquotID https://pmb-bordeaux.fr/getdata/tsv/frim1/cellwall_metaboFW
6 activome Activome Features AliquotID aliquots AliquotID https://pmb-bordeaux.fr/getdata/tsv/frim1/activome
7 pools Pools of remaining pools PoolID samples SampleID https://pmb-bordeaux.fr/getdata/tsv/frim1/pools
8 qMS_metabo MS Compounds quantification PoolID pools PoolID https://pmb-bordeaux.fr/getdata/tsv/frim1/qMS_metabo
9 qNMR_metabo NMR Compounds quantification PoolID pools PoolID https://pmb-bordeaux.fr/getdata/tsv/frim1/qNMR_metabo
10 plato_hexosesP Hexoses Phosphate AliquotID aliquots AliquotID https://pmb-bordeaux.fr/getdata/tsv/frim1/plato_hexosesP
11 lipids_AG Lipids AG AliquotID aliquots AliquotID https://pmb-bordeaux.fr/getdata/tsv/frim1/lipids_AG
12 AminoAcid Amino Acids AliquotID aliquots AliquotID https://pmb-bordeaux.fr/getdata/tsv/frim1/AminoAcid
# Read the 'samples' data file - index=2
index <- 2
M <- read.table(url(resources[ "path" ]$path[index]),
header= resources$dialect$header[index],
sep=resources$dialect$delimiter[index] )
# Display an extract
M[1:10,]
SampleID PlantID Truss DevStage FruitAge HarvestDate HarvestHour FruitPosition FruitDiameter FruitHeight FruitFW FruitDW DW
1 1 A26 T5 FF.01 08DPA 40379 0.5 2 NA NA 0.72 0.090216 NA
2 1 C2 T5 FF.01 08DPA 40379 0.5 3 NA NA 0.56 0.070168 NA
3 1 D15 T5 FF.01 08DPA 40379 0.5 4 NA NA 0.78 0.097734 NA
4 1 E19 T5 FF.01 08DPA 40379 0.5 4 NA NA 0.66 0.082698 NA
5 1 E34 T5 FF.01 08DPA 40379 0.5 3 NA NA 0.7 0.087710 NA
6 1 E38 T5 FF.01 08DPA 40379 0.5 3 NA NA 0.7 0.087710 NA
7 1 H29 T5 FF.01 08DPA 40379 0.5 5 NA NA 1.24 0.155372 NA
8 1 H34 T5 FF.01 08DPA 40379 0.5 4 NA NA 0.86 0.107758 NA
9 1 H52 T5 FF.01 08DPA 40379 0.5 5 NA NA 0.77 0.096481 NA
10 1 H61 T5 FF.01 08DPA 40379 0.5 5 NA NA 0.56 0.070168 NA
# Get the categories for the 'samples' data subset
categories <- resources$schema$categories[[index]]
categories
# List the 'quantitative' variables for the 'samples' data subset
categories$fields[ categories$name == "quantitative" ][[1]]
name fields
1 identifier SampleID
2 factor DevStage, FruitAge
3 quantitative FruitDiameter, FruitHeight, FruitFW, FruitDW, DW
4 qualitative Truss
[1] "FruitDiameter" "FruitHeight" "FruitFW" "FruitDW" "DW"
# Get the list of data subsets that were obtained from the 'samples'
index <- 2
subsets <- resources$schema$foreignKeys[[index]]$reference[,1]
if (! is.null(subsets) )
resources[ resources$name %in% subsets,
c("name", "title", "identifier", "obtainedFrom") ]
# Get the list of data subsets that were obtained from the 'aliquots'
index <- 3
subsets <- resources$schema$foreignKeys[[index]]$reference[,1]
if (! is.null(subsets) )
resources[ resources$name %in% subsets,
c("name", "title", "identifier", "obtainedFrom") ]
# List of data subsets that were obtained from the 'samples'
name title identifier obtainedFrom
3 aliquots Aliquots features AliquotID samples
7 pools Pools of remaining pools PoolID samples
# List of data subsets that were obtained from the 'aliquots'
name title identifier obtainedFrom
4 cellwall_metabo Cell wall Compound quantifications AliquotID aliquots
5 cellwall_metaboFW Cell Wall Compound quantifications (FW) AliquotID aliquots
6 activome Activome Features AliquotID aliquots
10 plato_hexosesP Hexoses Phosphate AliquotID aliquots
11 lipids_AG Lipids AG AliquotID aliquots
12 AminoAcid Amino Acids AliquotID aliquots
# getMergedDataset :
# Returns the data subset resulting from the merging of each data subset from which
# the previous data subset was obtained (i.e. going back up the chain of linking
# data subsets)
# Method : Performs a 'Rigth Join' between the different tables (data subsets)
getMergedDataset <- function( resources, subset, verbose=FALSE )
{
M <- NULL
s <- subset
while( sum(resources$name %in% s) ) {
if (verbose) cat ("data subset ",s,"\n")
# index for subset s
i1 <- which(resources$name %in% s)
# get metadata from subset s
m1 <- resources[i1, ]
# Read data for subset s
M1 <- read.table(url(resources[ "path" ]$path[i1]),
header= resources$dialect$header[i1],
sep=resources$dialect$delimiter[i1] )
if (! is.null(M))
# Merge the subset s - Right Join
M <- merge(M1, M, by = m1$identifier, all.y=TRUE)
else
M <- M1
# the data subset from which it was obtained.
s <- m1$obtainedFrom
# End of the chain ?
if (is.na(s)) break
}
unique(M)
}
# Merge the 'plants', 'samples', 'aliquots' and 'activome' data subsets
M <- getMergedDataset(resources, 'activome', TRUE)
dim(M)
# Display column names of the merged data subset
colnames(M)
data subset activome
data subset aliquots
data subset samples
data subset plants
[1] 1272 55
[1] "PlantID" "Rank" "PlantNum" "Treatment" "SampleID"
[6] "Truss" "DevStage" "FruitAge" "HarvestDate" "HarvestHour"
[11] "FruitPosition" "FruitDiameter" "FruitHeight" "FruitFW" "FruitDW"
[16] "DW" "AliquotID" "PGM" "cFBPase" "PyrK"
[21] "CitS" "PFP" "Aconitase" "PFK" "FruK"
[26] "pFBPase" "GluK" "NAD_ISODH" "Enolase" "NADP_ISODH"
[31] "PEPC" "Aldolase" "Succ_CoA_ligase" "NAD_MalDH" "AlaAT"
[36] "Fumarase" "AspAT" "NADP_GluDH" "NAD_GAPDH" "NADP_GAPDH"
[41] "NAD_GluDH" "TPI" "PGK" "Neutral_Inv" "Acid_Inv"
[46] "G6PDH" "UGPase" "SuSy" "NAD_ME" "ShiDH"
[51] "NADP_ME" "PGI" "StarchS" "AGPase" "SPS"
Session example with Python using a datapackage¶
datapackage-py
from datapackage import Package
import requests
import json
import jsonschema
import pandas as pd
# URL of the ODAM data repository
odam_url = 'https://pmb-bordeaux.fr/getdata'
# ID of the dataset
dataset = 'frim1'
# Read the datapackage.json as a dict
url = odam_url + '/query/' + dataset+'/datapackage?links=1'
dp = Package(url)
# Read the ODAM data package schema as a dict
url_schema = 'https://inrae.github.io/ODAM/json-schema/odam-data-package.json'
response = requests.get(url_schema)
schema = json.loads(response.text)
# if the package is a valid ODAM data package, then shows its resource names
if jsonschema.Draft7Validator(schema).is_valid(dp.descriptor) == True:
print(dp.resource_names)
else:
print("Error: datapackage is not a valid ODAM datapackage")
['plants', 'samples', 'aliquots', 'cellwall_metabo', 'cellwall_metaboFW',
'activome', 'pools', 'qMS_metabo', 'qNMR_metabo', 'plato_hexosesP', 'lipids_AG',
'AminoAcid']
# Get the 'plants' subsets into a data.frame
id = dp.resource_names.index('plants')
df = pd.DataFrame(dp.resources[id].read(keyed=True))
# Print the data.frame
df
PlantID Rank PlantNum Treatment
0 A1 A 1 Control
1 A2 A 2 Control
2 A3 A 3 Control
3 A4 A 4 Control
4 A5 A 5 Control
.. ... ... ... ...
547 G65 G 548 WaterStress
548 G66 G 549 WaterStress
549 G67 G 550 WaterStress
550 G68 G 551 WaterStress
551 G69 G 552 WaterStress
[552 rows x 4 columns]
# Get the headers of the first resource (id=0)
dp.resources[0]._Resource__table_options
{'scheme': None,
'format': 'csv',
'encoding': 'utf-8',
'pick_fields': None,
'skip_fields': None,
'pick_rows': [],
'skip_rows': [],
'delimiter': '\t',
'doublequote': False,
'lineterminator': '\n',
'skipinitialspace': True}
# Get the resource descriptor of the 'samples'
id = dp.resource_names.index('samples')
res = dp.resources[id]._Resource__current_descriptor
# List some attributes
for x in ['path', 'profile', 'name', 'title', 'identifier', 'obtainedFrom', 'joinkey']:
print("%s: %s" % (x, res[x]))
path: https://pmb-bordeaux.fr/getdata/tsv/frim1/samples
profile: https://inrae.github.io/ODAM/json-schema/odam-data-resource.json
name: samples
title: Sample features
identifier: SampleID
obtainedFrom: plants
joinkey: PlantID
# Get the categories of the 'samples'
pd.DataFrame(res['schema']['categories'])
name fields
0 identifier [SampleID]
1 factor [DevStage, FruitAge]
2 quantitative [FruitDiameter, FruitHeight, FruitFW, FruitDW,...
3 qualitative [Truss]
# Get the data subsets that were obtained from the 'samples'
print(pd.DataFrame(res['schema']['foreignKeys']))
the_list = []
for x in res['schema']['foreignKeys']:
the_list += [ x['reference']['resource'] ]
the_list
fields reference
0 SampleID {'resource': 'aliquots', 'fields': 'SampleID'}
1 SampleID {'resource': 'pools', 'fields': 'SampleID'}
['aliquots', 'pools']
# Merge the 'samples' subset with the data subset from which it was obtained
id = dp.resource_names.index('samples')
res = dp.resources[id]._Resource__current_descriptor
id_from = dp.resource_names.index(res['obtainedFrom'])
df_from = pd.DataFrame(dp.resources[id_from].read(keyed=True))
df = pd.DataFrame(dp.resources[id].read(keyed=True))
# Right join
merged_inner = pd.merge(left=df_from, right=df, how='right',
left_on=res['joinkey'], right_on=res['joinkey'])
merged_inner
PlantID Rank PlantNum Treatment ... FruitHeight FruitFW FruitDW DW
0 A1 A 1 Control ... 10.42 0.81 0.098091 None
1 A1 A 1 Control ... 31.77 21.43 1.470098 None
2 A1 A 1 Control ... 46.85 64.05 4.18887 None
3 A1 A 1 Control ... 43.35 66.64 3.338664 None
4 A2 A 2 Control ... 44.93 66.98 3.355698 None
... ... ... ... ... ... ... ... ...
1282 G67 G 550 WaterStress ... 45.72 70.23 None None
1283 G68 G 551 WaterStress ... 36.56 38.6 None None
1284 G69 G 552 WaterStress ... 39.1 45.47 2.63726 None
1285 G69 G 552 WaterStress ... 46.59 65.73 3.227343 None
1286 G69 G 552 WaterStress ... 40.4 58.51 4.908989 None
[1287 rows x 16 columns]