Skip to content

R and Python session examples

Session example with R using a datapackage

In this example, the datapackage is directly generated from an ODAM repository.
But it is also possible to retrieve it from a Dataverse repository from a keyword. See an Example

json_validate: Validate a json file

packages <- c('httr', 'jsonlite', 'jsonvalidate' )
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
  install.packages(setdiff(packages, rownames(installed.packages())), 
                    repos='http://cran.rstudio.com')
}

library(httr)
library(jsonvalidate)
library(jsonlite)

options(width=256)

# URL of the ODAM data repository
odam_url <- 'https://pmb-bordeaux.fr/getdata'

# ID of the dataset
dataset <- 'frim1'

# Get the ODAM data package schema
URL <- 'https://inrae.github.io/ODAM/json-schema/odam-data-package.json'
response <- httr::GET(URL, config(sslversion=6,ssl_verifypeer=1))
schema <- httr::content(response, as ='text')

# Get structural metadata information in datapackage format (json) for a dataset
# directly from its ODAM repository. As the option links is set to 1, we will have
# the absolute reference for data files (see 'path' below)
URL <- sprintf('%s/query/%s/datapackage?links=1', odam_url, dataset)
response <- httr::GET(URL, config(sslversion=6,ssl_verifypeer=1))
dp_json <- httr::content(response, as ='text')

# Validate the JSON against the ODAM data package schema
jsonvalidate::json_validate(dp_json, schema)
[1] TRUE

# Parse the JSON object to a data.frame
dp <- jsonlite::fromJSON(dp_json)

# View licenses
print(dp$licenses, right=F)
  name       path                                     title                                    
1 ODC-BY-1.0 https://opendatacommons.org/licenses/by/ Open Data Commons Attribution License 1.0

resources <- dp$resources

# List some metadata about the dataset
resources[ c("name", "title", "identifier", "obtainedFrom", "joinkey", "path") ]
                name                                   title identifier obtainedFrom   joinkey                                                        path
1             plants                          Plant features    PlantID         <NA>      <NA>            https://pmb-bordeaux.fr/getdata/tsv/frim1/plants
2            samples                         Sample features   SampleID       plants   PlantID           https://pmb-bordeaux.fr/getdata/tsv/frim1/samples
3           aliquots                       Aliquots features  AliquotID      samples  SampleID          https://pmb-bordeaux.fr/getdata/tsv/frim1/aliquots
4    cellwall_metabo      Cell wall Compound quantifications  AliquotID     aliquots AliquotID   https://pmb-bordeaux.fr/getdata/tsv/frim1/cellwall_metabo
5  cellwall_metaboFW Cell Wall Compound quantifications (FW)  AliquotID     aliquots AliquotID https://pmb-bordeaux.fr/getdata/tsv/frim1/cellwall_metaboFW
6           activome                       Activome Features  AliquotID     aliquots AliquotID          https://pmb-bordeaux.fr/getdata/tsv/frim1/activome
7              pools                Pools of remaining pools     PoolID      samples  SampleID             https://pmb-bordeaux.fr/getdata/tsv/frim1/pools
8         qMS_metabo             MS Compounds quantification     PoolID        pools    PoolID        https://pmb-bordeaux.fr/getdata/tsv/frim1/qMS_metabo
9        qNMR_metabo            NMR Compounds quantification     PoolID        pools    PoolID       https://pmb-bordeaux.fr/getdata/tsv/frim1/qNMR_metabo
10    plato_hexosesP                       Hexoses Phosphate  AliquotID     aliquots AliquotID    https://pmb-bordeaux.fr/getdata/tsv/frim1/plato_hexosesP
11         lipids_AG                               Lipids AG  AliquotID     aliquots AliquotID         https://pmb-bordeaux.fr/getdata/tsv/frim1/lipids_AG
12         AminoAcid                             Amino Acids  AliquotID     aliquots AliquotID         https://pmb-bordeaux.fr/getdata/tsv/frim1/AminoAcid

# Read the 'samples' data file - index=2
index <- 2
M <- read.table(url(resources[ "path" ]$path[index]), 
                header= resources$dialect$header[index], 
                sep=resources$dialect$delimiter[index] )

# Display an extract 
M[1:10,]
   SampleID PlantID Truss DevStage FruitAge HarvestDate HarvestHour FruitPosition FruitDiameter FruitHeight FruitFW  FruitDW DW
1         1     A26    T5    FF.01    08DPA       40379         0.5             2            NA          NA    0.72 0.090216 NA
2         1      C2    T5    FF.01    08DPA       40379         0.5             3            NA          NA    0.56 0.070168 NA
3         1     D15    T5    FF.01    08DPA       40379         0.5             4            NA          NA    0.78 0.097734 NA
4         1     E19    T5    FF.01    08DPA       40379         0.5             4            NA          NA    0.66 0.082698 NA
5         1     E34    T5    FF.01    08DPA       40379         0.5             3            NA          NA     0.7 0.087710 NA
6         1     E38    T5    FF.01    08DPA       40379         0.5             3            NA          NA     0.7 0.087710 NA
7         1     H29    T5    FF.01    08DPA       40379         0.5             5            NA          NA    1.24 0.155372 NA
8         1     H34    T5    FF.01    08DPA       40379         0.5             4            NA          NA    0.86 0.107758 NA
9         1     H52    T5    FF.01    08DPA       40379         0.5             5            NA          NA    0.77 0.096481 NA
10        1     H61    T5    FF.01    08DPA       40379         0.5             5            NA          NA    0.56 0.070168 NA

# Get the categories for the 'samples' data subset
categories <- resources$schema$categories[[index]]
categories

# List the 'quantitative' variables for the 'samples' data subset
categories$fields[ categories$name == "quantitative" ][[1]]
          name                                           fields
1   identifier                                         SampleID
2       factor                               DevStage, FruitAge
3 quantitative FruitDiameter, FruitHeight, FruitFW, FruitDW, DW
4  qualitative                                            Truss

[1] "FruitDiameter" "FruitHeight"   "FruitFW"       "FruitDW"       "DW"           

# Get the list of data subsets that were obtained from the 'samples'
index <- 2
subsets <- resources$schema$foreignKeys[[index]]$reference[,1]
if (! is.null(subsets) ) 
   resources[ resources$name %in% subsets,
                       c("name", "title", "identifier", "obtainedFrom") ]

# Get the list of data subsets that were obtained from the 'aliquots'
index <- 3
subsets <- resources$schema$foreignKeys[[index]]$reference[,1]
if (! is.null(subsets) ) 
   resources[ resources$name %in% subsets,
                       c("name", "title", "identifier", "obtainedFrom") ]
# List of data subsets that were obtained from the 'samples'
        name                     title  identifier  obtainedFrom
3   aliquots         Aliquots features   AliquotID       samples
7      pools  Pools of remaining pools      PoolID       samples

# List of data subsets that were obtained from the 'aliquots'
                name                                   title identifier obtainedFrom
4    cellwall_metabo      Cell wall Compound quantifications  AliquotID     aliquots
5  cellwall_metaboFW Cell Wall Compound quantifications (FW)  AliquotID     aliquots
6           activome                       Activome Features  AliquotID     aliquots
10    plato_hexosesP                       Hexoses Phosphate  AliquotID     aliquots
11         lipids_AG                               Lipids AG  AliquotID     aliquots
12         AminoAcid                             Amino Acids  AliquotID     aliquots

# getMergedDataset : 
#    Returns the data subset resulting from the merging of each data subset from which
#    the previous data subset was obtained (i.e. going back up the chain of linking 
#    data subsets)
#    Method : Performs a 'Rigth Join' between the different tables (data subsets)
getMergedDataset <- function( resources, subset, verbose=FALSE )
{
   M <- NULL
   s <- subset
   while( sum(resources$name %in% s) ) {
      if (verbose) cat ("data subset ",s,"\n")
     # index for subset s
       i1 <- which(resources$name %in% s)
     # get metadata from subset s
       m1 <- resources[i1, ]
     # Read data for subset s
       M1 <- read.table(url(resources[ "path" ]$path[i1]), 
                        header= resources$dialect$header[i1], 
                        sep=resources$dialect$delimiter[i1] )
       if (! is.null(M))
         # Merge the subset s - Right Join
           M <- merge(M1, M, by = m1$identifier, all.y=TRUE)
       else
           M <- M1
     # the data subset from which it was obtained.
       s <- m1$obtainedFrom
     # End of the chain ?
       if (is.na(s)) break
   }
   unique(M)
}

# Merge the 'plants', 'samples', 'aliquots' and 'activome' data subsets 
M <- getMergedDataset(resources, 'activome', TRUE)
dim(M)

# Display column names of the merged data subset
colnames(M)
data subset  activome 
data subset  aliquots 
data subset  samples 
data subset  plants 

[1] 1272   55

 [1] "PlantID"         "Rank"            "PlantNum"        "Treatment"       "SampleID"       
 [6] "Truss"           "DevStage"        "FruitAge"        "HarvestDate"     "HarvestHour"    
[11] "FruitPosition"   "FruitDiameter"   "FruitHeight"     "FruitFW"         "FruitDW"        
[16] "DW"              "AliquotID"       "PGM"             "cFBPase"         "PyrK"           
[21] "CitS"            "PFP"             "Aconitase"       "PFK"             "FruK"           
[26] "pFBPase"         "GluK"            "NAD_ISODH"       "Enolase"         "NADP_ISODH"     
[31] "PEPC"            "Aldolase"        "Succ_CoA_ligase" "NAD_MalDH"       "AlaAT"          
[36] "Fumarase"        "AspAT"           "NADP_GluDH"      "NAD_GAPDH"       "NADP_GAPDH"     
[41] "NAD_GluDH"       "TPI"             "PGK"             "Neutral_Inv"     "Acid_Inv"       
[46] "G6PDH"           "UGPase"          "SuSy"            "NAD_ME"          "ShiDH"          
[51] "NADP_ME"         "PGI"             "StarchS"         "AGPase"          "SPS"            

Session example with Python using a datapackage

datapackage-py

from datapackage import Package
import requests
import json
import jsonschema
import pandas as pd 

# URL of the ODAM data repository
odam_url = 'https://pmb-bordeaux.fr/getdata'

# ID of the dataset
dataset = 'frim1'

# Read the datapackage.json as a dict 
url = odam_url + '/query/' + dataset+'/datapackage?links=1'
dp = Package(url)

# Read the ODAM data package schema as a dict 
url_schema = 'https://inrae.github.io/ODAM/json-schema/odam-data-package.json'
response = requests.get(url_schema)
schema = json.loads(response.text)

# if the package is a valid ODAM data package, then shows its resource names
if jsonschema.Draft7Validator(schema).is_valid(dp.descriptor) == True:
    print(dp.resource_names)
else:
    print("Error: datapackage is not a valid ODAM datapackage")
['plants', 'samples', 'aliquots', 'cellwall_metabo', 'cellwall_metaboFW', 
 'activome', 'pools', 'qMS_metabo', 'qNMR_metabo', 'plato_hexosesP', 'lipids_AG',
 'AminoAcid']

# Get the 'plants' subsets into a data.frame
id = dp.resource_names.index('plants')
df = pd.DataFrame(dp.resources[id].read(keyed=True))

# Print the data.frame
df
    PlantID Rank PlantNum    Treatment
0        A1    A        1      Control
1        A2    A        2      Control
2        A3    A        3      Control
3        A4    A        4      Control
4        A5    A        5      Control
..      ...  ...      ...          ...
547     G65    G      548  WaterStress
548     G66    G      549  WaterStress
549     G67    G      550  WaterStress
550     G68    G      551  WaterStress
551     G69    G      552  WaterStress

[552 rows x 4 columns]

# Get the headers of the first resource (id=0)
dp.resources[0]._Resource__table_options
{'scheme': None,
 'format': 'csv',
 'encoding': 'utf-8',
 'pick_fields': None,
 'skip_fields': None,
 'pick_rows': [],
 'skip_rows': [],
 'delimiter': '\t',
 'doublequote': False,
 'lineterminator': '\n',
 'skipinitialspace': True}

# Get the resource descriptor of the 'samples'
id = dp.resource_names.index('samples')
res = dp.resources[id]._Resource__current_descriptor

# List some attributes
for x in ['path', 'profile', 'name', 'title', 'identifier', 'obtainedFrom', 'joinkey']:
    print("%s: %s" % (x, res[x]))
path: https://pmb-bordeaux.fr/getdata/tsv/frim1/samples
profile: https://inrae.github.io/ODAM/json-schema/odam-data-resource.json
name: samples
title: Sample features
identifier: SampleID
obtainedFrom: plants
joinkey: PlantID

# Get the categories of the 'samples'
pd.DataFrame(res['schema']['categories'])
           name                                             fields
0    identifier                                         [SampleID]
1        factor                               [DevStage, FruitAge]
2  quantitative  [FruitDiameter, FruitHeight, FruitFW, FruitDW,...
3   qualitative                                            [Truss]

# Get the data subsets that were obtained from the 'samples'
print(pd.DataFrame(res['schema']['foreignKeys']))

the_list = []
for x in res['schema']['foreignKeys']:
    the_list += [ x['reference']['resource'] ]

the_list
     fields                                       reference
0  SampleID  {'resource': 'aliquots', 'fields': 'SampleID'}
1  SampleID     {'resource': 'pools', 'fields': 'SampleID'}

['aliquots', 'pools']

# Merge the 'samples' subset with the data subset  from which it was obtained 
id = dp.resource_names.index('samples')
res = dp.resources[id]._Resource__current_descriptor
id_from = dp.resource_names.index(res['obtainedFrom'])
df_from = pd.DataFrame(dp.resources[id_from].read(keyed=True))
df      = pd.DataFrame(dp.resources[id].read(keyed=True))

# Right join
merged_inner = pd.merge(left=df_from, right=df, how='right', 
                        left_on=res['joinkey'], right_on=res['joinkey'])
merged_inner
     PlantID Rank PlantNum    Treatment  ... FruitHeight FruitFW   FruitDW    DW
0         A1    A        1      Control  ...       10.42    0.81  0.098091  None
1         A1    A        1      Control  ...       31.77   21.43  1.470098  None
2         A1    A        1      Control  ...       46.85   64.05   4.18887  None
3         A1    A        1      Control  ...       43.35   66.64  3.338664  None
4         A2    A        2      Control  ...       44.93   66.98  3.355698  None
     ...  ...      ...          ...  ...         ...     ...       ...   ...
1282     G67    G      550  WaterStress  ...       45.72   70.23      None  None
1283     G68    G      551  WaterStress  ...       36.56    38.6      None  None
1284     G69    G      552  WaterStress  ...        39.1   45.47   2.63726  None
1285     G69    G      552  WaterStress  ...       46.59   65.73  3.227343  None
1286     G69    G      552  WaterStress  ...        40.4   58.51  4.908989  None

[1287 rows x 16 columns]