Google search
ODAM: Find a Dataverse dataset via Google Search
The purpose of this R script is to test whether a dataset is "findable" through a search engine such as Google. For this test, we use our example dataset 'frim1', so the search keywords are 'frim1' combined with 'dataset'. See the code and its results.
- Step 1: Start by searching for the keywords in Google Search
- Step 2: If step 1 returned links, select those that point to a Dataverse dataset
- Step 3: If step 2 found any, retrieve the dataset metadata
See below the code and its results.
# Packages required by this script.
packages <- c('RCurl', 'XML', 'dataverse' )
# Install only the packages that are not installed yet. The set of missing
# packages is computed once, and they are downloaded from CRAN over HTTPS
# rather than plain HTTP.
missing_packages <- setdiff(packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages, repos = "https://cran.rstudio.com")
}
library(RCurl)
library(XML)
library(dataverse)
# Search terms (several terms in the vector are joined with "+")
searchTerms <- "frim1 dataset"
# Interface language, sent to Google as the `hl` form parameter
language <- "en"
# Collapse search terms.
entry <- paste(searchTerms, collapse = "+")
# Do the search
siteHTML <- RCurl::getForm("http://www.google.com/search",
                           hl = language, lr = "", q = entry,
                           btnG = "Search")
# Parse the result page; the empty error handler discards parser diagnostics
html <- XML::htmlTreeParse(siteHTML, useInternalNodes = TRUE,
                           error = function(...) {})
# Extract nodes with the links
nodes <- XML::getNodeSet(html, "//div[@class='kCrYT']//a")
# Get all the links. vapply() always returns a character vector (also when
# no node matched), and an anchor without an href yields NA instead of an
# error; such anchors are dropped.
links <- vapply(
  nodes,
  function(node) as.character(XML::xmlGetAttr(node, "href", NA_character_)),
  character(1)
)
links <- links[!is.na(links)]
# Discard links that point back to Google's own search pages
links <- links[grep("google.com/search", links, invert = TRUE)]
# Strip the "/url?q=" redirect prefix and the trailing "&sa..." parameters
links <- gsub('/url.q=', '', gsub('&sa.+$', '', links))
# harmonizes the writing of URLs
links <- gsub("%3D", "=", gsub("%3F", "?", links))
# Titles of the search results
labels <- vapply(
  XML::getNodeSet(html, "//div[@class='BNeawe vvjwJb AP7Wnd']"),
  function(node) XML::xmlValue(node),
  character(1)
)
# data.frame() stops when its columns differ in length. Keep every link and
# pad (or truncate) the titles instead, with an explicit warning because the
# titles may then no longer line up with their links.
if (length(labels) != length(links)) {
  warning("Found ", length(labels), " result titles for ", length(links),
          " links; titles may be misaligned.", call. = FALSE)
  length(labels) <- length(links)
}
# Compile into a data.frame
df <- data.frame(
  label = labels,
  links = links,
  stringsAsFactors = FALSE
)
print(df, right = FALSE)
label
1 FRIM - Fruit Integrative Modelling - Experimental ... - Data Inra
2 [PDF] pe-pa – tomato frim1 - CBIB
3 Home · inrae/ODAM Wiki · GitHub
4 [PDF] Data Capture ? Daniel Jacob INRA UMR 1332 BFP
5 [PDF] Modeling the growth of tomato fruits based on enzyme ... - HAL-CNRS
6 FRIM - Fruit Integrative Modelling | Zenodo
7 Rodam package - CRAN
8 Odam: Open Data, Access and Mining - SlideShare
9 Make your data great now - SlideShare
links
1 https://data.inra.fr/dataset.xhtml%3FpersistentId%3Ddoi:10.15454/95JUTK
2 http://services.cbib.u-bordeaux.fr/MERYB/DATA/protocols_upload/1_5_Analytical_177.pdf
3 https://github.com/INRA/ODAM/wiki
4 https://hal.archives-ouvertes.fr/hal-02070883/file/Howto_FAIR_DataLifecycle_Aprill2019.pdf
5 https://hal-cnrs.archives-ouvertes.fr/hal-02611223/document
6 https://zenodo.org/record/154041
7 https://cran.r-project.org/web/packages/Rodam/vignettes/Rodam.html
8 https://www.slideshare.net/danieljacob771282/odam-open-data-access-and-mining
9 https://www.slideshare.net/danieljacob771282/make-your-data-great-now
# Look for a search result that points to a Dataverse dataset page
ret <- which(grepl("dataset.xhtml.persistentId=doi", links))
if (length(ret) > 0) {
  # Only the first matching link is used
  hit <- links[ret[1]]
  # Persistent identifier: whatever follows the last "doi:" in the link
  doi <- paste0('doi:', gsub("^http.+doi:", "", hit))
  # Server address: the part of the link that precedes "/dataset"
  dataverse <- gsub("/dataset.+", "", hit)
  # Point the dataverse package at that server, then fetch the dataset
  # together with its file listing
  Sys.setenv(DATAVERSE_SERVER = dataverse)
  dataset <- dataverse::get_dataset(doi)
  # Show the dataset information
  dataset
}
Dataset (261843):
Version: 4.2, RELEASED
Release Date: 2020-06-08T14:57:17Z
License: NONE
17 Files:
label version id contentType
1 datapackage.json 1 99623 application/json
2 frim1.zip 3 96878 application/zip
# Fetch the dataset metadata and show a subset of its fields
if (length(ret) > 0) {
  metadata <- dataverse::dataset_metadata(doi)
  fields <- metadata$fields
  # Keep fields that are not flagged `multiple`, plus every
  # controlled-vocabulary field
  keep <- !fields$multiple | fields$typeClass == "controlledVocabulary"
  fields[keep, c("typeName", "value")]
}
typeName value
1 title FRIM - Fruit Integrative Modelling
2 alternativeURL https://pmb-bordeaux.fr/dataexplorer/?ds=frim1
6 kindOfData Dataset
7 dataOrigin experimental data
8 subject Computer science, Information management, Omics, Plant Health and Pathology
9 lifeCycleStep Study design, Data collection
12 language English
14 productionDate 2010
15 productionPlace INVENIO, Ste Livrade, France
17 depositor Jacob, Daniel
18 dateOfDeposit 2018-10-08