Get a set of taxon names

Read a file with taxon names. The function read.csv will read them as a data frame or table.

# Load the genus list; header = FALSE because the file has no header row
taxa <- read.csv(file = "../data-raw/plant_genera.txt", header = FALSE)
str(taxa)
#> 'data.frame':    20 obs. of  1 variable:
#>  $ V1: chr  "Abrotanella" "Acaena" "Acanthopsis" "Acer" ...

Make a character vector of taxon names


# Keep only the single column of names, dropping the data frame wrapper
taxa <- taxa[["V1"]]
str(taxa)
#>  chr [1:20] "Abrotanella" "Acaena" "Acanthopsis" "Acer" "Achillea" "Achlys" ...

Match taxon names to OpenTree Taxonomy (OTT)

We will use the tnrs_match_names function from the rotl R package.

# Resolve every genus name against the OpenTree Taxonomy
taxa_tnrs <- rotl::tnrs_match_names(names = taxa)
# List the columns of the result in alphabetical order
ls(taxa_tnrs)
#> [1] "approximate_match" "flags"             "is_synonym"       
#> [4] "number_matches"    "ott_id"            "search_string"    
#> [7] "unique_name"

It generates a data frame with the results of the TNRS match to the OpenTree Taxonomy (OTT).

We want the OTT id numbers.

# One OTT id per matched genus, in the same order as `taxa`
taxa_ott_ids <- taxa_tnrs[["ott_id"]]

Get an OpenTree synthetic subtree for each genus

To extract a synthetic subtree containing all descendants of a taxon, we will use another function from the rotl R package, tol_subtree.

# Fetch the synthetic subtree below the first genus, with tips labelled by name
subtree_taxon1 <- rotl::tol_subtree(
  ott_id = taxa_ott_ids[1],
  label_format = "name"
)
print(subtree_taxon1)
#> 
#> Phylogenetic tree with 23 tips and 1 internal nodes.
#> 
#> Tip labels:
#>   Abrotanella_rosulata, Abrotanella_inconspicua, Abrotanella_linearifolia, Abrotanella_purpurea, Abrotanella_fertilis, Abrotanella_forsteroides, ...
#> Node labels:
#>   Abrotanella
#> 
#> Unrooted; no branch lengths.

The function accepts only one OTT id at a time. To get all subtrees at once, we can use a for loop or a handy lapply call.

# One request per OTT id, tips labelled by name; any failing request aborts the whole call
subtree_taxa_all_name <- lapply(
  taxa_ott_ids,
  function(id) rotl::tol_subtree(id, label_format = "name")
)
#> Error: HTTP failure: 400
#> list(contesting_trees = list(`ot_311@tree1` = list(attachment_points = list(list(children_from_taxon = list("node34943"), parent = "node34939"), list(children_from_taxon = list("node34941"), parent = "node34940")))), mrca = "mrcaott189412ott213652")[/v3/tree_of_life/subtree] Error: node_id was not found (broken taxon).
# One request per OTT id, tips labelled by OTT id; any failing request aborts the whole call
subtree_taxa_all_id <- lapply(
  taxa_ott_ids,
  function(id) rotl::tol_subtree(id, label_format = "id")
)
#> Error: HTTP failure: 400
#> list(contesting_trees = list(`ot_311@tree1` = list(attachment_points = list(list(children_from_taxon = list("node34943"), parent = "node34939"), list(children_from_taxon = list("node34941"), parent = "node34940")))), mrca = "mrcaott189412ott213652")[/v3/tree_of_life/subtree] Error: node_id was not found (broken taxon).

It is easier to catch errors with a for loop:

# Request the subtree for every OTT id, labelling tips with OTT ids.
# try() keeps the loop going when a request fails; the corresponding slot then
# holds a "try-error" object instead of a tree.
# The list is preallocated (one slot per id) rather than grown on every pass.
subtree_taxa_all_id <- vector(mode = "list", length = length(taxa_ott_ids))
for (i in seq_along(taxa_ott_ids)) {
  subtree <- try(rotl::tol_subtree(taxa_ott_ids[[i]], label_format = "id"))
  # wrapping in list() stores the result as one element, whatever its class
  subtree_taxa_all_id[i] <- list(subtree)
}
#> Error : HTTP failure: 400
#> list(contesting_trees = list(`ot_311@tree1` = list(attachment_points = list(list(children_from_taxon = list("node34943"), parent = "node34939"), list(children_from_taxon = list("node34941"), parent = "node34940")))), mrca = "mrcaott189412ott213652")[/v3/tree_of_life/subtree] Error: node_id was not found (broken taxon).
#> 
#> Error : HTTP failure: 400
#> list(contesting_trees = list(`ot_311@tree1` = list(attachment_points = list(list(children_from_taxon = list("ott6123725"), parent = "node27222"), list(children_from_taxon = list("node27228"), parent = "node27227"), list(children_from_taxon = list("node27252"), parent = "node27251"), list(children_from_taxon = list("node27258"), parent = "node27257"), list(children_from_taxon = list("node27262"), parent = "node27261"), list(children_from_taxon = list("node27313"), parent = "node27312"), list(children_from_taxon = list(
#>     "node27318"), parent = "node27316"), list(children_from_taxon = list("node27322"), parent = "node27321"), list(children_from_taxon = list("node27326"), parent = "node27325"), list(children_from_taxon = list("node27339"), parent = "node27335"), list(children_from_taxon = list("node27338"), parent = "node27336")))), mrca = "mrcaott23895ott49697")[/v3/tree_of_life/subtree] Error: node_id was not found (broken taxon).
#> 
#> 
Progress [-----------------------------------] 0/2 (  0) ?s
Progress [==================================] 2/2 (100)  0s
                                                            
#> Warning in collapse_singles(tr, show_progress): Dropping singleton nodes with
#> labels: ott4416, ott395142
#> 
Progress [-----------------------------------] 0/1 (  0) ?s
Progress [==================================] 1/1 (100)  0s
                                                            
#> Warning in collapse_singles(tr, show_progress): Dropping singleton nodes with
#> labels: ott480490
#> Error : HTTP failure: 400
#> list(contesting_trees = list(`ot_311@tree1` = list(attachment_points = list(list(children_from_taxon = list("node20107"), parent = "node20106"), list(children_from_taxon = list("node20152"), parent = "node20151")))), mrca = "mrcaott2441ott44343")[/v3/tree_of_life/subtree] Error: node_id was not found (broken taxon).
#> 
#> 
Progress [-----------------------------------] 0/2 (  0) ?s
Progress [==================================] 2/2 (100)  0s
                                                            
#> Warning in collapse_singles(tr, show_progress): Dropping singleton nodes with
#> labels: ott662962, ott804933
#> 
Progress [-----------------------------------] 0/1 (  0) ?s
Progress [==================================] 1/1 (100)  0s
                                                            
#> Warning in collapse_singles(tr, show_progress): Dropping singleton nodes with
#> labels: ott301496
#> Error : HTTP failure: 400
#> list(contesting_trees = list(`ot_1647@tree1` = list(attachment_points = list(list(children_from_taxon = list("node10"), parent = "node9"), list(children_from_taxon = list("node14"), parent = "node13")))), mrca = "mrcaott93631ott93641")[/v3/tree_of_life/subtree] Error: node_id was not found (broken taxon).
#> 
#> 
Progress [-----------------------------------] 0/3 (  0) ?s
Progress [==================================] 3/3 (100)  0s
                                                            
#> Warning in collapse_singles(tr, show_progress): Dropping singleton nodes with
#> labels: ott441629, ott487617, ott869834
#> 
Progress [----------------------------------] 0/23 (  0) ?s
Progress [================================] 23/23 (100)  0s
                                                            
#> Warning in collapse_singles(tr, show_progress): Dropping singleton nodes with
#> labels: ott197343, ott197337, ott229165, ott737256, ott258933, ott1057538,
#> ott54669, ott650740, ott737242, ott476268, ott748378, ott116325, ott679999,
#> ott357132, ott781600, ott983159, ott144323, ott748370, ott488393, ott505054,
#> ott70786, ott737251, ott3994178
#> Error : HTTP failure: 400
#> list(contesting_trees = list(`ot_311@tree1` = list(attachment_points = list(list(children_from_taxon = list("node27549"), parent = "node27548"), list(children_from_taxon = list("node27553"), parent = "node27552"), list(children_from_taxon = list("node27555"), parent = "node27554"), list(children_from_taxon = list("node27558"), parent = "node27557")))), mrca = "mrcaott105992ott139602")[/v3/tree_of_life/subtree] Error: node_id was not found (broken taxon).
#> 
#> 
Progress [-----------------------------------] 0/3 (  0) ?s
Progress [==================================] 3/3 (100)  0s
                                                            
#> Warning in collapse_singles(tr, show_progress): Dropping singleton nodes with
#> labels: ott117369, ott639085, ott639096
#> Error : HTTP failure: 400
#> list(contesting_trees = list(`pg_2608@tree6288` = list(attachment_points = list(list(children_from_taxon = list("node1091405"), parent = "node1091399"), list(children_from_taxon = list("node1091401"), parent = "node1091400")))), mrca = "mrcaott41288ott41290")[/v3/tree_of_life/subtree] Error: node_id was not found (broken taxon).
# Number of stored results (trees and errors together)
length(subtree_taxa_all_id)
#> [1] 20

Let’s do the same with scientific names:

# Request the subtree for every OTT id, labelling tips with scientific names.
# try() keeps the loop going when a request fails; the corresponding slot then
# holds a "try-error" object instead of a tree.
# The list is preallocated (one slot per id) rather than grown on every pass.
subtree_taxa_all_name <- vector(mode = "list", length = length(taxa_ott_ids))
for (i in seq_along(taxa_ott_ids)) {
  subtree <- try(rotl::tol_subtree(taxa_ott_ids[[i]], label_format = "name"))
  # wrapping in list() stores the result as one element, whatever its class
  subtree_taxa_all_name[i] <- list(subtree)
}
#> Error : HTTP failure: 400
#> list(contesting_trees = list(`ot_311@tree1` = list(attachment_points = list(list(children_from_taxon = list("node34943"), parent = "node34939"), list(children_from_taxon = list("node34941"), parent = "node34940")))), mrca = "mrcaott189412ott213652")[/v3/tree_of_life/subtree] Error: node_id was not found (broken taxon).
#> 
#> Error : HTTP failure: 400
#> list(contesting_trees = list(`ot_311@tree1` = list(attachment_points = list(list(children_from_taxon = list("ott6123725"), parent = "node27222"), list(children_from_taxon = list("node27228"), parent = "node27227"), list(children_from_taxon = list("node27252"), parent = "node27251"), list(children_from_taxon = list("node27258"), parent = "node27257"), list(children_from_taxon = list("node27262"), parent = "node27261"), list(children_from_taxon = list("node27313"), parent = "node27312"), list(children_from_taxon = list(
#>     "node27318"), parent = "node27316"), list(children_from_taxon = list("node27322"), parent = "node27321"), list(children_from_taxon = list("node27326"), parent = "node27325"), list(children_from_taxon = list("node27339"), parent = "node27335"), list(children_from_taxon = list("node27338"), parent = "node27336")))), mrca = "mrcaott23895ott49697")[/v3/tree_of_life/subtree] Error: node_id was not found (broken taxon).
#> 
#> 
Progress [-----------------------------------] 0/2 (  0) ?s
Progress [==================================] 2/2 (100)  0s
                                                            
#> Warning in collapse_singles(tr, show_progress): Dropping singleton nodes with
#> labels: Achillea formosa, Achillea pindicola
#> 
Progress [-----------------------------------] 0/1 (  0) ?s
Progress [==================================] 1/1 (100)  0s
                                                            
#> Warning in collapse_singles(tr, show_progress): Dropping singleton nodes with
#> labels: Achnatherum pekinense
#> Error : HTTP failure: 400
#> list(contesting_trees = list(`ot_311@tree1` = list(attachment_points = list(list(children_from_taxon = list("node20107"), parent = "node20106"), list(children_from_taxon = list("node20152"), parent = "node20151")))), mrca = "mrcaott2441ott44343")[/v3/tree_of_life/subtree] Error: node_id was not found (broken taxon).
#> 
#> 
Progress [-----------------------------------] 0/2 (  0) ?s
Progress [==================================] 2/2 (100)  0s
                                                            
#> Warning in collapse_singles(tr, show_progress): Dropping singleton nodes with
#> labels: Aerides odorata, Aerides multiflora
#> 
Progress [-----------------------------------] 0/1 (  0) ?s
Progress [==================================] 1/1 (100)  0s
                                                            
#> Warning in collapse_singles(tr, show_progress): Dropping singleton nodes with
#> labels: Afzelia bella
#> Error : HTTP failure: 400
#> list(contesting_trees = list(`ot_1647@tree1` = list(attachment_points = list(list(children_from_taxon = list("node10"), parent = "node9"), list(children_from_taxon = list("node14"), parent = "node13")))), mrca = "mrcaott93631ott93641")[/v3/tree_of_life/subtree] Error: node_id was not found (broken taxon).
#> 
#> 
Progress [-----------------------------------] 0/3 (  0) ?s
Progress [==================================] 3/3 (100)  0s
                                                            
#> Warning in collapse_singles(tr, show_progress): Dropping singleton nodes with
#> labels: Alchemilla procumbens, Alchemilla fischeri, Alchemilla filicaulis
#> 
Progress [----------------------------------] 0/23 (  0) ?s
Progress [================================] 23/23 (100)  0s
                                                            
#> Warning in collapse_singles(tr, show_progress): Dropping singleton nodes with
#> labels: Allium obtusum, Allium howellii, Allium peninsulare, Allium bolanderi,
#> Allium rotundum, Allium schoenoprasum subsp. schoenoprasum, Allium vodopjanovae,
#> Allium victorialis, Allium sanbornii, Allium cyaneum, Allium ampeloprasum,
#> Allium ericetorum, Allium platyspathum, Allium materculae, Allium cepa,
#> Allium cupuliferum, Allium bourgeaui, Allium sativum, Allium taquetii, Allium
#> scorodoprasum, Allium carinatum, Allium neapolitanum, Allium cassium
#> Error : HTTP failure: 400
#> list(contesting_trees = list(`ot_311@tree1` = list(attachment_points = list(list(children_from_taxon = list("node27549"), parent = "node27548"), list(children_from_taxon = list("node27553"), parent = "node27552"), list(children_from_taxon = list("node27555"), parent = "node27554"), list(children_from_taxon = list("node27558"), parent = "node27557")))), mrca = "mrcaott105992ott139602")[/v3/tree_of_life/subtree] Error: node_id was not found (broken taxon).
#> 
#> 
Progress [-----------------------------------] 0/3 (  0) ?s
Progress [==================================] 3/3 (100)  0s
                                                            
#> Warning in collapse_singles(tr, show_progress): Dropping singleton nodes with
#> labels: Alnus acuminata, Alnus jorullensis, Alnus hirsuta
#> Error : HTTP failure: 400
#> list(contesting_trees = list(`pg_2608@tree6288` = list(attachment_points = list(list(children_from_taxon = list("node1091405"), parent = "node1091399"), list(children_from_taxon = list("node1091401"), parent = "node1091400")))), mrca = "mrcaott41288ott41290")[/v3/tree_of_life/subtree] Error: node_id was not found (broken taxon).
# Number of stored results (trees and errors together)
length(subtree_taxa_all_name)
#> [1] 20

Check which ones errored and which ones produced a subtree:

# Class of each stored result: "phylo" for a retrieved tree, "try-error" for a failed request
sapply(subtree_taxa_all_name, class)
#>  [1] "phylo"     "try-error" "phylo"     "try-error" "phylo"     "phylo"    
#>  [7] "phylo"     "phylo"     "try-error" "phylo"     "phylo"     "phylo"    
#> [13] "phylo"     "try-error" "phylo"     "phylo"     "try-error" "phylo"    
#> [19] "phylo"     "try-error"

# Flag each result as a failed request or a retrieved tree.
# inherits() yields exactly one TRUE/FALSE per element even when an object
# carries several classes, and vapply() guarantees a logical vector
# (sapply(x, class) would return a list in that situation).
is_error <- vapply(
  subtree_taxa_all_name, inherits, logical(1),
  what = "try-error", USE.NAMES = FALSE
)
is_phylo <- vapply(
  subtree_taxa_all_name, inherits, logical(1),
  what = "phylo", USE.NAMES = FALSE
)

Out of the 20 taxon OTT ids tried, 6 were unsuccessful and the remaining 14 retrieved a subtree successfully.

We will make a new list object containing subtrees only, excluding the errors:

# Keep the successfully retrieved name-labelled trees only
subtree_taxa_phylo_name <- subtree_taxa_all_name[is_phylo]
length(subtree_taxa_phylo_name)
#> [1] 14
# Label each kept tree with the genus it was requested for
subtree_taxa_phylo_name <- setNames(subtree_taxa_phylo_name, taxa[is_phylo])

# Keep the successfully retrieved id-labelled trees, using a mask computed
# from this list itself: the id-labelled and name-labelled requests ran
# independently, so a mask taken from the other run is not guaranteed to
# line up with this one.
is_phylo_id <- vapply(
  subtree_taxa_all_id, inherits, logical(1),
  what = "phylo", USE.NAMES = FALSE
)
subtree_taxa_phylo_id <- subtree_taxa_all_id[is_phylo_id]
length(subtree_taxa_phylo_id)
#> [1] 14
# Label each kept tree with the genus it was requested for
names(subtree_taxa_phylo_id) <- taxa[is_phylo_id]

Save the trees:

# Store the name-labelled trees under the object name "subtrees" and write them to disk
subtrees <- subtree_taxa_phylo_name
save(subtrees, file = "../data-raw/subtrees.RData")

Now, we can make a list containing, for each genus, all taxon names that are tip labels in its synthetic subtree:

# Pull the tip labels (scientific names) out of every name-labelled tree
subtaxa_name <- lapply(
  subtree_taxa_phylo_name,
  function(tree) tree[["tip.label"]]
)
length(subtaxa_name)
#> [1] 14
# One element per genus, named after that genus
names(subtaxa_name) <- names(subtree_taxa_phylo_name)

# Pull the tip labels (OTT ids) out of every id-labelled tree
subtaxa_id <- lapply(
  subtree_taxa_phylo_id,
  function(tree) tree[["tip.label"]]
)
length(subtaxa_id)
#> [1] 14
# One element per genus, named after that genus
names(subtaxa_id) <- names(subtree_taxa_phylo_id)
subtaxa_id[1]
#> $Abrotanella
#>  [1] "ott148523"  "ott365007"  "ott365009"  "ott365025"  "ott425771" 
#>  [6] "ott425773"  "ott425777"  "ott425779"  "ott425781"  "ott425786" 
#> [11] "ott786604"  "ott900584"  "ott900587"  "ott932307"  "ott1005999"
#> [16] "ott1006002" "ott1006005" "ott1006011" "ott1006014" "ott7607732"
#> [21] "ott7607733" "ott7607734" "ott7607735"

Get GBIF taxon names of tree tip labels (necessary??)

Get GBIF data for tips in tree

We will use the function occ_search from the rgbif R package. Let’s try it first with the scientific names in subtaxa_name:


# First tip label of the first genus, with underscores turned into spaces
name <- gsub(pattern = "_", replacement = " ", x = subtaxa_name[[1]][1])
# Query GBIF for up to 100 occurrences, requesting only the name and coordinate fields
name_gbif <- rgbif::occ_search(
  scientificName = name,
  fields = c("name", "decimalLatitude", "decimalLongitude"),
  limit = 100
)
ls(name_gbif)
#> [1] "data"      "facets"    "hierarchy" "media"     "meta"
ls(name_gbif$data)
#> [1] "decimalLatitude"  "decimalLongitude"
print(name_gbif$data)
#> # A tibble: 8 x 2
#>   decimalLongitude decimalLatitude
#>              <dbl>           <dbl>
#> 1             169.           -52.6
#> 2             169.           -52.5
#> 3             169.           -52.5
#> 4             169.           -52.5
#> 5             169.           -52.6
#> 6             169.           -52.6
#> 7             169            -52.6
#> 8             169.           -52.5

Next, we will do it in a loop:

# For each genus, query GBIF for every tip label of its subtree, then save the
# per-tip results as an .RData file and the merged occurrence records as a .csv.
for (i in seq_along(subtaxa_name)) {
  print(i)
  genus <- names(subtaxa_name)[i]
  tip_labels <- subtaxa_name[[i]]
  # one slot per tip label, preallocated instead of grown inside the loop
  subtaxa_gbif <- vector(mode = "list", length = length(tip_labels))
  # for each tip label "name" in phylo object i
  for (j in seq_along(tip_labels)) {
    name <- tip_labels[[j]]
    print(name)
    # replace underscores by spaces in name
    name <- gsub("_", " ", name)
    # get gbif records for "name"; try() keeps the loop alive if the query fails
    name_gbif <- try(rgbif::occ_search(
      scientificName = name,
      fields = c(
        "order",
        "family",
        "genus",
        "acceptedScientificName",
        "decimalLatitude",
        "decimalLongitude",
        "elevation",
        "familyKey",
        "genusKey",
        "taxonKey"
      ),
      limit = 5
    ))
    # if try was successful and produced a gbif object
    # and data is not empty (i.e., there are gbif records for "name").
    # && must be used here: it short-circuits, whereas & would also evaluate
    # name_gbif$data on a "try-error" object (an atomic vector), which raises
    # an error outside try() and aborts the whole loop.
    if (inherits(name_gbif, "gbif") && !is.null(name_gbif$data)) {
      # sometimes, there are records but they are not retrieved,
      # so only keep data with at least one row
      if (nrow(name_gbif$data) > 0) {
        # create a data frame object containing "name" and gbif data for "name"
        name_gbif <- cbind(name, name_gbif$data)
      }
    }
    # store the result for "name" (data frame, gbif object or try-error)
    subtaxa_gbif[j] <- list(name_gbif)
  }
  # use subtaxon names to name the list of results for all subtaxa in taxa
  names(subtaxa_gbif) <- tip_labels
  # save() stores objects under their variable names, so bind the results to
  # a variable named after the genus before saving
  assign(genus, subtaxa_gbif)
  save(list = genus, file = paste0("../data-raw/", genus, ".RData"))
  # which ones are data frames
  is_data <- vapply(subtaxa_gbif, is.data.frame, logical(1))
  # merge all data into a single table
  tab <- dplyr::bind_rows(subtaxa_gbif[is_data])
  write.csv(tab, file = paste0("../data-raw/", genus, ".csv"))
}