rm(list = ls())
knitr::opts_chunk$set(
    message = FALSE,
    warning = FALSE,
    cache = FALSE
)

1 Introduction

This script performs candidate detection using Gradient Forest (GF). Gradient Forest is a machine learning method introduced by Ellis, Smith, and Pitcher (2012). Fitzpatrick and Keller (2015) describe how GF can be used to:
- Analyse and map spatial variation in allele frequencies as a function of environmental gradients (outlier detection via GEA).
- Project patterns of genomic variation under different climatic scenarios (genomic offset).

More specifically, GF uses Random Forest to fit an ensemble of regression trees modelling changes in allele frequencies across sites and derives monotonic, nonlinear functions of environmental predictors. The empirical nonlinear turnover functions are constructed by distributing the R² values of all SNPs across predictor gradients in proportion to their importance and along each gradient according to the density of raw split importance values. Split importance values for all SNP models are then aggregated into a genome-wide turnover function for each variable, weighted by predictor importance and model fit for each SNP (Fitzpatrick et al., 2021).

GF is considered both univariate and multivariate, as it models one response variable at a time against predictors, while summarising results into cumulative turnover functions. This allows the contribution of multiple climatic variables to be evaluated simultaneously for a single SNP. GF can also be applied across multiple response variables to summarise the effects of explanatory variables on multiple responses (Ellis et al., 2012).

In this script, we apply the GF algorithm for outlier detection following Fitzpatrick et al. (2021) and Archambeau et al. (2025). GF is run on genomic datasets both corrected and uncorrected for population structure. The analysis is performed for each SNP, producing a turnover function per SNP using all predictors.

To ensure robustness, the model is run five times for each dataset. Candidate SNPs are defined as the 1% or 5%, depending on the species, of the SNPs consistently detected across all five runs, as results from individual runs may vary slightly.

1.1 Formatting the genomic and the climatic data

To perform the Gradient Forest analysis following Fitzpatrick et al. (2021), the data must be formatted with populations in rows and SNPs in columns. It is crucial that the order of populations is consistent between the genomic and climate data files.

1.1.1 Climatic data

#climatic data
Past_climatic <- read.csv("Data/Species/Taxus_baccata/Climatic_data/new_selection/Past_new_6_Climatic_data_scale_df.csv",sep=";",dec=",")
vars <- colnames(Past_climatic[,-c(1:2)])

1.1.2 Genomic data

1.1.2.1 GF raw

#genomic data
load("Data/Species/Taxus_baccata/genetic_new/data_allelic_frequencies_29pop_adapcon_gentree_475_8616.Rdata")

genomic_matrix <- data_allelic_frequencies_29pop_adapcon_gentree_475_8616

1.1.2.2 GF corrected

Gradient Forest (GF) does not have an option to directly correct for population structure. To account for population structure in GF, we must use a genomic dataset that has already been corrected. The scaled population structure matrix from BayPass cannot be used, as GF requires a data frame rather than a matrix. Instead, we can use the genotypic dataset corrected for population structure via latent factors in LFMM.

We load the corrected LFMM genomic dataframe. The genomic data is at the individual level for 475 individuals, 8,616 imputed SNPs and with MAC correction.

load("Data/Species/Taxus_baccata/genetic_new/GEA/Genomic_matrix_corrected_from_LFMM_T_adapcon_gentree.Rdata")
corrected_geno_data <- Genomic_matrix_corrected_from_LFMM_T_adapcon_gentree

corrected_geno_data_ind <- rownames_to_column(corrected_geno_data, "VCF_ID")

We need to transform the genomic data at the individual level to the population level

#add the population info
#meta_data
meta_data_pop <- read.csv("Data/Species/Taxus_baccata/Populations/taxus_sample_29pop.csv",h=T,sep=";",dec=",")

meta_data_vcf <- read.csv("Data/Species/Taxus_baccata/Samples/samples_taxus_baccata_adapcon_gentree.csv",h=T,sep=";",dec=",")

geno_pop_info <- merge(meta_data_vcf,corrected_geno_data_ind, "VCF_ID" )

#formatting the genomic data
data_frequencies_num <- geno_pop_info[,-c(1:3)] %>%
  apply(2,as.numeric) /2

#dataset with all information and genomic data in the right format
data_frequencies_num_tot <- data.frame(geno_pop_info[,c(1:3)],data_frequencies_num)

#calculation of allelic frequencies
allelic_frequencies <-data_frequencies_num_tot %>% dplyr::select(-c("VCF_ID","Country","Population")) %>%
  group_by(data_frequencies_num_tot$Population) %>%
  summarise_at(vars(everything()),funs(mean),na.rm=T) %>%
    ungroup() %>%
as.data.frame()

#Pop with row.names
allelic_frequencies_f <- allelic_frequencies %>% column_to_rownames('data_frequencies_num_tot$Population')

2 Run the models

In this step, we will run the GF algorithm five times because, due to the stochastic nature of the machine learning process, the set of candidate SNPs may vary between runs. To ensure robustness, we will only consider SNPs that are consistently identified as candidates across all five runs. This approach suggest that the selected candidates reflect true signals rather than random variation.

Run_GF_and_select_outliers <- function(genomic_matrix, climatic_data, ntree, cores,nbr_loci_distrib,vars,x,path){
  #GF function
  runGF <- function(alFreq,  envTab, vars, ntree, 
                  cores, indLoci){
  require(data.table)
  require(gradientForest)
  require(parallel)
  require(foreach)
  require(doParallel)

  if(identical(envTab$Population,rownames(alFreq))==F){
    stop("Populations are not in the same order in the genomic and climatic tables.")
  }
  
  # create custom object to hold output 
  gfOutObj <- setClass("gfOutObj", slots = c(alFreq="data.frame", imp="list"))

  # run in parallel if fitting SNPs individually
  if(indLoci==T & !is.na(cores)){
    # fit gf model to each SNP individually
    cl <- makeCluster(cores, setup_strategy = "sequential")
    registerDoParallel(cl)

    gfMods <- foreach(k=1:ncol(alFreq), .verbose=F, .packages=c("gradientForest"), .errorhandling = c("pass")) %dopar%{
      locus <- data.frame(alFreq[,k])
      names(locus) <- colnames(alFreq)[k]
      gf.mod <- gradientForest(data.frame(envTab[, vars], locus), 
                               predictor.vars=vars, response.vars=colnames(alFreq)[k], 
                               corr.threshold=0.5, ntree=ntree, trace=T)
    if(!is.null(gf.mod)){
        imps <- importance(gf.mod)
        imps <- imps[order(names(imps))]
        data.frame(imps, SNP = colnames(alFreq)[k])
      }
    }
    
    stopCluster(cl)
    return(gfOutObj(alFreq = data.frame(alFreq), imp = gfMods))
  } else {
    # run all SNPs at once if not fitting individually
    gf.mod <- gradientForest(data.frame(envTab[, vars], alFreq), 
                             predictor.vars=vars, response.vars=colnames(alFreq), 
                             corr.threshold=0.5, ntree=ntree, trace=T)
    
    return(gfOutObj(alFreq = data.frame(alFreq), imp = gfMods))
  }
}
  #run GF
  GF_test <- runGF(genomic_matrix,climatic_data,vars,ntree=ntree, 
                  cores=cores, indLoci=T)
  
  #extract the loci correlated to the climate
  Extract_correlation_loci_climate<- GF_test@imp
loci_correlated_climate <- Filter(function(x) !inherits(x, "error"),  Extract_correlation_loci_climate)

#extracting R^2 values
gfR2tab <- function(gfMods.list){
  gfMods.list <- gfMods.list
  i=1
  while(is.null(gfMods.list[[i]])){i=i+1}
  tab <- do.call(rbind, gfMods.list)
  vrNm <- rep(row.names(tab)[1:nrow(gfMods.list[[i]])], 
              nrow(tab)/nrow(gfMods.list[[i]]))
  tab <- data.frame(variable=vrNm, tab)
  tab <- reshape2::dcast(tab, SNP~variable, value.var="imps")
  totalR2 <- rowSums(tab[,-1])
  return(data.frame(tab, totalR2=totalR2))}

dataset_R2_loci_climate <- gfR2tab(loci_correlated_climate)

  #select randomly the SNPs, we selected 20% of all SNPs to create the null distribution
name_neutral_snps <- sample(dataset_R2_loci_climate$SNP,nbr_loci_distrib,replace = F)

neutral_snps_set <- dataset_R2_loci_climate %>% 
    filter(SNP %in% name_neutral_snps)

#hist neutral 
 neutral_R2_distrib<-hist(neutral_snps_set$totalR2)
 
 #name
neutral_R2_distrib<-hist(neutral_snps_set$totalR2)

#save the histogram
 png(filename=paste0(path,x,"neutral_R2_distrib",".png"))
 
# a histogram we want to save
hist(neutral_snps_set$totalR2)
# call this function to save the file 
dev.off()
 
#empirical pvalues
empirical_pvalues <- sapply(1:nrow(dataset_R2_loci_climate), function(x, dataset_R2_loci_climate, name_neutral_snps, neutral_snps_set){
    snps2Rank <- rbind(dataset_R2_loci_climate[x,], neutral_snps_set) %>% 
      distinct() %>% 
      dplyr::select(-SNP)
    P <- apply(snps2Rank, 2, function(y){
      rankSNP <- frank(y)
      return(1-rankSNP[1]/length(rankSNP))
    })}, dataset_R2_loci_climate, neutral_snps, neutral_snps_set)
  

  # format output as data.frame
  empirical_pvalues_df <- t(empirical_pvalues)
  colnames(empirical_pvalues_df) <- paste("pval_", colnames(empirical_pvalues_df), sep="")
  empirical_pvalues_df <- data.frame(dataset_R2_loci_climate, empirical_pvalues_df)

    #visualise the pvalues distribution
  pvalues_distribution <- hist(empirical_pvalues_df$pval_totalR2)
  
  #save the histogram
png(filename=paste0(path,"pvalues_distribution",x,".png"))

# a histogram we want to save
hist(empirical_pvalues_df$pval_totalR2)

# call this function to save the file 
dev.off()
  # Return the pvalues 
  return(empirical_pvalues_df)
}

We can load the run to avoid rerunning the previous step.

3 Identification of candidate loci

3.1 Thresholds

Now we want to identify candidate SNPs. To do that, we applied 2 types of thresholds depending on the species:
- rank based 5%
- rank based 1%

for(i in 1: length(GF_data)){
  genomic_data <- get(GF_data[i])
  model_name <- GF_models[i]

for(x in 1:5){
  data_name <- paste0("Run",x,"_",model_name)
  
  data <- get(data_name)
  
   #top 1%
 outliers_top1perc_GF <- data[,c(1,15)] %>% 
  arrange(pval_totalR2) %>%
slice(1:(0.01*8616)) %>%
  as.data.frame()
 
 assign(paste0("Run",x,"_",model_name,"_top1SNP"),outliers_top1perc_GF)
 
   #top 5%
 outliers_top5perc_GF <- data[,c(1,15)] %>% 
  arrange(pval_totalR2) %>%
slice(1:(0.05*nrow(data))) %>%
  as.data.frame()
 
 assign(paste0("Run",x,"_",model_name,"_top5SNP"),outliers_top5perc_GF)
  }
}

3.2 Graphical visualisation

An important step is to compare the results across different runs for each threshold. We created Venn diagrams to visualise the number of candidate SNPs shared between runs.

for(i in 1: length(GF_data)){
  genomic_data <- get(GF_data[i])
  model_name <- GF_models[i]
  
#top 1%
grid.newpage()
ven <- venn.diagram(
  x = list(
    get(paste0("Run1_", model_name, "_top1SNP"))[, 1], 
    get(paste0("Run2_", model_name, "_top1SNP"))[, 1], 
    get(paste0("Run3_", model_name, "_top1SNP"))[, 1], 
    get(paste0("Run4_", model_name, "_top1SNP"))[, 1], 
    get(paste0("Run5_", model_name, "_top1SNP"))[, 1]
  ),
  category.names = c("RUN1", "RUN2", "RUN3", "RUN4", "RUN5"),
  filename = NULL,
  fill = c("#E69F00", "#56B4E9", "#009E73", "#D55E00","#CC79A7"),
  alpha = 0.30,
  print.mode = c("raw","percent"),  
  sigdigs = 0,  
  margin = 0.1,
  cat.fontface = "italic",
  cat.fontsize = 14, 
  main = paste0("Venn Diagram: Top 1% Candidate SNPs Across ",model_name," Runs"),
  main.fontface = "bold",
  main.cex = 1.5,
  cex = 1.5 
)
grid.draw(ven)

#top 5% 
grid.newpage()
ven <- venn.diagram(
  x = list(
    get(paste0("Run1_", model_name, "_top5SNP"))[, 1], 
    get(paste0("Run2_", model_name, "_top5SNP"))[, 1], 
    get(paste0("Run3_", model_name, "_top5SNP"))[, 1], 
    get(paste0("Run4_", model_name, "_top5SNP"))[, 1], 
    get(paste0("Run5_", model_name, "_top5SNP"))[, 1]
  ),
  category.names = c("RUN1", "RUN2", "RUN3", "RUN4", "RUN5"),
  filename = NULL,
  fill = c("#E69F00", "#56B4E9", "#009E73", "#D55E00","#CC79A7"),
  alpha = 0.30,
  print.mode = c("raw","percent"),  
  sigdigs = 0,  
  margin = 0.1,
  cat.fontface = "italic",
  cat.fontsize = 14, 
  main = paste0("Venn Diagram: Top 5% Candidate SNPs Across ",model_name," Runs"),
  main.fontface = "bold",
  main.cex = 1.5,
  cex = 1.5
)
# Draw the Venn diagram
grid.draw(ven)
}

4 Save candidates

As candidates, we selected and saved the top candidates:

4.1 1% of SNPs for downstream analysis

for(i in 1: length(GF_data)){

  genomic_data <- get(GF_data[i])
  model_name <- GF_models[i]

#Select only the candidates identified in all 5 runs
outliers_rank_based_top1perc <- Reduce(intersect, list(get(paste0("Run1_",model_name,"_top1SNP"))[,1], get(paste0("Run2_",model_name,"_top1SNP"))[,1], get(paste0("Run3_",model_name,"_top1SNP"))[,1], get(paste0("Run4_",model_name,"_top1SNP"))[,1], get(paste0("Run5_",model_name,"_top1SNP"))[,1]))

#number of outliers
length(outliers_rank_based_top1perc)

assign(paste0("outliers_rank_based_top1perc_",model_name),outliers_rank_based_top1perc)
#save
   save(list = paste0("outliers_rank_based_top1perc_",model_name), file = paste0("Results/species/taxus/GEA_new_var/outliers/outliers_rank_based_top1perc_", model_name, ".Rdata"))
}

4.2 5% of SNPs for downstream analysis as a relax threshold candidates

for(i in 1: length(GF_data)){

  genomic_data <- get(GF_data[i])
  model_name <- GF_models[i]
  
#Select only the outliers identified in all 5 runs
outliers_rank_based_top5perc <- Reduce(intersect, list(get(paste0("Run1_",model_name,"_top5SNP"))[,1], get(paste0("Run2_",model_name,"_top5SNP"))[,1], get(paste0("Run3_",model_name,"_top5SNP"))[,1], get(paste0("Run4_",model_name,"_top5SNP"))[,1], get(paste0("Run5_",model_name,"_top5SNP"))[,1]))

#number of outliers
length(outliers_rank_based_top5perc)

assign(paste0("outliers_rank_based_top5perc_",model_name),outliers_rank_based_top5perc)

#save
   save(list = paste0("outliers_rank_based_top5perc_",model_name), file = paste0("Results/species/taxus/GEA_new_var/outliers/outliers_rank_based_top5perc_", model_name, ".Rdata"))
}

Session info

packages <- c(
  "gradientForest", "dplyr", "tidyr", "writexl",
  "VennDiagram", "radiant.data", "textshape",
  "doParallel","foreach","data.table"
)
info <- sessionInfo()
packages_used <- info$otherPkgs[packages]
data.frame(
  Package = names(packages_used),
  Version = sapply(packages_used, function(x) x$Version)
)

##                       Package Version
## gradientForest gradientForest  0.1-37
## dplyr                   dplyr   1.1.4
## tidyr                   tidyr   1.3.1
## writexl               writexl   1.5.0
## VennDiagram       VennDiagram   1.7.3
## radiant.data     radiant.data   1.6.3
## textshape           textshape   1.7.5
## doParallel         doParallel  1.0.17
## foreach               foreach   1.5.2
## data.table         data.table  1.15.0

Gradient_forest_candidate_detection

Thomas Francisco

2024-04-30