rm(list = ls())
knitr::opts_chunk$set(
message = FALSE,
warning = FALSE,
cache = FALSE
)
This script processes climatic data from the raw output of ClimateDT (or any program providing raster TIFF data) into a dataframe containing standardised values for selected variables under past, present, and future climatic conditions. High-resolution climatic data (30 arc-seconds) were obtained from the Climate Downscaling Tool (ClimateDT; Marchi et al., 2024), a dynamic web tool that provides elevation-adjusted historical and future climatic data. The period 1901–1950 was used as the reference for local adaptation of populations (except for F. excelsior, see Main Text), and the period 1991–2020 was used to compute GDI. Future climatic data were also extracted for the 2041–2070 period under the socio-economic pathway SSP3-7.0; however, these were not used in the present study but in Guadaño-Peyrot et al. (2026) to compute genomic offset.
To identify the final set of climatic predictors for downstream analyses, we first focused on a pre-selected set of variables relevant to each species’ ecology. For past climatic data, forward selection was performed using the ordiR2step function from the vegan R package (Oksanen et al., 2001) to identify variables most closely related to genomic variation across populations. Next, variables with high collinearity (correlation coefficient > 0.75) were removed, prioritising retention of the most informative variables identified during forward selection. Finally, variance inflation factors (VIFs) were calculated, and additional variables were removed to ensure all VIF values were below 10.
After these selection steps, the retained climatic variables were standardised, and the processed datasets for past, present, and future scenarios were exported in multiple formats for downstream analyses.
meta_data_pop <- read.csv("Data/Species/Taxus_baccata/Populations/taxus_sample_29pop.csv",h=T,sep=";",dec=",")
meta_data_vcf <- read.csv("Data/Species/Taxus_baccata/samples/samples_taxus_baccata_adapcon_gentree.csv",h=T,sep=";",dec=",")
The first step involved reviewing the literature on the target species to define a set of climatic variables that could potentially drive adaptive genetic variation across its range. The aim was to retain only biologically informative variables, reducing the number of predictors tested and allowing refinement of hypotheses prior to analysis. The goal was not to limit the dataset to only 2–3 variables, nor to retain all available variables, but to pre-select a moderate set—typically 10–15 variables depending on the species and the depth of relevant literature.
For T. baccata, the initial pre-selection included the following variables: Bio 1, 2, 3, 4, 5, 6, 7, 9, 10, 12, 13, 14, 15, 16, 17, 18, MGSP, AHM, SHM.
Load past climatic data
#Create an R object where all the climatic rasters are.
ras.bio <- stack(list.files("Data/Climatic/1901-1950_raster/", pattern = ".tif$", full.names = T))
#Names of the raster (because they are not in the classic order Bio 1, 2 etc they are order by number so bio 1, 10, 11 etc)
names(ras.bio) <- unlist(strsplit(unlist(lapply(strsplit(list.files("Data/Climatic/1901-1950_raster/", pattern = ".tif$", full.names = T), split = "Data/Climatic/1901-1950_raster/"), function(x) x[2])), split = ".tif"))
#We extracted the climatic values for each populations from the raster based on their coordinates
coords<- data.frame(apply(meta_data_pop[,c(5:4)], 2, as.numeric))#longitude then latitude
past_climatic_data_raster <- data.frame(meta_data_pop$Population,raster::extract(ras.bio, coords))#Use raster::function_name to avoid confusion with tidyr functions that have the same function
colnames(past_climatic_data_raster)<- c("Population",names(ras.bio))
#Add country info
past_climatic_data_f <- merge(meta_data_pop[,c(1,2)],past_climatic_data_raster, "Population")
Do the violin plot in a loop for each climatic variables
#Input data for the loop
data <- past_climatic_data_f
regions <-past_climatic_data_f$Country
bioclim_vars <- c("bio1", "bio2", "bio3", "bio4","bio5","bio6","bio7","bio9", "bio10","bio12","bio13","bio14","bio15","bio16", "bio17", "bio18","AHM", "MGSP", "SHM", "GDD5")
unit_var <- c("°c","°c","index","°c","°c","°c","°c","°c","°c","mm","mm","mm","index","mm","mm","mm","°c/mm","mm","°c/mm","°c*days" )
#The loop
for (x in 1:length(bioclim_vars)) {
var <- bioclim_vars[x]
unit <- unit_var[x]
data_subset <- data.frame(data[, paste0(var)])
colnames(data_subset) <- c("var1")
plot <- ggplot(data_subset, aes(x =var,y= var1)) +
geom_violin(trim = FALSE) +
geom_point(aes(color = regions), shape = 16, position = position_jitter(seed = 1, width = 0.2)) +
labs(colour="Countries",x="variable",y=paste0("values ","(",unit,")"),title=paste0("Violin Plot of ",var," differences across populations"))+
theme(plot.title = element_text(hjust = 0.5))
#Save
png(paste0("Results/species/taxus/climate/past_climatic_data/violin_plot/violin_plot_",var,".png"));print(plot)
;dev.off()
}
Example of violin plot:
The selection of climatic variables was considered through three more steps:
To identify which climatic variables explain the greatest proportion of genetic variation, we conducted a redundancy analysis (RDA; see Redundancy_analyses_candidate_detection_example script for details). Using the ordiR2step function from the vegan R package, we started with a null model containing no explanatory variables and progressively built a full model including all potential explanatory variables. Variables were retained based on their contribution to increasing model R², using a significance threshold of p < 0.05 with 1,000 permutations. The most important variables explaining genomic variation were then identified, and twenty independent runs were performed to assess the consistency of variable selection.
#Genomic data
load("Data/Species/Taxus_baccata/genetic_new/data_allelic_frequencies_29pop_adapcon_gentree_475_8616.Rdata")
genomic_matrix <- data_allelic_frequencies_29pop_adapcon_gentree_475_8616
Prior to downstream analyses, climatic variables were scaled to standardise their range and variance.
past_climatic_data_scale <- past_climatic_data_f[,-c(1,2)]%>%
apply(2,as.numeric) %>% as.data.frame %>%
mutate(across(where(is.numeric), scale))
We can perform the RDA selection:
#Null model
RDA_null <- rda(genomic_matrix~1,past_climatic_data_scale )
#Model with all the variables
RDA_full <- rda(
formula=genomic_matrix~bio1+bio2+bio3+bio4+bio5+bio6+bio7+bio9+bio10+bio12+bio13+bio14+bio15+bio16+bio17+bio18+AHM+MGSP+SHM+GDD5,data = past_climatic_data_scale,scale=F)
#Stepwise by selecting only variables that increase the adjusted R^2 of the model and that have a pvalue lower than 0.05
Variable_selection_pv0.05 <- ordiR2step(RDA_null, RDA_full, Pin = 0.05, R2permutations = 1000, R2scope = T)
#Variable_selection_pv0.05$anova
names(Variable_selection_pv0.05$CCA$envcentre)
#we want, following Archambeau et al 2025, to investigate if the permutation stepwise identified the same variables of interest across runs:
#we run 20 independent stepwise models.
nbmodels <- 20
# Stepwise selection with ordiR2step function
#rep20_variable_selec_pv0.05 <- lapply(1:nbmodels, function(x) {
#mod <- ordiR2step(RDA_null, RDA_full, Pin = 0.05, R2permutations = 1000, R2scope = T)
#return(names(mod$CCA$envcentre))}) %>%
#setNames(paste0("model",1:nbmodels)) %>%
#ldply(function(x) data.frame(variables=x),.id="models") %>%
#dplyr::summarise(count(variables)) %>%
#setNames(c("variable","count"))
To reduce multicollinearity, only variables with low pairwise correlation were retained. Specifically, variables with an absolute correlation coefficient < 0.75 were kept from the pre-selected set of climatic variables.
#Function to do it
#Matrix of correlation
correlation_function <-function(data,threshold){
data_correlation <- subset(data,select= -c(Population,Country))
rownames(data_correlation) <- data$Populations
correlation <- cor(data_correlation)
correlation[abs(correlation) <= threshold] <- 0
corr_plot <- corrplot(correlation, method = "number", addrect = 2, col = c("red", "white", "red"), type = "lower", tl.col = "black", tl.cex = 0.6, number.cex = 0.6)
#Save
png("Results/species/taxus/climate/selection_var/corrplot_multicollinearity.png");corr_plot <- corrplot(correlation, method = "number", addrect = 2, col = c("red", "white", "red"), type = "lower", tl.col = "black", tl.cex = 0.6, number.cex = 0.6);dev.off()
#Plot the corrplot
corr_plot
}
#Right order
past_climatic_data_order <- past_climatic_data_f[,c(1,2,4,13:19,5:12,3,20,21,22)]
#Correlation past/present
data_present <- past_climatic_data_order
threshold <- 0.75
correlation_past <-correlation_function(data_present,threshold)
To further reduce multicollinearity, we calculated the variance inflation factor (VIF) for the retained variables, ensuring that no highly correlated variables were included in the final dataset.
#Subset previous step
RDA_subset_step4 <- rda(formula=genomic_matrix~bio1+bio2+bio3+bio4+bio5+bio9+bio12+bio15,data = past_climatic_data_scale,scale=F)
#anova.cca(RDA_subset_step4)
#RsquareAdj(RDA_subset_step4)
##Vif
vif.cca(RDA_subset_step4)
## bio1 bio2 bio3 bio4 bio5 bio9 bio12
## 28.981154 166.091110 64.987887 65.456002 46.882875 3.491931 1.427049
## bio15
## 1.769682
We observed that the VIF for some variables in the subset from previous step was excessively high (e.g., over 160 for BIO2). Considering this and the importance of certain variables for explaining genetic variation, we adjusted the subset to ensure that all VIF values were below 10.
#Adjusted subset based on VIF
RDA_adjusted_subset <- rda(formula=genomic_matrix~bio1+bio2+bio4+bio9+bio12+bio15,data = past_climatic_data_scale,scale=F)
#anova.cca(RDA_adjusted_subset)
#RsquareAdj(RDA_adjusted_subset)
##Vif
vif.cca(RDA_adjusted_subset)
## bio1 bio2 bio4 bio9 bio12 bio15
## 1.861089 2.470835 2.780922 2.708000 1.357544 1.768915
Based on the climatic variable selection, the following variables were retained for present and future periods: Bio 1, 2, 4, 9, 12, 15 for downstream T. baccata analyses.
#Create an R object where all the climatic rasters are.
ras.bio_present <- stack(list.files("Data/Climatic/1991-2020_raster_climateDT/", pattern = ".tif$", full.names = T))
names(ras.bio_present) <- unlist(strsplit(unlist(lapply(strsplit(list.files("Data/Climatic/1991-2020_raster_climateDT/", pattern = ".tif$", full.names = T), split = "Data/Climatic/1991-2020_raster_climateDT/"), function(x) x[2])), split = ".tif"))
#We extracted the climatic values for each populations from the raster based on their coordinates
coords<- data.frame(apply(meta_data_pop[,c(5:4)], 2, as.numeric))
present_climatic_data_Adapon_pop <- data.frame(Population=meta_data_pop$Population,raster::extract(ras.bio_present, coords))
#Add country info
present_climatic_data_Adapon_pop_f <- merge(meta_data_pop[,c(1,2)],present_climatic_data_Adapon_pop, "Population")
#Order
Present_new_6_Climatic_data_scale <-present_climatic_data_Adapon_pop_f[,c(1,2,3,6:8,4,5)]
#Name bio
colnames(Present_new_6_Climatic_data_scale) <- c("Population","Country","Annual_Tc","Diurnal_range_Tc","Tc_Seasonality","Tc_driest_quarter","Annual_P","P_Seasonality")
##Future Climatic Data
Multiple climatic models are available for future projections.
Selecting a single model without prior information could lead to bias,
as predictions can differ substantially among models. To reduce this
risk, we selected five widely used models for the 2041–2070 period under
the SSP3-7.0 scenario:
- GFDL-ESM4
- IPSL-CM6A-LR
- MPI-ESM1-2-HR - MPI-ESM2-0
- UKESM1-0-LL
The first step involved loading the future climatic data for each of the selected climatic models.
#List of the model for the loop
list_models<- c("GFDL_ESM4","IPSL_CM6A_LR","MPI_ESM1_2_HR","MRI_ESM2_0","UKESM1_0_LL")
for(x in 1:length(list_models)){
model <- list_models[x]
#Create an R object where all the climatic rasters are
ras.bio_future <- stack(list.files(paste0("Data/Climatic/Future_climate/2041_2070_",model,"_ssp370/"), pattern = ".tif$", full.names = T))
names(ras.bio_future) <- unlist(strsplit(unlist(lapply(strsplit(list.files(paste0("Data/Climatic/Future_climate/2041_2070_",model,"_ssp370/"), pattern = ".tif$", full.names = T), split = paste0("Data/Climatic/Future_climate/2041_2070_",model,"_ssp370/")), function(x) x[2])), split = ".tif"))
future_climatic_data <- data.frame(Population=meta_data_pop$Population,raster::extract(ras.bio_future, coords))
colnames(future_climatic_data)=c("Population",names(ras.bio_future))
new_colnames<- paste(colnames(future_climatic_data[,-1]),model, sep = "_")
colnames(future_climatic_data)[-1] <- new_colnames
#Add country info
future_climatic_data_f <- merge(meta_data_pop[,c(1,2)],future_climatic_data, "Population")
assign(paste0("Future_data_",model), future_climatic_data_f)
}
The next step involved grouping the data by climatic variable across all selected climatic models.
#List of the climatic var
list_climatic_var <- c("bio1","bio12","bio15","bio2","bio4","bio9")
for(x in 1:length(list_climatic_var)){
climatic_var <- list_climatic_var[x]
pos_clim <- x+2
data_set_clim <- data.frame(Population=Future_data_GFDL_ESM4$Population,Country=Future_data_GFDL_ESM4$Country,Future_data_GFDL_ESM4[,pos_clim],Future_data_IPSL_CM6A_LR[,pos_clim],Future_data_MPI_ESM1_2_HR[,pos_clim],Future_data_MRI_ESM2_0[,pos_clim],Future_data_UKESM1_0_LL[,pos_clim])
colnames(data_set_clim)<- c("Population","Country",paste0(climatic_var,"_GFDL_ESM4"),paste0(climatic_var,"_IPSL_CM6A_LR"),paste0(climatic_var,"_MPI_ESM1_2_HR"),paste0(climatic_var,"_MRI_ESM2_0"),paste0(climatic_var,"_UKESM1_0_LL"))
assign(paste0(climatic_var,"_future_climatic_data"),data_set_clim)
}
Next, we generated violin plots of climatic values for each population across the different climatic models. A separate plot was created for each bioclimatic variable of interest.
#Input data for the loop
unit_var <- c("°c","mm","Coefficient of Variation","°c","sd°c x100","°c")
list_dataset <- c("bio1_future_climatic_data","bio12_future_climatic_data","bio15_future_climatic_data","bio2_future_climatic_data","bio4_future_climatic_data","bio9_future_climatic_data")
#Results: the loop
for (x in 1:length(list_dataset)) {
dataset <- get(list_dataset[x])
unit <- unit_var[x]
var_name <- list_climatic_var[x]
data_violin <- gather(dataset,key="dataset",value="Value",-Population,-Country)
#Add the name of the model
data_violin_f <- data_violin %>%
mutate(models = gsub("^[^_]+_", "", dataset))
#Create scatterplot to compare the bioclimatic variable between the two periods
plot <- ggplot(data_violin_f, aes(x = models,y= Value)) +
geom_violin(trim = FALSE) +
geom_point(aes(color = data_violin_f$Country), shape = 16, position = position_jitter(seed = 1, width = 0.2),size=2) +
labs(colour="Country",x=var_name,y=paste0("values ","(",unit,")"),title=paste0("Violin Plot of ",var_name," differences across climatic models"))+
theme_classic() +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.x = element_text(size = 8.3),
axis.text.y = element_text(size = 12),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12),
legend.key = element_rect(fill = "transparent", color = NA),
panel.grid.major = element_line(color = "grey"),
panel.grid.minor = element_line(color = "grey"),
panel.grid.major.x = element_line(color = "grey"),
panel.grid.major.y = element_line(color = "grey")
) +
scale_x_discrete(labels = c("label1" = "name1", "label2" = "name2")) +
guides(color = guide_legend(override.aes = list(size = 4))) #Increase legend point size
#Save
pdf(paste0("Results/species/taxus/climate/future_climatic_data/violin_plot_comp_models/Comparison_",var_name,"between_climatic_models.pdf"));print(plot)
;dev.off()
#Print the plot
print(plot)
}
Correlation values across climatic models
list_names <- c("Bio 1","Bio 12","Bio 15","Bio 2","Bio 4","Bio 9")
for(x in 1:length(list_dataset)){
df <- get(list_dataset[x])
df_num <- df[,-c(1,2)] %>% mutate(across(as.numeric))
names_corr <- paste0("correlation_",list_dataset[x])
title <- (paste0("Correlation ",list_dataset[x]))
name_title <- list_names[x]
#Group for each bio var with the values of Bayes factor only
corr_bio <- cor(df[, grepl("bio", names(df))])
#Name the corr_bio with names_corr
assign(names_corr,corr_bio)
names_corr
#Plot corrplot
corr_plot <- corrplot(get(names_corr), method = "number", addrect = 2, col = c("red", "white", "red"), type = "lower", tl.col = "black", tl.cex = 0.8, number.cex = 0.7,mar=c(0,0,1,0),title=paste0("Correlation ",name_title," future climatic data"))
#Save
pdf(paste0("Results/species/taxus/climate/future_climatic_data/Correlation/correlation_",list_dataset[x],".pdf"));corr_plot <- corrplot(get(names_corr), method = "number", addrect = 2, col = c("red", "white", "red"), type = "lower", tl.col = "black", tl.cex = 0.8, number.cex = 0.7,mar=c(0,0,1,0), title=paste0("Correlation ",name_title," future climatic data"));dev.off()
}
The final step involved creating a data frame with the retained climatic variables standardised for downstream analyses. Present and future climatic values were standardised using the past data as a reference, allowing comparisons of values between periods to build indices such as the genomic offset.
#Dataframe not scaled
Past_clim_data_Europe_pop <- past_climatic_data_order[,c(1:4,6,10,12,15)]
#Creation of the scaled matrix
Past_new_6_Climatic_data_scale <-past_climatic_data_order[,c(3,4,6,10,12,15)] %>%
scale()
Past_new_6_Climatic_data_scale_df <- data.frame(past_climatic_data_order[,c(1,2)],Past_new_6_Climatic_data_scale)
colnames(Past_new_6_Climatic_data_scale_df) <- c("Population","Country","Annual_Tc","Diurnal_range_Tc","Tc_Seasonality","Tc_driest_quarter","Annual_P","P_Seasonality")
df_to_scale <- past_climatic_data_order[,c(3,4,6,10,12,15)]
colnames(df_to_scale) <- c("Annual_Tc","Diurnal_range_Tc","Tc_Seasonality","Tc_driest_quarter","Annual_P","P_Seasonality")
scale_env_value_new_cli <- attr(scale(df_to_scale), 'scaled:scale')
center_env_value_new_cli <- attr(scale(df_to_scale), 'scaled:center')
#Scale
load("Data/Species/Taxus_baccata/Climatic_data/new_selection/scale_env_value_new_cli.Rdata")
load("Data/Species/Taxus_baccata/Climatic_data/new_selection/center_env_value_new_cli.Rdata")
Present_climatic_data_T_adapcon_gentree_scaled <- data.frame(Present_new_6_Climatic_data_scale[,c(1,2)],scale(Present_new_6_Climatic_data_scale[,-c(1,2)], center = center_env_value_new_cli, scale = scale_env_value_new_cli))
The raster containing the standardised climatic variables was saved for use in the RDA-based Genomic Offset analysis
Finally, future climatic data were scaled using the mean and standard deviation of the past climatic data.
for(x in 1:length(list_models)){
model <- list_models[x]
data <- get(paste0("Future_data_",model))
data_final <- data.frame(data[,c(1:3,6:8,4,5)])
colnames(data_final) <- c("Population","Country","Annual_Tc","Diurnal_range_Tc","Tc_Seasonality","Tc_driest_quarter","Annual_P","P_Seasonality")
Future_climatic_data_scaled <- data.frame(data_final[,c(1,2)],scale(data_final[,-c(1,2)], center = center_env_value_new_cli, scale = scale_env_value_new_cli))
assign(paste0("future_climatic_data_",model),data_final)
assign(paste0("future_climatic_data_scaled_",model),Future_climatic_data_scaled)
#Non scaled
write_xlsx(get(paste0("future_climatic_data_", model)),paste0("Data/Species/Taxus_baccata/Climatic_data/new_selection/future_climatic_data_",model,".xlsx"))
#Save scale df
write_xlsx(get(paste0("future_climatic_data_scaled_",model)),paste0("Data/Species/Taxus_baccata/Climatic_data/new_selection/future_climatic_data_scaled_",model,".xlsx"))
save(list=paste0("future_climatic_data_scaled_",model),file=paste0("Data/Species/Taxus_baccata/Climatic_data/new_selection/future_climatic_data_scaled_",model,".Rdata"))
}
Next, we loaded the folder containing raster files and created a raster stack for all climatic variables for each climatic model.
for(x in 1:length(list_models)){
model <- list_models[x]
future_climatic_data_raster <- stack(list.files(paste0("Data/Climatic/Future_climate/2041_2070_",model,"_ssp370/"), pattern = ".tif$", full.names = T))
future_climatic_data <- data.frame(meta_data_pop$Population,raster::extract(future_climatic_data_raster, coords))
names(future_climatic_data_raster) <- unlist(strsplit(unlist(lapply(strsplit(list.files(paste0("Data/Climatic/Future_climate/2041_2070_",model,"_ssp370/"), pattern = ".tif$", full.names = T), split = paste0("Data/Climatic/Future_climate/2041_2070_",model,"_ssp370/")), function(x) x[2])), split = ".tif"))
assign(paste0("future_climatic_data_raster_",model),future_climatic_data_raster)
#Save file
save(list=paste0("future_climatic_data_raster_",model),file=paste0("Data/Species/Taxus_baccata/Genomic_offset/RDA/future_climatic_data_raster_",model,".Rdata"),F=T)
}
For the LFMM analysis, we created a data frame containing climatic values at the individual level.
#Load meta_data at individual level
climatic_data_indivdual_level_new_selection <- merge(meta_data_vcf[,-2],past_climatic_data_order[,c(1:4,6,10,12,15)],"Population")
climatic_data_indivdual_level_scaled_new_selection <- climatic_data_indivdual_level_new_selection[,-c(1:3)] %>%
scale()
climatic_data_indivdual_level_scaled_new_selection_df <- data.frame(climatic_data_indivdual_level_new_selection[,c(1:3)],climatic_data_indivdual_level_scaled_new_selection)
#Rename columns
colnames(climatic_data_indivdual_level_scaled_new_selection_df) <- c("Population","VCF_ID","Country","Annual_Tc","Diurnal_range_Tc","Tc_Seasonality","Tc_driest_quarter","Annual_P","P_Seasonality")
Session info
packages <- c(
"dplyr", "tidyverse", "ggplot2", "corrplot",
"dismo", "raster", "vegan", "plyr",
"writexl", "rasterize", "sf", "FactoMineR"
)
info <- sessionInfo()
packages_used <- info$otherPkgs[packages]
data.frame(
Package = names(packages_used),
Version = sapply(packages_used, function(x) x$Version)
)
## Package Version
## dplyr dplyr 1.1.4
## tidyverse tidyverse 2.0.0
## ggplot2 ggplot2 3.5.2
## corrplot corrplot 0.92
## dismo dismo 1.3-14
## raster raster 3.6-26
## vegan vegan 2.6-4
## plyr plyr 1.8.9
## writexl writexl 1.5.0
## rasterize rasterize 0.1
## sf sf 1.0-15
## FactoMineR FactoMineR 2.9