rm(list = ls())
knitr::opts_chunk$set(
    message = FALSE,
    warning = FALSE,
    cache = FALSE
)

library(VennDiagram)
library(dplyr)
library(genetics)
library(corrplot)
library(tidyr)
library(writexl)

1 Introduction

This script enables the visualisation and selection of outlier loci for use in GDI and genomic offset calculations. We applied six GEA methods to identify SNPs potentially associated with climatic variables: RDA, pRDA, BayPass, LFMM, GF-raw, and GF-corrected (see previous scripts).
Each method produced a set of candidate SNPs. To define the final set of outliers, we retained only SNPs that were identified by at least two methods and were not in high linkage disequilibrium (LD).
We considered two levels of stringency for outlier detection:
Less conservative (LC): FDR < 10%, top 5%, and BF > 8
More conservative (MC): FDR < 5%, top 1%, and BF > 10

In addition to these outlier sets, we defined a set of random markers that follow the same allele frequency distribution as the outliers while matching their number of SNPs. This provides a neutral baseline for comparison.

Finally, we also retained all candidate SNPs identified by at least one GEA method, regardless of overlap.

These different SNP sets (stringent outliers, random markers, and full candidate sets) are used in downstream analyses, including GDI and genomic offset estimation, to assess the robustness of predictions across marker subsets.

2 Load candidates SNPs

#Names of the files
##LC
list_data_LC <- c("outliers_RDA_FDR10perc_T_Adapcon_gentree","outliers_pRDA_FDR10perc_T_Adapcon_gentree","outliers_T_adapcon_gentree_BAYPASS_BF_8","outliers_T_adapcon_gentree_LFMM_10perc","outliers_rank_based_top5perc_GF_raw","outliers_rank_based_top5perc_GF_corrected")

##MC
list_data_MC <- c("outliers_RDA_FDR5perc_T_Adapcon_gentree","outliers_pRDA_FDR5perc_T_Adapcon_gentree","outliers_T_adapcon_gentree_BAYPASS_BF_10","outliers_T_adapcon_gentree_LFMM_5perc","outliers_rank_based_top1perc_GF_raw","outliers_rank_based_top1perc_GF_corrected")

#name of the output
names_set <- c("RDA","pRDA","BAYPASS","LFMM","GF_raw","GF_corrected")

#load data
for(x in 1:length(names_set)){
  
  set_LC <- list_data_LC[x]
  load(paste0("Results/species/taxus/GEA_new_var/outliers/",set_LC,".Rdata"))
  name <- names_set[x]
assign(paste0(name,"_set_LC"),get(set_LC))  

set_MC <- list_data_MC[x]
  load(paste0("Results/species/taxus/GEA_new_var/outliers/",set_MC,".Rdata"))
  name <- names_set[x]
assign(paste0(name,"_set_MC"),get(set_MC))  
}

We merged every set in a list

#list of candidates
list_candidates_LC<-list(RDA=RDA_set_LC[,1],pRDA=pRDA_set_LC[,1],BAYPASS=BAYPASS_set_LC[,1],LFMM=LFMM_set_LC[,1],GF_raw=GF_raw_set_LC,GF_corrected=GF_corrected_set_LC)

list_candidates_MC<-list(RDA=RDA_set_MC[,1],pRDA=pRDA_set_MC[,1],BAYPASS=BAYPASS_set_MC[,1],LFMM=LFMM_set_MC[,1],GF_raw=GF_raw_set_MC,GF_corrected=GF_corrected_set_MC)

#list snps_names
##LC
list_all_candidates_snps_LC <- unlist(list_candidates_LC)
list_all_candidates_snps_LC <- data.frame(list_all_candidates_snps_LC)
write_xlsx(list_all_candidates_snps_LC,path="Results/species/taxus/GEA_new_var/outliers/set/list_all_candidates_snps_LC.xlsx") 

##MC
list_all_candidates_snps_MC <- unlist(list_candidates_MC)
list_all_candidates_snps_MC <- data.frame(list_all_candidates_snps_MC)
write_xlsx(list_all_candidates_snps_MC,path="Results/species/taxus/GEA_new_var/outliers/set/list_all_candidates_snps_MC.xlsx") 
colnames(RDA_set_MC) <- c("Loci","p.value")
colnames(pRDA_set_MC) <- c("Loci","p.value")

3 Plot to visualise the overlapping candidates SNPs

We chose to visualise overlaps among candidate SNP sets using an UpSet plot rather than Venn diagrams. While Venn diagrams can only clearly represent up to five sets, our analysis involves six GEA methods, making them difficult to interpret. Functions such as ggVennDiagram become increasingly complex and less informative when more than five sets are included.
In contrast, the UpSet plot provides a clearer and more informative representation by:
displaying only the observed overlaps between methods (omitting zero intersections), allowing easy comparison of overlap sizes across multiple sets and highlighting subsets of SNPs detected by specific combinations of GEA methods.
This makes the UpSet approach particularly useful for our multi-method outlier detection framework.

list_threshold <- c("LC","MC")

for(i in 1:length(list_threshold)){
  threshold <- list_threshold[i]
all_snps <- unique(c(get(paste0("RDA_set_",threshold))[,1],get(paste0("pRDA_set_",threshold))[,1],get(paste0("BAYPASS_set_",threshold))[,1],get(paste0("LFMM_set_",threshold))[,1],get(paste0("GF_raw_set_",threshold)),get(paste0("GF_corrected_set_",threshold))))

snp_df <- data.frame(
  SNP = all_snps,
  RDA = as.integer(all_snps %in% get(paste0("RDA_set_",threshold))$Loci),
  pRDA = as.integer(all_snps %in% get(paste0("pRDA_set_",threshold))$Loci),
  LFMM = as.integer(all_snps %in% get(paste0("LFMM_set_",threshold))$SNP),
  BAYPASS = as.integer(all_snps %in% get(paste0("BAYPASS_set_",threshold))$Loci),
  GF_raw = as.integer(all_snps %in% get(paste0("GF_raw_set_",threshold))),
  GF_corrected = as.integer(all_snps %in% get(paste0("GF_corrected_set_",threshold)))
)
# Create the UpSet plot
plot <- UpSetR::upset(snp_df, 
      sets = c("RDA", "pRDA", "LFMM", "BAYPASS", "GF_raw", "GF_corrected"),
      main.bar.color = "#45B6AA",
      matrix.color = "#D45176",
      keep.order = T)

print(plot)
#save 
pdf(paste0("Results/species/taxus/GEA_new_var/outliers/figure/upset_plot_",threshold,"_set.pdf")); print(plot);dev.off()
}

4 Set of outlier SNPs

4.1 Overlapping SNPs by at least 2 GEA methods

Now we can create a data frame with only the overlapping candidate of at least 2 GEA methods:

for(i in 1:length(list_threshold)){
  
  threshold <- list_threshold[i]
  
comm_RDA_pRDA <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("RDA","pRDA")] %>% Reduce(intersect, .))
comm_RDA_BAYPASS <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("RDA","BAYPASS")] %>% Reduce(intersect, .))
comm_RDA_GF_raw <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_raw","RDA")] %>% Reduce(intersect, .))
comm_RDA_LFMM <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("LFMM","RDA")] %>% Reduce(intersect, .))
comm_pRDA_BAYPASS <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("pRDA","BAYPASS")] %>% Reduce(intersect, .))
comm_pRDA_GF_raw <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_raw","pRDA")] %>% Reduce(intersect, .))
comm_pRDA_LFMM <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("LFMM","pRDA")] %>% Reduce(intersect, .))
comm_BAYPASS_GF_raw <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_raw","BAYPASS")] %>% Reduce(intersect, .))
comm_BAYPASS_LFMM <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("LFMM","BAYPASS")] %>% Reduce(intersect, .))
comm_GF_raw_LFMM <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_raw","LFMM")] %>% Reduce(intersect, .))
comm_GF_corrected_LFMM <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_corrected","LFMM")] %>% Reduce(intersect, .))
comm_BAYPASS_GF_corrected <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_corrected","BAYPASS")] %>% Reduce(intersect, .))
comm_pRDA_GF_corrected <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_corrected","pRDA")] %>% Reduce(intersect, .))
comm_RDA_GF_corrected <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_corrected","RDA")] %>% Reduce(intersect, .))
comm_GF_corrected_GF_raw <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_corrected","GF_raw")] %>% Reduce(intersect, .))

outliers_set <- data.frame(rbind(comm_RDA_pRDA,comm_RDA_BAYPASS,comm_RDA_GF_raw,comm_RDA_LFMM,comm_pRDA_BAYPASS,comm_pRDA_GF_raw,comm_pRDA_LFMM,comm_BAYPASS_GF_raw,comm_BAYPASS_LFMM,comm_GF_raw_LFMM,comm_GF_corrected_LFMM,comm_BAYPASS_GF_corrected,comm_pRDA_GF_corrected,comm_RDA_GF_corrected,comm_GF_corrected_GF_raw)) %>% unique()

assign(paste0("outliers_set_",threshold),outliers_set)

#number of candidates identified by at least 2 methods
print(nrow(get(paste0("outliers_set_",threshold))))
}

## [1] 136
## [1] 58

save(outliers_set_LC, file="Results/species/taxus/GEA_new_var/outliers/set/outliers_set_LC.Rdata")

4.2 Linkage desequilibrium

During candidate SNP selection, we observed that some outliers were located on the same contig. This can bias downstream analyses, as clusters of SNPs in strong linkage disequilibrium (LD) may artificially increase the weight of particular genomic regions. This issue is especially problematic when the number of SNPs per contig is not proportional to contig size, as stochastic variation in marker density can lead to the overrepresentation of certain contigs. The extent to which this is problematic depends on the study system.

To mitigate this bias, we applied an LD pruning step within contigs or within 2,000 bp, retaining only SNPs that were not in high LD with one another (|r| < 0.7). This reduces the inflation of signals due to physically linked SNPs.

Linkage desequilibrium calculation:

for(i in 1:length(list_threshold)){
  threshold <- list_threshold[i]
#load genotypic data at the individual level
load("Data/Species/Taxus_baccata/genetic_new/GEA/vcf_data_GEA.Rdata")
genomic_matrix_LD <- vcf_data_GEA

#subset the genomic matrix to only the outliers
candidates_to_LD <- data.frame(genomic_matrix_LD[get(paste0("outliers_set_",threshold))$outliers])

#put the data into the correct format
genotypic_data<- makeGenotypes(candidates_to_LD)

#Run the LD calculation
LD_estimate <- LD(genotypic_data)

#save the R² values
matrix_LD <- LD_estimate$`R^2`
matrix_LD_format<- matrix_LD[,-1]

#filtered the LD to only take LD lower than -0.7 or above 0.7
matrix_ld_filtered <- matrix_LD_format
matrix_ld_filtered[abs(matrix_LD_format) < 0.7] <- 0

# Plot the modified LD matrix using corrplot
corrplot(matrix_ld_filtered, method = "number", addrect = 2, col = c("red","white", "red"), type = "upper", tl.col = "black", tl.cex = 0.6, number.cex = 0.5, cl.pos="n")

# Save in a dataframe the SNP with LD above 0.7
LD0.7<- which(matrix_LD_format > 0.7 & matrix_LD_format, arr.ind = TRUE)

# Create a dataframe to retain snp in LD with others  
LD_df <- data.frame(
  snp_names_1 = rownames(matrix_LD_format)[LD0.7[, 1]],
  snp_names_2 = colnames(matrix_LD_format)[LD0.7[, 2]],
  LD = matrix_LD_format[LD0.7]
)

#create a new column contig and remove the rows where the SNPs in LD are in different contigs
LD_df_contig_bis <- LD_df %>%
  mutate(contig1 = sub("_[^_]*$", "", snp_names_1)) %>% 
  mutate(contig2 = sub("_[^_]*$", "", snp_names_2))%>%
  filter(contig1 == contig2)

print(nrow(LD_df)-nrow(LD_df_contig_bis))

assign(paste0("LD_df_contig_",threshold),LD_df_contig_bis)

#save
png(paste0("Results/species/taxus/GEA_new_var/outliers/figure/LD_matrix_SNPs_",threshold,"_set.png"));corrplot(matrix_ld_filtered, method = "number", addrect = 2, col = c("red","white", "red"), type = "upper", tl.col = "black", tl.cex = 0.6, number.cex = 0.5, cl.pos="n");dev.off()
}

## [1] 11

## [1] 0

Now that we have a list of the LD associations between SNPs, we can proceed to remove the SNPs with weaker signals, specifically those with the highest p-values from the RDA analysis.

for(i in 1:length(list_threshold)){
  
  threshold <- list_threshold[i]
  LD_df_contig <- get(paste0("LD_df_contig_",threshold))
  
#we need to output the names of the outliers in LD to see their pvalues and keep the lowest ones
LD_snps <- data.frame(snp_names = c(LD_df_contig$snp_names_1,LD_df_contig$snp_names_2)) %>% unique() 

#load pvalues RDA
load("Data/Species/Taxus_baccata/genetic_new/GEA/pvalues_RDA_snp.Rdata")

p_values_RDA_snp_1 <- merge(LD_snps, pvalues_RDA_snp,"snp_names"); colnames(p_values_RDA_snp_1)=c("snp_names_1","pvalues_1")
p_values_RDA_snp_2 <- merge(LD_snps, pvalues_RDA_snp,"snp_names"); colnames(p_values_RDA_snp_2)=c("snp_names_2","pvalues_2")

pval_final_1 <- merge(LD_df_contig,p_values_RDA_snp_1,"snp_names_1")
pval_final_2 <- merge(LD_df_contig,p_values_RDA_snp_2,"snp_names_2")

pval_merge <- cbind(pval_final_1,pval_final_2[,4])

pval_merge_final <- pval_merge[,c(1,4,2,5)]; colnames(pval_merge_final) <- c("snp_names_1","pvalues_1","snp_names_2","pvalues_2")

pval_merge_final$worst_snp <- ifelse(pval_merge_final$pvalues_1 > pval_merge_final$pvalues_2, pval_merge_final$snp_names_1, pval_merge_final$snp_names_2) #select the snp with the farest pvalues from 0 (the weakest signal)

#snp to remove
SNP_to_remove <- pval_merge_final$worst_snp %>% unique()

nrow(data.frame(SNP_to_remove))

  outliers_set <- get(paste0("outliers_set_",threshold))
  
#save
outliers_set_final_overlapping_no_LD_new_var <- outliers_set[!(outliers_set$outliers %in% SNP_to_remove),]
print(nrow(outliers_set_final_overlapping_no_LD_new_var%>% as.data.frame))

assign(paste0("outliers_set_final_overlapping_no_LD_",threshold,"_new_var"),outliers_set_final_overlapping_no_LD_new_var)
}

## [1] 100
## [1] 39

5 Random set of SNPs

We can also extract a random set of SNPs to calculate the genomic offset (GO) and compare them with the candidate markers. To generate this random set, we randomly select the same number of SNPs as the candidate set (less conservative set) from the full set of SNPs after removing all candidate SNPs identified by at least one GEA method.

In addition, we may wish to ensure that the selected random SNPs match the allele-frequency distribution of the outliers. This is important because the distribution of allele frequencies can influence observed patterns (analogous to GWAS). For example, if we have 44 outliers with 10 SNPs at allele frequencies 0–0.1, 22 at 0.5–0.6, and 12 at 0.9–1.0, we should select random SNPs that preserve these proportions of allele-frequency bins.

5.1 Random with same allelic frequencies

First, we need to calculate the average allelic frequency across populations for each SNP:

load("Data/Species/Taxus_baccata/genetic_new/data_allelic_frequencies_29pop_adapcon_gentree_475_8616.Rdata")

genomic_matrix <- data.frame(data_allelic_frequencies_29pop_adapcon_gentree_475_8616)

#we need to keep only snps also present in the CG pop genomic data
#load CG pop genomic data
load("Data/Species/Taxus_baccata/genetic_new/allelic_frequencies_CG_pop.Rdata")
common_snps <- intersect(colnames(data_allelic_frequencies_29pop_adapcon_gentree_475_8616), colnames(allelic_frequencies_CG_pop))

#retained_snps 
genomic_data_outliers_common_CG <- genomic_matrix[,common_snps]

We can calculate the allelic frequency of the outliers to assign them to allelic frequency classes.

#calculation of the mean allelic frequency across populations for the outliers
genomic_data_outliers_mean <- genomic_matrix[,(colnames(genomic_matrix) %in% outliers_set_final_overlapping_no_LD_LC_new_var_df$get.paste0..outliers_set_final_overlapping_no_LD_...threshold..)] %>% colMeans() %>% as.data.frame() 
genomic_data_outliers_mean$SNP_name <- row.names(genomic_data_outliers_mean)
colnames(genomic_data_outliers_mean) <- c("mean_allelic_frequency","SNP_name")

Then we need to identify classes of distribution of allele frequencies across populations for the outliers, and select more or less the same distribution for the neutral ones.

# Define bins and labels
bins <- seq(0, 1, by = 0.1)
bin_labels <- paste(head(bins, -1), tail(bins, -1), sep = "-")

# Function to count frequencies within bins for a given row
count_bins <- function(row, bins, bin_labels) {
  bin_counts <- table(cut(row, breaks = bins, labels = bin_labels, include.lowest = TRUE))
  # Ensure all bins are present
  bin_counts <- as.data.frame(bin_counts)
  bin_counts <- bin_counts %>% mutate(Var1 = as.character(Var1))
  missing_bins <- setdiff(bin_labels, bin_counts$Var1)
  if (length(missing_bins) > 0) {
    missing_df <- data.frame(Var1 = missing_bins, Freq = 0)
    bin_counts <- rbind(bin_counts, missing_df)
  }
  # Order by bin labels
  bin_counts <- bin_counts[order(match(bin_counts$Var1, bin_labels)), ]
  return(bin_counts$Freq)
}
# Apply the function to each row (SNP)
binned_counts <- apply(data.frame(genomic_data_outliers_mean[,1]), 1, function(row) count_bins(row, bins, bin_labels))

# Convert the result to a DataFrame
binned_counts_df <- as.data.frame(t(binned_counts))
colnames(binned_counts_df) <- bin_labels

#now we can calculate the proportion of outliers for each class of allelic frequencies
proportion_outliers <- binned_counts_df %>% colSums() %>% as.data.frame
proportion_outliers$freq_class <- bin_labels
colnames(proportion_outliers) <- c("num_outliers","freq_class")

We then need to remove all SNPs identified as candidates by at least one GEA method to select random SNPs from this set:

#we also removed outliers snps 
#remove all the snps identified by candidates by at least 1 GEA method
all_candidates_set <- unique(list_all_candidates_snps_LC$list_all_candidates_snps_LC)
outliers_set <- unique(outliers_set_final_overlapping_no_LD_LC_new_var_df)

#df with all the candidates identified by at least 1 GEA method without the 100 outliers
list_candidates_minus_outliers <- setdiff(all_candidates_set, outliers_set$outliers_set_final_overlapping_no_LD_LC_new_var)

df_outliers <- data.frame(list_all_candidates_snps_LC)

unique_outliers <-df_outliers$list_all_candidates_snps_LC

genomic_data_wo_outliers <- genomic_data_outliers_common_CG[, !(colnames(genomic_data_outliers_common_CG) %in% unique_outliers)]

Finally, we can calculate the mean allele frequency of each population for the neutral set of SNPs, calculate their class, and randomly identify SNPs that match the class of allele frequencies of the outliers.

genomic_data_wo_outliers_mean <- genomic_data_wo_outliers %>% colMeans() %>% as.data.frame() 
genomic_data_wo_outliers_mean$name_snps <- rownames(genomic_data_wo_outliers_mean)

colnames(genomic_data_wo_outliers_mean) <- c("mean_allelic_frequency","name_snps")

# Bin the neutral SNPs into frequency classes
neutral_snps <- genomic_data_wo_outliers_mean %>%
  mutate(freq_class = cut(mean_allelic_frequency, breaks = bins, labels = bin_labels, include.lowest = TRUE))

# Randomly select the same number of neutral SNPs for each class
set.seed(73) # for reproducibility
selected_neutral_snps <- neutral_snps %>%
  group_by(freq_class) %>%
  group_map(~ slice_sample(.x, n = proportion_outliers$num_outliers[proportion_outliers$freq_class == unique(.x$freq_class)], replace = FALSE), .keep = TRUE) %>%
  bind_rows()

# Print the selected neutral SNPs
print(selected_neutral_snps)
random_set_taxus_LC <- selected_neutral_snps

random_set_taxus <- random_set_taxus_LC
#save(random_set_taxus, file="Results/species/taxus/GEA_new_var/outliers/random_set_taxus.Rdata")

load("Results/species/taxus/GEA_new_var/outliers/random_set_taxus.Rdata")
df <- merge(random_set_taxus,random_set_taxus_LC,"name_snps")
df_candidates <- data.frame("name_snps"=all_candidates_set)
check_outliers<- merge(df_candidates,random_set_taxus,"name_snps")

Session info

packages <- c(
  "VennDiagram", "dplyr", "genetics", "corrplot",
  "tidyr", "writexl"
)
info <- sessionInfo()
packages_used <- info$otherPkgs[packages]
data.frame(
  Package = names(packages_used),
  Version = sapply(packages_used, function(x) x$Version)
)

##                 Package   Version
## VennDiagram VennDiagram     1.7.3
## dplyr             dplyr     1.1.4
## genetics       genetics 1.3.8.1.3
## corrplot       corrplot      0.92
## tidyr             tidyr     1.3.1
## writexl         writexl     1.5.0

Outlier_selection

Thomas Francisco

2024-04-30