1 Introduction

This script enables the visualisation and selection of outlier loci to be used for genomic offset calculations. We applied six GEA methods to identify SNPs potentially associated with climatic variables: RDA, pRDA, BAYPASS, LFMM, GF-raw, and GF-corrected.

Each method produced a set of candidate SNPs. To define our final set of outliers, we retained only SNPs that: were identified by at least two methods, and were not in high linkage disequilibrium (LD).

We considered two levels for outlier detection:
- Less conservative (LC): FDR < 10%, top 5%, and BF > 8
- More conservative (MC): FDR < 5%, top 1%, and BF > 10

In addition to the outlier sets, we defined a set of random markers that follow the same allele frequency distribution as the outliers, while matching their number of SNPs. This provides a neutral baseline for comparison.

Finally, we also retained two broader reference sets: all SNPs from the filtered genomic dataset, and all candidate SNPs identified by at least one GEA method, regardless of overlap.

These different SNP sets (stringent outliers, random markers, all candidates, and all SNPs) will be used in downstream analyses, including genomic offset estimation, to evaluate the robustness of predictions across marker subsets.

2 Load candidates SNPs

#Names of the files
##LC
list_data_LC <- c("outliers_RDA_FDR10perc_T_Adapcon_gentree","outliers_pRDA_FDR10perc_T_Adapcon_gentree","outliers_T_adapcon_gentree_BAYPASS_BF_8","outliers_T_adapcon_gentree_LFMM_10perc","outliers_rank_based_top5perc_GF_raw","outliers_rank_based_top5perc_GF_corrected")

##MC
list_data_MC <- c("outliers_RDA_FDR5perc_T_Adapcon_gentree","outliers_pRDA_FDR5perc_T_Adapcon_gentree","outliers_T_adapcon_gentree_BAYPASS_BF_10","outliers_T_adapcon_gentree_LFMM_5perc","outliers_rank_based_top1perc_GF_raw","outliers_rank_based_top1perc_GF_corrected")

#name of the output
names_set <- c("RDA","pRDA","BAYPASS","LFMM","GF_raw","GF_corrected")

#load data
for(x in 1:length(names_set)){
  
  for(i in 1:length(list_data_LC))
  set_LC <- list_data_LC[x]
  load(paste0("C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/",set_LC,".Rdata"))
  name <- names_set[x]
assign(paste0(name,"_set_LC"),get(set_LC))  

set_MC <- list_data_MC[x]
  load(paste0("C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/",set_MC,".Rdata"))
  name <- names_set[x]
assign(paste0(name,"_set_MC"),get(set_MC))  
}

We merged every set in a list

  #list of candidates
list_candidates_LC<-list(RDA=RDA_set_LC[,1],pRDA=pRDA_set_LC[,1],BAYPASS=BAYPASS_set_LC[,1],LFMM=LFMM_set_LC[,1],GF_raw=GF_raw_set_LC,GF_corrected=GF_corrected_set_LC)

list_candidates_MC<-list(RDA=RDA_set_MC[,1],pRDA=pRDA_set_MC[,1],BAYPASS=BAYPASS_set_MC[,1],LFMM=LFMM_set_MC[,1],GF_raw=GF_raw_set_MC,GF_corrected=GF_corrected_set_MC)

#list snps_names
##LC
list_all_candidates_snps_LC <- unlist(list_candidates_LC)
list_all_candidates_snps_LC <- data.frame(list_all_candidates_snps_LC)
write_xlsx(list_all_candidates_snps_LC,path="C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/set/list_all_candidates_snps_LC.xlsx") 

##MC
list_all_candidates_snps_MC <- unlist(list_candidates_MC)
list_all_candidates_snps_MC <- data.frame(list_all_candidates_snps_MC)
write_xlsx(list_all_candidates_snps_MC,path="C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/set/list_all_candidates_snps_MC.xlsx") 
colnames(RDA_set_MC) <- c("Loci","p.value")
colnames(pRDA_set_MC) <- c("Loci","p.value")

3 Plot to visualise the overlapping candidates SNPs

3.1 Vendiagram plot

We can plot the sets in a Venn diagram to visualise the overlapping candidates (not shown here).

3.2 Upset plot

We chose to visualise overlaps among candidate SNP sets using an UpSet graph rather than Venn diagrams. While Venn diagrams can only clearly represent up to five sets, our analysis involves six GEA methods. Using functions such as ggVennDiagram quickly becomes difficult to interpret when more than five sets are included. In contrast, the UpSet graph provides a cleaner and more informative representation by: displaying only the observed overlaps between methods (omitting zero intersections), allowing easy comparison of overlap sizes across multiple sets, highlighting subsets of SNPs detected by specific combinations of GEA methods. This makes the UpSet approach particularly usefull for our multi-method outlier detection framework.

list_threshold <- c("LC","MC")

for(i in 1:length(list_threshold)){
  
  threshold <- list_threshold[i]
#create an object with all the snps to then create a df 0/1 for each method for all the snps to plot the upset graph
all_snps <- unique(c(get(paste0("RDA_set_",threshold))[,1],get(paste0("pRDA_set_",threshold))[,1],get(paste0("BAYPASS_set_",threshold))[,1],get(paste0("LFMM_set_",threshold))[,1],get(paste0("GF_raw_set_",threshold)),get(paste0("GF_corrected_set_",threshold))))

#df with 0 the snp is not identified by the method and 1 it is
snp_df <- data.frame(
  SNP = all_snps,
  RDA = as.integer(all_snps %in% get(paste0("RDA_set_",threshold))$Loci),
  pRDA = as.integer(all_snps %in% get(paste0("pRDA_set_",threshold))$Loci),
  LFMM = as.integer(all_snps %in% get(paste0("LFMM_set_",threshold))$SNP),
  BAYPASS = as.integer(all_snps %in% get(paste0("BAYPASS_set_",threshold))$Loci),
  GF_raw = as.integer(all_snps %in% get(paste0("GF_raw_set_",threshold))),
  GF_corrected = as.integer(all_snps %in% get(paste0("GF_corrected_set_",threshold)))
)

# Create the UpSet plot
plot <- UpSetR::upset(snp_df, 
      sets = c("RDA", "pRDA", "LFMM", "BAYPASS", "GF_raw", "GF_corrected"),
      main.bar.color = "#45B6AA",
      matrix.color = "#D45176",
      keep.order = T)

print(plot)

#save 
pdf(paste0("C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/figure/upset_plot_",threshold,"_set.pdf")); print(plot);dev.off()
}

4 Set of outlier SNPs

4.1 Overlapping SNPs by at least 2 GEA methods

Now we can create a data frame with only the overlapping candidate of at least 2 GEA methods:

for(i in 1:length(list_threshold)){
  
  threshold <- list_threshold[i]
  
comm_RDA_pRDA <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("RDA","pRDA")] %>% Reduce(intersect, .))
comm_RDA_BAYPASS <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("RDA","BAYPASS")] %>% Reduce(intersect, .))
comm_RDA_GF_raw <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_raw","RDA")] %>% Reduce(intersect, .))
comm_RDA_LFMM <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("LFMM","RDA")] %>% Reduce(intersect, .))
comm_pRDA_BAYPASS <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("pRDA","BAYPASS")] %>% Reduce(intersect, .))
comm_pRDA_GF_raw <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_raw","pRDA")] %>% Reduce(intersect, .))
comm_pRDA_LFMM <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("LFMM","pRDA")] %>% Reduce(intersect, .))
comm_BAYPASS_GF_raw <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_raw","BAYPASS")] %>% Reduce(intersect, .))
comm_BAYPASS_LFMM <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("LFMM","BAYPASS")] %>% Reduce(intersect, .))
comm_GF_raw_LFMM <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_raw","LFMM")] %>% Reduce(intersect, .))
comm_GF_corrected_LFMM <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_corrected","LFMM")] %>% Reduce(intersect, .))
comm_BAYPASS_GF_corrected <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_corrected","BAYPASS")] %>% Reduce(intersect, .))
comm_pRDA_GF_corrected <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_corrected","pRDA")] %>% Reduce(intersect, .))
comm_RDA_GF_corrected <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_corrected","RDA")] %>% Reduce(intersect, .))
comm_GF_corrected_GF_raw <- data.frame(outliers=get(paste0("list_candidates_",threshold))[c("GF_corrected","GF_raw")] %>% Reduce(intersect, .))

outliers_set <- data.frame(rbind(comm_RDA_pRDA,comm_RDA_BAYPASS,comm_RDA_GF_raw,comm_RDA_LFMM,comm_pRDA_BAYPASS,comm_pRDA_GF_raw,comm_pRDA_LFMM,comm_BAYPASS_GF_raw,comm_BAYPASS_LFMM,comm_GF_raw_LFMM,comm_GF_corrected_LFMM,comm_BAYPASS_GF_corrected,comm_pRDA_GF_corrected,comm_RDA_GF_corrected,comm_GF_corrected_GF_raw)) %>% unique()

assign(paste0("outliers_set_",threshold),outliers_set)

#number of candidates identified by at least 2 methods
print(nrow(get(paste0("outliers_set_",threshold))))
}

## [1] 136
## [1] 58

save(outliers_set_LC, file="C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/set/outliers_set_LC.Rdata")

4.2 Linkage desequilibrium

During candidate SNP selection, we observed that some outliers were located on the same contig. This situation can bias downstream analyses because clusters of SNPs in strong linkage disequilibrium (LD) artificially increase the weight of particular genomic regions. This issue is especially problematic if the number of SNPs per contig is not proportional to contig size, as stochastic variation in marker density could lead to overrepresentation of certain contigs. This may be of concern or not depending on the study system. To mitigate this bias, we applied a LD pruning step within contigs, retaining only SNPs that were not in high LD with one another (< |0.7|). This ensures to reduces the inflation of signals due to physically linked SNPs.

Linkage desequilibrium calculation:

for(i in 1:length(list_threshold)){
  
  threshold <- list_threshold[i]

#load genotypic data at the individual level
load("C:/Users/tfrancisco/Documents/Thesis/Data/Species/Taxus_baccata/genetic_new/GEA/vcf_data_GEA.Rdata")
genomic_matrix_LD <- vcf_data_GEA

#subset the genomic matrix to only the outliers
candidates_to_LD <- data.frame(genomic_matrix_LD[get(paste0("outliers_set_",threshold))$outliers])

#put the data into the right format
genotypic_data<- makeGenotypes(candidates_to_LD)

#RUN the LD calculation
LD_estimate <- LD(genotypic_data)

#save the R² values
matrix_LD <- LD_estimate$`R^2`
matrix_LD_format<- matrix_LD[,-1] #remove the first row

#filtered the LD to only take LD lower than -0.7 or above 0.7
matrix_ld_filtered <- matrix_LD_format
matrix_ld_filtered[abs(matrix_LD_format) < 0.7] <- 0

# Plot the modified LD matrix using corrplot
corrplot(matrix_ld_filtered, method = "number", addrect = 2, col = c("red","white", "red"), type = "upper", tl.col = "black", tl.cex = 0.6, number.cex = 0.5, cl.pos="n")


# Save in a dataframe the SNP with LD above 0.7
LD0.7<- which(matrix_LD_format > 0.7 & matrix_LD_format, arr.ind = TRUE)

# Create a dataframe to retain snp in LD with others  
LD_df <- data.frame(
  snp_names_1 = rownames(matrix_LD_format)[LD0.7[, 1]],
  snp_names_2 = colnames(matrix_LD_format)[LD0.7[, 2]],
  LD = matrix_LD_format[LD0.7]
)

#create a new column contig and remove the rows where the SNPs in LD are in different contigs
LD_df_contig_bis <- LD_df %>%
  mutate(contig1 = sub("_[^_]*$", "", snp_names_1)) %>% 
  mutate(contig2 = sub("_[^_]*$", "", snp_names_2))%>%
  filter(contig1 == contig2)

print(nrow(LD_df)-nrow(LD_df_contig_bis))

assign(paste0("LD_df_contig_",threshold),LD_df_contig_bis)

#save
png(paste0("C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/figure/LD_matrix_SNPs_",threshold,"_set.png"));corrplot(matrix_ld_filtered, method = "number", addrect = 2, col = c("red","white", "red"), type = "upper", tl.col = "black", tl.cex = 0.6, number.cex = 0.5, cl.pos="n");dev.off()
}

## [1] 11

## [1] 0

For the less conservative (LC) set, 11 candidate SNPs were found to be in linkage disequilibrium (LD) with SNPs located on other contigs. We decided to retain these associations, since our filtering strategy focuses on removing LD within contigs rather than across different contigs. However, some of these 11 SNPs may also be in LD with other SNPs on the same contig; if this is the case, we will remove them to avoid overrepresentation of specific genomic regions.

For the more conservative (MC) set, no candidate SNPs were detected in LD with SNPs on other contigs, so no further filtering was necessary.

Now that we have a list of the LD associations between SNPs, we can proceed to remove the SNPs with weaker signals, specifically those with the highest p-values from the RDA analysis.

for(i in 1:length(list_threshold)){
  
  threshold <- list_threshold[i]
  LD_df_contig <- get(paste0("LD_df_contig_",threshold))
  
#we need to output the names of the outliers in LD to see their pvalues and keep the lowest ones
LD_snps <- data.frame(snp_names = c(LD_df_contig$snp_names_1,LD_df_contig$snp_names_2)) %>% unique() 

#load pvalues RDA
load("C:/Users/tfrancisco/Documents/Thesis/Data/Species/Taxus_baccata/genetic_new/GEA/pvalues_RDA_snp.Rdata")


p_values_RDA_snp_1 <- merge(LD_snps, pvalues_RDA_snp,"snp_names"); colnames(p_values_RDA_snp_1)=c("snp_names_1","pvalues_1")
p_values_RDA_snp_2 <- merge(LD_snps, pvalues_RDA_snp,"snp_names"); colnames(p_values_RDA_snp_2)=c("snp_names_2","pvalues_2")

pval_final_1 <- merge(LD_df_contig,p_values_RDA_snp_1,"snp_names_1")
pval_final_2 <- merge(LD_df_contig,p_values_RDA_snp_2,"snp_names_2")


pval_merge <- cbind(pval_final_1,pval_final_2[,4])

pval_merge_final <- pval_merge[,c(1,4,2,5)]; colnames(pval_merge_final) <- c("snp_names_1","pvalues_1","snp_names_2","pvalues_2")

pval_merge_final$worst_snp <- ifelse(pval_merge_final$pvalues_1 > pval_merge_final$pvalues_2, pval_merge_final$snp_names_1, pval_merge_final$snp_names_2) #select the snp with the farest pvalues from 0 (the weakest signal)

#snp to remove
SNP_to_remove <- pval_merge_final$worst_snp %>% unique()

nrow(data.frame(SNP_to_remove))

  outliers_set <- get(paste0("outliers_set_",threshold))
  
#save
outliers_set_final_overlapping_no_LD_new_var <- outliers_set[!(outliers_set$outliers %in% SNP_to_remove),]
print(nrow(outliers_set_final_overlapping_no_LD_new_var%>% as.data.frame))

assign(paste0("outliers_set_final_overlapping_no_LD_",threshold,"_new_var"),outliers_set_final_overlapping_no_LD_new_var)
}

## [1] 100
## [1] 39

Based on an LD threshold of 0.7, we removed 36 SNPs from the LC set and 19 from the MC set.

The final number of outliers retained for further analysis is 100 for the LC set and 39 for the MC set.

5 Random set of SNPs

We can also extract a random set of SNPs to calculate the genomic offset (GO) and compare them with the candidate markers. To generate this random set, we randomly select the same number of SNPs as the candidate set (less conservative set) from the full set of SNPs after removing all candidate SNPs identified by at least one GEA method.

In addition, we may wish to ensure that the selected random SNPs match the allele-frequency distribution of the outliers. This is important because the distribution of allele frequencies can influence observed patterns (analogous to GWAS). For example, if we have 44 outliers with 10 SNPs at allele frequencies 0–0.1, 22 at 0.5–0.6, and 12 at 0.9–1.0, we should select random SNPs that preserve these proportions of allele-frequency bins.

5.1 Random with same allelic frequencies

5.1.1 New random selection

New random selection with the same classes of allelic frequencies: First, we need to calculate the average allelic frequency across populations for each SNP:

load("C:/Users/tfrancisco/Documents/Thesis/Data/Species/Taxus_baccata/genetic_new/data_allelic_frequencies_29pop_adapcon_gentree_475_8616.Rdata")

genomic_matrix <- data.frame(data_allelic_frequencies_29pop_adapcon_gentree_475_8616)

#we need to keep only snps also present in the CG pop genomic data
#load CG pop genomic data
load("C:/Users/tfrancisco/Documents/Thesis/Data/Species/Taxus_baccata/genetic_new/allelic_frequencies_CG_pop.Rdata")
common_snps <- intersect(colnames(data_allelic_frequencies_29pop_adapcon_gentree_475_8616), colnames(allelic_frequencies_CG_pop))

#retained_snps 
genomic_data_outliers_common_CG <- genomic_matrix[,common_snps]

We can calculate the allelic frequency of the outliers to assign them to allelic frequency classes.

#calculation of the mean allelic frequency across populations for the outliers
genomic_data_outliers_mean <- genomic_matrix[,(colnames(genomic_matrix) %in% outliers_set_final_overlapping_no_LD_LC_new_var_df$get.paste0..outliers_set_final_overlapping_no_LD_...threshold..)] %>% colMeans() %>% as.data.frame() 
genomic_data_outliers_mean$SNP_name <- row.names(genomic_data_outliers_mean)
colnames(genomic_data_outliers_mean) <- c("mean_allelic_frequency","SNP_name")

Then we need to identify classes of distribution of allele frequencies across populations for the outliers, and select more or less the same distribution for the neutral ones.

# Define bins and labels
bins <- seq(0, 1, by = 0.1)
bin_labels <- paste(head(bins, -1), tail(bins, -1), sep = "-")

# Function to count frequencies within bins for a given row
count_bins <- function(row, bins, bin_labels) {
  bin_counts <- table(cut(row, breaks = bins, labels = bin_labels, include.lowest = TRUE))
  # Ensure all bins are present
  bin_counts <- as.data.frame(bin_counts)
  bin_counts <- bin_counts %>% mutate(Var1 = as.character(Var1))
  missing_bins <- setdiff(bin_labels, bin_counts$Var1)
  if (length(missing_bins) > 0) {
    missing_df <- data.frame(Var1 = missing_bins, Freq = 0)
    bin_counts <- rbind(bin_counts, missing_df)
  }
  # Order by bin labels
  bin_counts <- bin_counts[order(match(bin_counts$Var1, bin_labels)), ]
  return(bin_counts$Freq)
}
# Apply the function to each row (SNP)
binned_counts <- apply(data.frame(genomic_data_outliers_mean[,1]), 1, function(row) count_bins(row, bins, bin_labels))

# Convert the result to a DataFrame
binned_counts_df <- as.data.frame(t(binned_counts))
colnames(binned_counts_df) <- bin_labels

#now we can calculate the proportion of outliers for each class of allelic frequencies
proportion_outliers <- binned_counts_df %>% colSums() %>% as.data.frame
proportion_outliers$freq_class <- bin_labels
colnames(proportion_outliers) <- c("num_outliers","freq_class")

We then need to remove all SNPs identified as candidates by at least 1 GEA method to select random SNPs from this set:

#we also removed outliers snps 
#remove all the snps identified by candidates by at least 1 GEA method
all_candidates_set <- unique(list_all_candidates_snps_LC$list_all_candidates_snps_LC)
outliers_set <- unique(outliers_set_final_overlapping_no_LD_LC_new_var_df)

#df with all the candidates identified by at least 1 GEA method without the 100 outliers
list_candidates_minus_outliers <- setdiff(all_candidates_set, outliers_set$outliers_set_final_overlapping_no_LD_LC_new_var)

df_outliers <- data.frame(list_all_candidates_snps_LC)

unique_outliers <-df_outliers$list_all_candidates_snps_LC

genomic_data_wo_outliers <- genomic_data_outliers_common_CG[, !(colnames(genomic_data_outliers_common_CG) %in% unique_outliers)]

Finally, we can calculate the mean allele frequency of each population for the neutral set of SNPs, calculate their class, and randomly identify SNPs that match the class of allele frequencies of the outliers.

genomic_data_wo_outliers_mean <- genomic_data_wo_outliers %>% colMeans() %>% as.data.frame() 
genomic_data_wo_outliers_mean$name_snps <- rownames(genomic_data_wo_outliers_mean)

colnames(genomic_data_wo_outliers_mean) <- c("mean_allelic_frequency","name_snps")

# Bin the neutral SNPs into frequency classes
neutral_snps <- genomic_data_wo_outliers_mean %>%
  mutate(freq_class = cut(mean_allelic_frequency, breaks = bins, labels = bin_labels, include.lowest = TRUE))

# Randomly select the same number of neutral SNPs for each class
#set.seed(71) the one used
set.seed(73) # for reproducibility
selected_neutral_snps <- neutral_snps %>%
  group_by(freq_class) %>%
  group_map(~ slice_sample(.x, n = proportion_outliers$num_outliers[proportion_outliers$freq_class == unique(.x$freq_class)], replace = FALSE), .keep = TRUE) %>%
  bind_rows()

# Print the selected neutral SNPs
print(selected_neutral_snps)

random_set_taxus_LC <- selected_neutral_snps

random_set_taxus <- random_set_taxus_LC
#save(random_set_taxus, file="C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/random_set_taxus.Rdata")

load("C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/random_set_taxus.Rdata")

df <- merge(random_set_taxus,random_set_taxus_LC,"name_snps")

df_candidates <- data.frame("name_snps"=all_candidates_set)

check_outliers<- merge(df_candidates,random_set_taxus,"name_snps")

5.1.2 Random following allelic frequency

We also performed a new selection of random SNPs.

First, we need to calculate the mean allelic frequency across populations for each SNP:

load("C:/Users/tfrancisco/Documents/Thesis/Data/Species/Taxus_baccata/genetic_new/data_allelic_frequencies_29pop_adapcon_gentree_475_8616.Rdata")

genomic_matrix <- data.frame(data_allelic_frequencies_29pop_adapcon_gentree_475_8616)

We can calculate the allelic frequency of the outliers to assign them to classes of allelic frequency

#calculation of the mean allelic frequency across populations for the outliers
genomic_data_outliers_mean <- genomic_matrix[,(colnames(genomic_matrix) %in% outliers_set_final_overlapping_no_LD_LC_new_var_df$get.paste0..outliers_set_final_overlapping_no_LD_...threshold..)] %>% colMeans() %>% as.data.frame() 
genomic_data_outliers_mean$SNP_name <- row.names(genomic_data_outliers_mean)
colnames(genomic_data_outliers_mean) <- c("mean_allelic_frequency","SNP_name")

Then, we need to identify classes of distribution of allelic frequency across populations for the outliers and select more or less the same distribution for the neutral ones.

# Define bins and labels
bins <- seq(0, 1, by = 0.1)
bin_labels <- paste(head(bins, -1), tail(bins, -1), sep = "-")

# Function to count frequencies within bins for a given row
count_bins <- function(row, bins, bin_labels) {
  bin_counts <- table(cut(row, breaks = bins, labels = bin_labels, include.lowest = TRUE))
  # Ensure all bins are present
  bin_counts <- as.data.frame(bin_counts)
  bin_counts <- bin_counts %>% mutate(Var1 = as.character(Var1))
  missing_bins <- setdiff(bin_labels, bin_counts$Var1)
  if (length(missing_bins) > 0) {
    missing_df <- data.frame(Var1 = missing_bins, Freq = 0)
    bin_counts <- rbind(bin_counts, missing_df)
  }
  # Order by bin labels
  bin_counts <- bin_counts[order(match(bin_counts$Var1, bin_labels)), ]
  return(bin_counts$Freq)
}

# Apply the function to each row (SNP)
binned_counts <- apply(data.frame(genomic_data_outliers_mean[,1]), 1, function(row) count_bins(row, bins, bin_labels))

# Convert the result to a DataFrame
binned_counts_df <- as.data.frame(t(binned_counts))
colnames(binned_counts_df) <- bin_labels

#now we can calculate the proportion of outliers for each class of allelic frequencies
proportion_outliers <- binned_counts_df %>% colSums() %>% as.data.frame
proportion_outliers$freq_class <- bin_labels
colnames(proportion_outliers) <- c("num_outliers","freq_class")

Then, we need to remove all the snps identified as candidate by at least 1 GEA method to select random snp from this set:

#we also removed outliers snps 

#remove all the snps identified by candidates by at least 1 GEA method
all_candidates_set <- unique(list_all_candidates_snps_LC$list_all_candidates_snps_LC)
outliers_set <- unique(outliers_set_final_overlapping_no_LD_LC_new_var_df)

list_candidates_minus_outliers <- setdiff(all_candidates_set, outliers_set$outliers_set_final_overlapping_no_LD_LC_new_var)

df_outliers <- data.frame(list_all_candidates_snps_LC)

unique_outliers <-df_outliers$list_all_candidates_snps_LC %>% unique()

genomic_data_wo_outliers <- genomic_matrix[, !(colnames(genomic_matrix) %in% unique_outliers)]

#save all the outliers
#save(unique_outliers, file="C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/unique_outliers.Rdata")

Finally, we can calculate the mean allelic frequency of each populations for the neutral set of snps and calculate their class and identify randomly snps matching the classes of allelic frequencies of the outliers.

genomic_data_wo_outliers_mean <- genomic_data_wo_outliers %>% colMeans() %>% as.data.frame() 
genomic_data_wo_outliers_mean$name_snps <- rownames(genomic_data_wo_outliers_mean)

colnames(genomic_data_wo_outliers_mean) <- c("mean_allelic_frequency","name_snps")

# Bin the neutral SNPs into frequency classes
neutral_snps <- genomic_data_wo_outliers_mean %>%
  mutate(freq_class = cut(mean_allelic_frequency, breaks = bins, labels = bin_labels, include.lowest = TRUE))

# Randomly select the same number of neutral SNPs for each class
#set.seed(71) the one used
set.seed(73) # for reproducibility
selected_neutral_snps <- neutral_snps %>%
  group_by(freq_class) %>%
  group_map(~ slice_sample(.x, n = proportion_outliers$num_outliers[proportion_outliers$freq_class == unique(.x$freq_class)], replace = FALSE), .keep = TRUE) %>%
  bind_rows()

# Print the selected neutral SNPs
print(selected_neutral_snps)

random_set_taxus_not_overlap_both_dataset <- selected_neutral_snps

#random_set_taxus_not_overlap_both_dataset_V2 <- random_set_taxus_not_overlap_both_dataset

#save(random_set_taxus_not_overlap_both_dataset_V2, #file="C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/random_set_taxus_not_overlap_both_dataset_V2.Rdata")

load("C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/random_set_taxus.Rdata")

load("C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/random_set_taxus_not_overlap_both_dataset_V2.Rdata")

df <- merge(random_set_taxus,random_set_taxus_LC,"name_snps")

df_candidates <- data.frame("name_snps"=all_candidates_set)

check_outliers<- merge(df_candidates,random_set_taxus_not_overlap_both_dataset,"name_snps")

df_merge <- merge(random_set_taxus,random_set_taxus_not_overlap_both_dataset_V2,"name_snps")

5.2 Random SNPs without taking the same allelic frequencies

We can also generate a set of random SNPs without considering the effect of allelic frequency.

#We can use the dataset of snps overlapping between both dataset and without the outliers:
genomic_data_wo_outliers

Now that we have the neutral set, we can randomly select the same number of SNPs as the candidate set used in the genomic offset (GO) calculation.

#set.seed(4)
#nb_candidates <- 100
#random_neutral_set_SNPs_T_adapcon_gentree <- sample(genomic_data_wo_outliers,nb_candidates,replace = F)

#save(random_neutral_set_SNPs_T_adapcon_gentree, file="C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/random_neutral_set_SNPs_T_adapcon_gentree.Rdata")
set.seed(190)
nb_candidates <- 100
random_neutral_set_SNPs_T_adapcon_gentree_bis <- sample(genomic_data_wo_outliers,nb_candidates,replace = F)

#save(random_neutral_set_SNPs_T_adapcon_gentree_bis, file="C:/Users/tfrancisco/Documents/Thesis/Results/species/taxus/GEA_new_var/outliers/random_neutral_set_SNPs_T_adapcon_gentree_bis.Rdata")
df_selected <- data.frame(name_snps=colnames(random_neutral_set_SNPs_T_adapcon_gentree))

df_CG_pop <- data.frame("name_snps"=colnames(allelic_frequencies_CG_pop))

df_outliers <- data.frame(name_snps= df_outliers$list_all_candidates_snps)

df <- merge(df_selected,df_CG_pop,"name_snps")

df <- merge(df_selected,df_outliers,"name_snps")

Finally, all neutrally selected SNPs are included in the common garden set of outliers and are not part of the candidates identified by at least one GEA method.

devtools::session_info()

## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.3.2 (2023-10-31 ucrt)
##  os       Windows 11 x64 (build 26100)
##  system   x86_64, mingw32
##  ui       RTerm
##  language (EN)
##  collate  French_France.utf8
##  ctype    French_France.utf8
##  tz       Europe/Paris
##  date     2025-09-22
##  pandoc   3.1.1 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package        * version    date (UTC) lib source
##  bslib            0.6.1      2023-11-28 [1] CRAN (R 4.3.2)
##  cachem           1.0.8      2023-05-01 [1] CRAN (R 4.3.1)
##  cli              3.6.1      2023-03-23 [1] CRAN (R 4.3.1)
##  colorspace       2.1-0      2023-01-23 [1] CRAN (R 4.3.1)
##  combinat       * 0.0-8      2012-10-29 [1] CRAN (R 4.3.1)
##  corrplot       * 0.92       2021-11-18 [1] CRAN (R 4.3.2)
##  devtools         2.4.5      2022-10-11 [1] CRAN (R 4.3.3)
##  digest           0.6.33     2023-07-07 [1] CRAN (R 4.3.1)
##  dplyr          * 1.1.4      2023-11-17 [1] CRAN (R 4.3.2)
##  ellipsis         0.3.2      2021-04-29 [1] CRAN (R 4.3.1)
##  evaluate         0.23       2023-11-01 [1] CRAN (R 4.3.2)
##  farver           2.1.2      2024-05-13 [1] CRAN (R 4.3.3)
##  fastmap          1.1.1      2023-02-24 [1] CRAN (R 4.3.1)
##  formatR          1.14       2023-01-17 [1] CRAN (R 4.3.2)
##  fs               1.6.3      2023-07-20 [1] CRAN (R 4.3.1)
##  futile.logger  * 1.4.3      2016-07-10 [1] CRAN (R 4.3.2)
##  futile.options   1.0.1      2018-04-20 [1] CRAN (R 4.3.1)
##  gdata          * 3.0.0      2023-10-16 [1] CRAN (R 4.3.2)
##  generics         0.1.4      2025-05-09 [1] CRAN (R 4.3.2)
##  genetics       * 1.3.8.1.3  2021-03-01 [1] CRAN (R 4.3.3)
##  ggplot2          3.5.2      2025-04-09 [1] CRAN (R 4.3.2)
##  glue             1.7.0      2024-01-09 [1] CRAN (R 4.3.2)
##  gridExtra        2.3        2017-09-09 [1] CRAN (R 4.3.3)
##  gtable           0.3.6      2024-10-25 [1] CRAN (R 4.3.3)
##  gtools         * 3.9.5      2023-11-20 [1] CRAN (R 4.3.2)
##  highr            0.10       2022-12-22 [1] CRAN (R 4.3.1)
##  htmltools        0.5.7      2023-11-03 [1] CRAN (R 4.3.2)
##  htmlwidgets      1.6.4      2023-12-06 [1] CRAN (R 4.3.2)
##  httpuv           1.6.13     2023-12-06 [1] CRAN (R 4.3.2)
##  jquerylib        0.1.4      2021-04-26 [1] CRAN (R 4.3.1)
##  jsonlite         1.8.8      2023-12-04 [1] CRAN (R 4.3.2)
##  knitr            1.45       2023-10-30 [1] CRAN (R 4.3.2)
##  labeling         0.4.3      2023-08-29 [1] CRAN (R 4.3.1)
##  lambda.r         1.2.4      2019-09-18 [1] CRAN (R 4.3.2)
##  later            1.3.2      2023-12-06 [1] CRAN (R 4.3.2)
##  lifecycle        1.0.4      2023-11-07 [1] CRAN (R 4.3.2)
##  magrittr         2.0.3      2022-03-30 [1] CRAN (R 4.3.1)
##  MASS           * 7.3-60.0.1 2024-01-13 [2] CRAN (R 4.3.2)
##  memoise          2.0.1      2021-11-26 [1] CRAN (R 4.3.1)
##  mime             0.12       2021-09-28 [1] CRAN (R 4.3.1)
##  miniUI           0.1.1.1    2018-05-18 [1] CRAN (R 4.3.2)
##  munsell          0.5.1      2024-04-01 [1] CRAN (R 4.3.3)
##  mvtnorm        * 1.3-1      2024-09-03 [1] CRAN (R 4.3.3)
##  pillar           1.10.2     2025-04-05 [1] CRAN (R 4.3.3)
##  pkgbuild         1.4.3      2023-12-10 [1] CRAN (R 4.3.2)
##  pkgconfig        2.0.3      2019-09-22 [1] CRAN (R 4.3.2)
##  pkgload          1.3.4      2024-01-16 [1] CRAN (R 4.3.2)
##  plyr             1.8.9      2023-10-02 [1] CRAN (R 4.3.2)
##  profvis          0.3.8      2023-05-02 [1] CRAN (R 4.3.2)
##  promises         1.2.1      2023-08-10 [1] CRAN (R 4.3.2)
##  purrr            1.0.2      2023-08-10 [1] CRAN (R 4.3.2)
##  R6               2.6.1      2025-02-15 [1] CRAN (R 4.3.3)
##  Rcpp             1.0.11     2023-07-06 [1] CRAN (R 4.3.1)
##  remotes          2.5.0      2024-03-17 [1] CRAN (R 4.3.3)
##  rlang            1.1.3      2024-01-10 [1] CRAN (R 4.3.2)
##  rmarkdown        2.25       2023-09-18 [1] CRAN (R 4.3.1)
##  rstudioapi       0.15.0     2023-07-07 [1] CRAN (R 4.3.2)
##  sass             0.4.8      2023-12-06 [1] CRAN (R 4.3.2)
##  scales           1.3.0      2023-11-28 [1] CRAN (R 4.3.2)
##  sessioninfo      1.2.2      2021-12-06 [1] CRAN (R 4.3.2)
##  shiny            1.8.0      2023-11-17 [1] CRAN (R 4.3.2)
##  stringi          1.8.3      2023-12-11 [1] CRAN (R 4.3.2)
##  stringr          1.5.1      2023-11-14 [1] CRAN (R 4.3.2)
##  tibble           3.2.1      2023-03-20 [1] CRAN (R 4.3.2)
##  tidyr          * 1.3.1      2024-01-24 [1] CRAN (R 4.3.2)
##  tidyselect       1.2.1      2024-03-11 [1] CRAN (R 4.3.3)
##  UpSetR           1.4.0      2019-05-22 [1] CRAN (R 4.3.3)
##  urlchecker       1.0.1      2021-11-30 [1] CRAN (R 4.3.2)
##  usethis          2.2.3      2024-02-19 [1] CRAN (R 4.3.2)
##  vctrs            0.6.5      2023-12-01 [1] CRAN (R 4.3.2)
##  VennDiagram    * 1.7.3      2022-04-12 [1] CRAN (R 4.3.3)
##  withr            3.0.2      2024-10-28 [1] CRAN (R 4.3.3)
##  writexl        * 1.5.0      2024-02-09 [1] CRAN (R 4.3.2)
##  xfun             0.41       2023-11-01 [1] CRAN (R 4.3.2)
##  xtable           1.8-4      2019-04-21 [1] CRAN (R 4.3.2)
##  yaml             2.3.8      2023-12-11 [1] CRAN (R 4.3.2)
## 
##  [1] C:/Users/tfrancisco/AppData/Local/R/win-library/4.3
##  [2] C:/Users/tfrancisco/AppData/Local/Programs/R/R-4.3.2/library
## 
## ──────────────────────────────────────────────────────────────────────────────

What is below is a draft explaining the GEA methods:

Outlier_selection

Thomas Francisco

2024-04-30