rm(list = ls())
knitr::opts_chunk$set(
    message = FALSE,
    warning = FALSE,
    cache = FALSE
)

library(vcfR)
library(reshape2)
library(ggplot2)
library(SNPfiltR)
library(RColorBrewer)
library(stringr)
library(dplyr)
library(writexl)
library(tidyr)
library(tibble)

1 Introduction

This script filters the genetic data of Taxus baccata, obtained from 501 trees from 29 populations covering nearly the entire range of T. baccata in Europe. The same code was also applied to filter data from the five other species.

The script also generates genetic data files in formats appropriate for subsequent analyses. Two distinct filtering strategies were implemented depending on the analysis:

For population genetic structure analyses, no correction for minor allele count (MAC) was applied to T. baccata (see Francisco et al. 2025). For the other species, SNPs with MAC =< 3 were removed to filter out potential genotyping errors. This was followed, for all species, by filtering based on missing data, with no imputation of the remaining missing values.

For genotype–environment association (GEA) and other analyses, a MAC correction was applied (set to 20 for T. baccata, corresponding to twice the number of individuals in the smallest population prior to filtering, equivalent to a minor allele frequency of 2.11%; see Supplementary Methods for the other species). Missing data were then imputed using the most common genotype within each main gene pool.

2 Population stucture filtering

2.1 Data

vcf <- read.vcfR("Data/Species/Taxus_baccata/genetic/filtered_T_11374SNP_490IND_ADAPCON_GENTREE.vcf", verbose = FALSE,convertNA=T)
meta_data_vcf=read.csv("Data/Species/Taxus_baccata/Samples/samples_taxus_baccata_adapcon_gentree.csv",h=T,sep=";",dec=",")

meta_data_pop <- read.csv("Data/Species/Taxus_baccata/Populations/taxus_sample_29pop.csv",h=T,sep=";",dec=",")

2.2 Filtering steps

For population structure analyses, it is important to minimise the proportion of missing data. Therefore, we applied more stringent filters for missing data. Filtering was performed using the vcfR package.

min_dp= 7
max_dp=250
Allele_balance_min= .2
Allele_balance_max= .8
Na_snp=0.85
Na_indiv=0.15

vcf_filter_structure_V2 <- vcf %>% 
  hard_filter(depth=min_dp ) %>% 
  max_depth(maxdepth =max_dp )%>% 
  filter_allele_balance(min.ratio=Allele_balance_min,max.ratio=Allele_balance_max)%>% 
  missing_by_snp(cutoff=Na_snp)%>%
  missing_by_sample( cutoff = Na_indiv )

#vcfR::write.vcf(vcf_filter_structure, file = "Data/Species/Taxus_baccata/genetic_new/script_filtering(not_running_it_each_time)/bayescan.vcf.gz")

2.3 Formatting the genetic data

We performed two types of population structure analyses: STRUCTURE and principal component analysis (PCA). These analyses have different data input requirements so the genetic data were formatted accordingly.

2.3.1 STRUCTURE software

To use STRUCTURE software, the VCF data stored in the vcfR object were transformed into the required format: the first row contains SNP names, and each subsequent row represents an individual, with each SNP split into two columns to represent the genotype (e.g., a genotype of 1/1 is coded as 1 in both columns). Several steps in R and Ubuntu were performed to obtain the correct format and are detailed below.

#change vcfR to dataframe
vcf_data <- vcf_filter_structure %>% 
  extract.gt(element="GT", IDtoRowNames = T, as.numeric=F) %>% t %>% data.frame

Genotypic information for each SNP was split into two columns using the split_columns function.

#the function
split_columns <- function(data) {
  new_data <- data.frame(matrix(ncol = 0, nrow = nrow(data)))  
  
  for (col in names(data)) { 
    split_parts <- strsplit(as.character(data[[col]]), "/") #Split the columns based on the '/' character
    max_parts <- max(sapply(split_parts, length))
    
    for (i in 1:max_parts) {#Names the columns by the name of the snp + _part 1 or 2
      new_column_name <- paste0(col, "_part", i)
      new_data[[new_column_name]] <- sapply(split_parts, function(x) ifelse(length(x) >= i, x[i], NA))
    }
  }
  row.names(new_data)<-row.names(data)
  return(new_data)
}

data_split <- split_columns(vcf_data)

Next, we ensured that each SNP had a single name for the STRUCTURE input by extracting SNP names from the VCF data and modifying them as needed. Missing data (NA) were replaced with -9, and any quotation marks (“”) were removed from the text file using a text editor.

Finally, because the file was created in Windows but intended for use with Linux software, we converted it to Linux text format using Ubuntu. This was done with the command dos2unix nameofdata.txt.

2.3.2 PCA

Performing PCA requires genotype information, which can be provided either at the individual level or aggregated at the population level.

2.3.2.1 Individual level

vcf_data_no_mac_c <- vcf_filter_structure %>% 
  extract.gt(element="GT", IDtoRowNames = T, as.numeric=F) %>% t %>% data.frame

We also changed the format of the genotypes information. This raw genotype format was then used to calculate the Omega matrix in BayPass.

Data_geno_no_maf_c_8252SNP_452IND <- vcf_data_no_mac_c %>% 
  mutate_all(funs(str_replace(., "0/0", "0"))) %>% 
  mutate_all(funs(str_replace(., "0/1", "1"))) %>% 
  mutate_all(funs(str_replace(., "1/1", "2")))

#Save for BayPass analysis 
save(Data_geno_no_maf_c_8252SNP_452IND,file="Data/Species/Taxus_baccata/genetic_new/GEA/Data_geno_no_maf_c_8252SNP_452IND.Rdata" )

We added the percentage of missing data (NA) per individual to the file in order to visualise it on the PCA plots.

#Specify SNPs as numeric to calculate the percentage of missing data per individual
Dataset_PCA_8252SNP_452IND <- data.frame(row.names(vcf_data),apply(Data_geno_no_maf_c_8252SNP_452IND, 2, as.numeric))

#Calculation of NA %
na_percentage_indiv <- rowMeans(is.na(Dataset_PCA_8252SNP_452IND[,-c(1)])) * 100

Save the prepared file for PCA analysis

Dataset_PCA_8252SNP_452IND$na_percentage_indiv <- na_percentage_indiv
names(Dataset_PCA_8252SNP_452IND)[names(Dataset_PCA_8252SNP_452IND) == 'row.names.vcf_data.'] <- 'VCF_ID'

#Save the genetic dataset 
save(Dataset_PCA_8252SNP_452IND,file="Data/Species/Taxus_baccata/genetic_new/structure/Dataset_PCA_8252SNP_452IND.Rdata")

2.3.2.2 Population level

Gen_matrix_non_imputed_T_adapcon_gentree_8252_452 <- Dataset_PCA_8252SNP_452IND[,-8254]#remove VCF_ID name

#Data for allelic frequencies 
data_frequencies <-merge(meta_data_vcf,Gen_matrix_non_imputed_T_adapcon_gentree_8252_452,"VCF_ID");row.names(data_frequencies) <- data_frequencies$VCF_ID

#Formatting the genomic data
data_frequencies_num <- data_frequencies[,-c(1:3)] %>%
  apply(2,as.numeric) /2 #we divided by 2 because of the format of genomic data: 0,1,2 and for allelic frequencies

#The dataset was prepared with all relevant genomic information in the correct format
data_frequencies_num_tot <- data.frame(data_frequencies[,c(1:3)],data_frequencies_num)

#Calculation of allele frequencies
allelic_frequencies <-data_frequencies_num_tot %>% dplyr::select(-c("VCF_ID","Country","Population")) %>% #Non-genomic data were removed
  group_by(data_frequencies_num_tot$Population) %>% 
  summarise_at(vars(everything()),funs(mean),na.rm=T) %>% #Mean allele frequency for each SNP per population was calculated
    ungroup() %>%
as.data.frame()

Save the dataset of allelic frequencies

data_allelic_frequencies_non_imp_T_adapcon_gentree_452_8252snps <- allelic_frequencies %>% dplyr::select(-c("data_frequencies_num_tot$Population")); row.names(data_allelic_frequencies_non_imp_T_adapcon_gentree_452_8252snps) <- allelic_frequencies$'data_frequencies_num_tot$Population'

#Save
save(data_allelic_frequencies_non_imp_T_adapcon_gentree_452_8252snps,file="Data/Species/Taxus_baccata/genetic_new/data_allelic_frequencies_non_imp_T_adapcon_gentree_452_8252snps.Rdata")

3 Genotype-environment association (GEA) analyses and other analyses filering

For genotype–environment association (GEA) and other landscape genomic analyses, we applied less stringent filtering for missing data, as the performance of these approaches appears to improve with increasing numbers of SNPs and individuals (Santos & Gaiotto, 2020). Remaining missing data were imputed using the most common genotype within each main gene pool identified through STRUCTURE clustering (Pritchard et al., 2000) and evaluated using the Evanno method (Evanno et al., 2005), as most GEA methods do not accommodate missing data.
Additionally, to reduce the risk of false positives associated with low-frequency alleles, SNPs with a minor allele count lower than twice the number of individuals in the smallest population prior to filtering were removed.

3.1 Filtering steps

min_dp= 7
max_dp=250
Allele_balance_min= .2
Allele_balance_max= .8
mac=20
Na_snp=0.7
Na_indiv=0.3

vcf_filter_GEA <- vcf %>% 
  hard_filter(depth= min_dp) %>% 
   max_depth(maxdepth =max_dp )%>%
  filter_allele_balance(min.ratio=Allele_balance_min,max.ratio=Allele_balance_max) %>% 
  min_mac(min.mac = mac) %>% 
  missing_by_snp(cutoff=Na_snp) %>%
  missing_by_sample( cutoff = Na_indiv)

3.2 Imputing genetic data

We converted the VCF data to the format required for GEA analyses. First, only the genotype information was retained from the vcfR object.

vcf_data_GEA <- vcf_filter_GEA %>% 
  extract.gt(element="GT", IDtoRowNames = T, as.numeric=F) %>% t %>% data.frame

The dataset was also converted from nucleotide information to genotype format.

genetic_data_geno <- vcf_data_GEA %>% 
  mutate_all(funs(str_replace(., "0/0", "0"))) %>% 
  mutate_all(funs(str_replace(., "0/1", "1"))) %>% 
  mutate_all(funs(str_replace(., "1/1", "2")))

Missing data were imputed using the most common genotype within each main gene pool. For T. baccata and P. pinea, main gene pools were extracted from previously published studies (See Main Text).

#Merge info
meta_data_tot <- merge(meta_data_vcf,meta_data_pop,"Population")

###Imputation steps###

#Keep ID info
gen_grouped <- genetic_data_geno %>% tibble::rownames_to_column(var = "VCF_ID")
list_indiv_427 <- list(gen_grouped$VCF_ID)

#A metadata file was created containing only the individuals retained after the filtering steps.
meta_data_vcf_imputation <- meta_data_tot[meta_data_tot$VCF_ID %in% gen_grouped$VCF_ID,] 

gen_grouped_num <- data.frame(gen_grouped[,c("VCF_ID")],apply(gen_grouped[,c(2:8617)], 2, as.numeric))

#Name of VCF_ID to merge 
names(gen_grouped_num)[names(gen_grouped_num) == 'gen_grouped...c..VCF_ID...'] <- 'VCF_ID'

#Total dataset
gen_grouped_num_tot <- merge(meta_data_vcf_imputation[,-c(4:8)],gen_grouped_num,"VCF_ID")

#Perform the imputation by transforming the NAs into the most common genotype per gene pool
gen_imp <- gen_grouped_num_tot %>%#Gene pool
  group_by(main_gene_pool) %>%
apply(2, function(x) replace(x, is.na(x), as.numeric(names(which.max(table(x)))))) %>%
  as.data.frame() %>% 
  ungroup()

rownames(gen_imp) <- gen_imp$VCF_ID
gen_matrix <- gen_imp[,-c(1:4)]  #Remove ID,country and main gene pool columns from data frame

#We have an issues with space added during imputation, we removed them: 
Gen_matrix_imp_T_Adapcon_Gentree_475_8616 <- gen_matrix %>% 
  mutate(across(where(is.character), str_trim))

##Check the proportion of missing data before and after imputation
prop.table(table(is.na(gen_grouped_num_tot)))

## 
##      FALSE       TRUE 
## 0.91481084 0.08518916

prop.table(table(is.na(Gen_matrix_imp_T_Adapcon_Gentree_475_8616))) #Should be 0

## 
## FALSE 
##     1

3.3 Formatting genetic data

Two types of genomic datasets were saved for downstream analyses:
- individual-level genomic data
- population-level genomic data

Both datasets can serve as input for GEA or other genomic analyses. In this study, we used the population-level datasets except for LFMM.

3.3.1 Individual-level genomic data

save(Gen_matrix_imp_T_Adapcon_Gentree_475_8616,file="Data/Species/Taxus_baccata/genetic_new/Gen_matrix_imp_T_Adapcon_Gentree_475_8616.Rdata")

#We also saved a version with the nucleotide format : 0/0, 0/1 and 1/1 for baypass
Gen_matrix_imp_T_Adapcon_Gentree_475_8616_BAYPASS <- Gen_matrix_imp_T_Adapcon_Gentree_475_8616 %>% 
  mutate_all(funs(str_replace(.,"0","0/0"))) %>% 
  mutate_all(funs(str_replace(., "1","0/1"))) %>% 
  mutate_all(funs(str_replace(., "2", "1/1")))

save(Gen_matrix_imp_T_Adapcon_Gentree_475_8616_BAYPASS,file="Data/Species/Taxus_baccata/genetic_new/GEA/Gen_matrix_imp_T_Adapcon_Gentree_475_8616_BAYPASS.Rdata")

3.3.2 Population-level genomic data

To perform analyses at the population level, allele frequencies were calculated for each SNP within each population.

Gen_matrix_imp_T_Adapcon_Gentree_475_8616$VCF_ID <- row.names(Gen_matrix_imp_T_Adapcon_Gentree_475_8616)

#Metadata
meta_data_vcf_475 <- meta_data_vcf[meta_data_vcf$VCF_ID %in% row.names(Gen_matrix_imp_T_Adapcon_Gentree_475_8616),]

#Data for allelic frequencies 
data_frequencies <-merge(meta_data_vcf_475,Gen_matrix_imp_T_Adapcon_Gentree_475_8616,"VCF_ID");row.names(data_frequencies) <- data_frequencies$VCF_ID

#Formatting the genomic data
data_frequencies_num <- data_frequencies[,-c(1:3)] %>% #Keeping only the SNPs
  apply(2,as.numeric) /2 #we divided by 2 because of the format of genomic data: 0,1,2 and for allelic frequencies and we want 0,0.5, 1

#Dataset with all information and genomic data in the right format
data_frequencies_num_tot <- data.frame(data_frequencies[,c(1:3)],data_frequencies_num)

#Calculation of allelic frequencies
allelic_frequencies <-data_frequencies_num_tot %>% dplyr::select(-c("VCF_ID","Country","Population")) %>%
  group_by(data_frequencies_num_tot$Population) %>%
  summarise_at(vars(everything()),funs(mean),na.rm=T) %>% #Calculate the mean for each SNP per population
    ungroup() %>%
as.data.frame()

Save the allelic frequencies dataset

#Move population level to row.names
data_allelic_frequencies_29pop_adapcon_gentree_475_8616 <- allelic_frequencies %>% dplyr::select(-c("data_frequencies_num_tot$Population")); row.names(data_allelic_frequencies_29pop_adapcon_gentree_475_8616) <- allelic_frequencies$'data_frequencies_num_tot$Population'

#Save
save(data_allelic_frequencies_29pop_adapcon_gentree_475_8616,file="Data/Species/Taxus_baccata/genetic_new/data_allelic_frequencies_29pop_adapcon_gentree_475_8616.Rdata")

Session info

packages <- c(
  "vcfR", "reshape2", "ggplot2", "SNPfiltR",
  "RColorBrewer", "stringr", "dplyr",
  "writexl", "tidyr", "tibble"
)

info <- sessionInfo()
packages_used <- info$otherPkgs[packages]
data.frame(
  Package = names(packages_used),
  Version = sapply(packages_used, function(x) x$Version)
)

##                   Package Version
## vcfR                 vcfR  1.15.0
## reshape2         reshape2   1.4.4
## ggplot2           ggplot2   3.5.2
## SNPfiltR         SNPfiltR   1.0.1
## RColorBrewer RColorBrewer   1.1-3
## stringr           stringr   1.5.1
## dplyr               dplyr   1.1.4
## writexl           writexl   1.5.0
## tidyr               tidyr   1.3.1
## tibble             tibble   3.2.1

Genetic_filtering

Thomas Francisco

2024-03-11