rm(list = ls())
knitr::opts_chunk$set(
message = FALSE,
warning = FALSE,
cache = FALSE
)
library(vcfR)
library(reshape2)
library(ggplot2)
library(SNPfiltR)
library(RColorBrewer)
library(stringr)
library(dplyr)
library(writexl)
library(tidyr)
library(tibble)
This script filters the genetic data of Taxus baccata, obtained from 501 trees from 29 populations covering nearly the entire range of T. baccata in Europe. The same code was also applied to filter data from the five other species.
The script also generates genetic data files in formats appropriate for subsequent analyses. Two distinct filtering strategies were implemented depending on the analysis:
For population genetic structure analyses, no correction for minor allele count (MAC) was applied to T. baccata (see Francisco et al. 2025). For the other species, SNPs with MAC =< 3 were removed to filter out potential genotyping errors. This was followed, for all species, by filtering based on missing data, with no imputation of the remaining missing values.
For genotype–environment association (GEA) and other analyses, a MAC correction was applied (set to 20 for T. baccata, corresponding to twice the number of individuals in the smallest population prior to filtering, equivalent to a minor allele frequency of 2.11%; see Supplementary Methods for the other species). Missing data were then imputed using the most common genotype within each main gene pool.
vcf <- read.vcfR("Data/Species/Taxus_baccata/genetic/filtered_T_11374SNP_490IND_ADAPCON_GENTREE.vcf", verbose = FALSE,convertNA=T)
meta_data_vcf=read.csv("Data/Species/Taxus_baccata/Samples/samples_taxus_baccata_adapcon_gentree.csv",h=T,sep=";",dec=",")
meta_data_pop <- read.csv("Data/Species/Taxus_baccata/Populations/taxus_sample_29pop.csv",h=T,sep=";",dec=",")
For population structure analyses, it is important to minimise the proportion of missing data. Therefore, we applied more stringent filters for missing data. Filtering was performed using the vcfR package.
min_dp= 7
max_dp=250
Allele_balance_min= .2
Allele_balance_max= .8
Na_snp=0.85
Na_indiv=0.15
vcf_filter_structure_V2 <- vcf %>%
hard_filter(depth=min_dp ) %>%
max_depth(maxdepth =max_dp )%>%
filter_allele_balance(min.ratio=Allele_balance_min,max.ratio=Allele_balance_max)%>%
missing_by_snp(cutoff=Na_snp)%>%
missing_by_sample( cutoff = Na_indiv )
#vcfR::write.vcf(vcf_filter_structure, file = "Data/Species/Taxus_baccata/genetic_new/script_filtering(not_running_it_each_time)/bayescan.vcf.gz")
We performed two types of population structure analyses: STRUCTURE and principal component analysis (PCA). These analyses have different data input requirements so the genetic data were formatted accordingly.
To use STRUCTURE software, the VCF data stored in the vcfR object were transformed into the required format: the first row contains SNP names, and each subsequent row represents an individual, with each SNP split into two columns to represent the genotype (e.g., a genotype of 1/1 is coded as 1 in both columns). Several steps in R and Ubuntu were performed to obtain the correct format and are detailed below.
#change vcfR to dataframe
vcf_data <- vcf_filter_structure %>%
extract.gt(element="GT", IDtoRowNames = T, as.numeric=F) %>% t %>% data.frame
Genotypic information for each SNP was split into two columns using the split_columns function.
#the function
split_columns <- function(data) {
new_data <- data.frame(matrix(ncol = 0, nrow = nrow(data)))
for (col in names(data)) {
split_parts <- strsplit(as.character(data[[col]]), "/") #Split the columns based on the '/' character
max_parts <- max(sapply(split_parts, length))
for (i in 1:max_parts) {#Names the columns by the name of the snp + _part 1 or 2
new_column_name <- paste0(col, "_part", i)
new_data[[new_column_name]] <- sapply(split_parts, function(x) ifelse(length(x) >= i, x[i], NA))
}
}
row.names(new_data)<-row.names(data)
return(new_data)
}
data_split <- split_columns(vcf_data)
Next, we ensured that each SNP had a single name for the STRUCTURE input by extracting SNP names from the VCF data and modifying them as needed. Missing data (NA) were replaced with -9, and any quotation marks (“”) were removed from the text file using a text editor.
Finally, because the file was created in Windows but intended for use with Linux software, we converted it to Linux text format using Ubuntu. This was done with the command dos2unix nameofdata.txt.
Performing PCA requires genotype information, which can be provided either at the individual level or aggregated at the population level.
vcf_data_no_mac_c <- vcf_filter_structure %>%
extract.gt(element="GT", IDtoRowNames = T, as.numeric=F) %>% t %>% data.frame
We also changed the format of the genotypes information. This raw genotype format was then used to calculate the Omega matrix in BayPass.
Data_geno_no_maf_c_8252SNP_452IND <- vcf_data_no_mac_c %>%
mutate_all(funs(str_replace(., "0/0", "0"))) %>%
mutate_all(funs(str_replace(., "0/1", "1"))) %>%
mutate_all(funs(str_replace(., "1/1", "2")))
#Save for BayPass analysis
save(Data_geno_no_maf_c_8252SNP_452IND,file="Data/Species/Taxus_baccata/genetic_new/GEA/Data_geno_no_maf_c_8252SNP_452IND.Rdata" )
We added the percentage of missing data (NA) per individual to the file in order to visualise it on the PCA plots.
#Specify SNPs as numeric to calculate the percentage of missing data per individual
Dataset_PCA_8252SNP_452IND <- data.frame(row.names(vcf_data),apply(Data_geno_no_maf_c_8252SNP_452IND, 2, as.numeric))
#Calculation of NA %
na_percentage_indiv <- rowMeans(is.na(Dataset_PCA_8252SNP_452IND[,-c(1)])) * 100
Save the prepared file for PCA analysis
Dataset_PCA_8252SNP_452IND$na_percentage_indiv <- na_percentage_indiv
names(Dataset_PCA_8252SNP_452IND)[names(Dataset_PCA_8252SNP_452IND) == 'row.names.vcf_data.'] <- 'VCF_ID'
#Save the genetic dataset
save(Dataset_PCA_8252SNP_452IND,file="Data/Species/Taxus_baccata/genetic_new/structure/Dataset_PCA_8252SNP_452IND.Rdata")
Gen_matrix_non_imputed_T_adapcon_gentree_8252_452 <- Dataset_PCA_8252SNP_452IND[,-8254]#remove VCF_ID name
#Data for allelic frequencies
data_frequencies <-merge(meta_data_vcf,Gen_matrix_non_imputed_T_adapcon_gentree_8252_452,"VCF_ID");row.names(data_frequencies) <- data_frequencies$VCF_ID
#Formatting the genomic data
data_frequencies_num <- data_frequencies[,-c(1:3)] %>%
apply(2,as.numeric) /2 #we divided by 2 because of the format of genomic data: 0,1,2 and for allelic frequencies
#The dataset was prepared with all relevant genomic information in the correct format
data_frequencies_num_tot <- data.frame(data_frequencies[,c(1:3)],data_frequencies_num)
#Calculation of allele frequencies
allelic_frequencies <-data_frequencies_num_tot %>% dplyr::select(-c("VCF_ID","Country","Population")) %>% #Non-genomic data were removed
group_by(data_frequencies_num_tot$Population) %>%
summarise_at(vars(everything()),funs(mean),na.rm=T) %>% #Mean allele frequency for each SNP per population was calculated
ungroup() %>%
as.data.frame()
Save the dataset of allelic frequencies
data_allelic_frequencies_non_imp_T_adapcon_gentree_452_8252snps <- allelic_frequencies %>% dplyr::select(-c("data_frequencies_num_tot$Population")); row.names(data_allelic_frequencies_non_imp_T_adapcon_gentree_452_8252snps) <- allelic_frequencies$'data_frequencies_num_tot$Population'
#Save
save(data_allelic_frequencies_non_imp_T_adapcon_gentree_452_8252snps,file="Data/Species/Taxus_baccata/genetic_new/data_allelic_frequencies_non_imp_T_adapcon_gentree_452_8252snps.Rdata")
For genotype–environment association (GEA) and other landscape
genomic analyses, we applied less stringent filtering for missing data,
as the performance of these approaches appears to improve with
increasing numbers of SNPs and individuals (Santos & Gaiotto, 2020).
Remaining missing data were imputed using the most common genotype
within each main gene pool identified through STRUCTURE clustering
(Pritchard et al., 2000) and evaluated using the Evanno method (Evanno
et al., 2005), as most GEA methods do not accommodate missing
data.
Additionally, to reduce the risk of false positives associated with
low-frequency alleles, SNPs with a minor allele count lower than twice
the number of individuals in the smallest population prior to filtering
were removed.
min_dp= 7
max_dp=250
Allele_balance_min= .2
Allele_balance_max= .8
mac=20
Na_snp=0.7
Na_indiv=0.3
vcf_filter_GEA <- vcf %>%
hard_filter(depth= min_dp) %>%
max_depth(maxdepth =max_dp )%>%
filter_allele_balance(min.ratio=Allele_balance_min,max.ratio=Allele_balance_max) %>%
min_mac(min.mac = mac) %>%
missing_by_snp(cutoff=Na_snp) %>%
missing_by_sample( cutoff = Na_indiv)
We converted the VCF data to the format required for GEA analyses. First, only the genotype information was retained from the vcfR object.
vcf_data_GEA <- vcf_filter_GEA %>%
extract.gt(element="GT", IDtoRowNames = T, as.numeric=F) %>% t %>% data.frame
The dataset was also converted from nucleotide information to genotype format.
genetic_data_geno <- vcf_data_GEA %>%
mutate_all(funs(str_replace(., "0/0", "0"))) %>%
mutate_all(funs(str_replace(., "0/1", "1"))) %>%
mutate_all(funs(str_replace(., "1/1", "2")))
Missing data were imputed using the most common genotype within each main gene pool. For T. baccata and P. pinea, main gene pools were extracted from previously published studies (See Main Text).
#Merge info
meta_data_tot <- merge(meta_data_vcf,meta_data_pop,"Population")
###Imputation steps###
#Keep ID info
gen_grouped <- genetic_data_geno %>% tibble::rownames_to_column(var = "VCF_ID")
list_indiv_427 <- list(gen_grouped$VCF_ID)
#A metadata file was created containing only the individuals retained after the filtering steps.
meta_data_vcf_imputation <- meta_data_tot[meta_data_tot$VCF_ID %in% gen_grouped$VCF_ID,]
gen_grouped_num <- data.frame(gen_grouped[,c("VCF_ID")],apply(gen_grouped[,c(2:8617)], 2, as.numeric))
#Name of VCF_ID to merge
names(gen_grouped_num)[names(gen_grouped_num) == 'gen_grouped...c..VCF_ID...'] <- 'VCF_ID'
#Total dataset
gen_grouped_num_tot <- merge(meta_data_vcf_imputation[,-c(4:8)],gen_grouped_num,"VCF_ID")
#Perform the imputation by transforming the NAs into the most common genotype per gene pool
gen_imp <- gen_grouped_num_tot %>%#Gene pool
group_by(main_gene_pool) %>%
apply(2, function(x) replace(x, is.na(x), as.numeric(names(which.max(table(x)))))) %>%
as.data.frame() %>%
ungroup()
rownames(gen_imp) <- gen_imp$VCF_ID
gen_matrix <- gen_imp[,-c(1:4)] #Remove ID,country and main gene pool columns from data frame
#We have an issues with space added during imputation, we removed them:
Gen_matrix_imp_T_Adapcon_Gentree_475_8616 <- gen_matrix %>%
mutate(across(where(is.character), str_trim))
##Check the proportion of missing data before and after imputation
prop.table(table(is.na(gen_grouped_num_tot)))
##
## FALSE TRUE
## 0.91481084 0.08518916
prop.table(table(is.na(Gen_matrix_imp_T_Adapcon_Gentree_475_8616))) #Should be 0
##
## FALSE
## 1
Two types of genomic datasets were saved for downstream
analyses:
- individual-level genomic data
- population-level genomic data
Both datasets can serve as input for GEA or other genomic analyses. In this study, we used the population-level datasets except for LFMM.
save(Gen_matrix_imp_T_Adapcon_Gentree_475_8616,file="Data/Species/Taxus_baccata/genetic_new/Gen_matrix_imp_T_Adapcon_Gentree_475_8616.Rdata")
#We also saved a version with the nucleotide format : 0/0, 0/1 and 1/1 for baypass
Gen_matrix_imp_T_Adapcon_Gentree_475_8616_BAYPASS <- Gen_matrix_imp_T_Adapcon_Gentree_475_8616 %>%
mutate_all(funs(str_replace(.,"0","0/0"))) %>%
mutate_all(funs(str_replace(., "1","0/1"))) %>%
mutate_all(funs(str_replace(., "2", "1/1")))
save(Gen_matrix_imp_T_Adapcon_Gentree_475_8616_BAYPASS,file="Data/Species/Taxus_baccata/genetic_new/GEA/Gen_matrix_imp_T_Adapcon_Gentree_475_8616_BAYPASS.Rdata")
To perform analyses at the population level, allele frequencies were calculated for each SNP within each population.
Gen_matrix_imp_T_Adapcon_Gentree_475_8616$VCF_ID <- row.names(Gen_matrix_imp_T_Adapcon_Gentree_475_8616)
#Metadata
meta_data_vcf_475 <- meta_data_vcf[meta_data_vcf$VCF_ID %in% row.names(Gen_matrix_imp_T_Adapcon_Gentree_475_8616),]
#Data for allelic frequencies
data_frequencies <-merge(meta_data_vcf_475,Gen_matrix_imp_T_Adapcon_Gentree_475_8616,"VCF_ID");row.names(data_frequencies) <- data_frequencies$VCF_ID
#Formatting the genomic data
data_frequencies_num <- data_frequencies[,-c(1:3)] %>% #Keeping only the SNPs
apply(2,as.numeric) /2 #we divided by 2 because of the format of genomic data: 0,1,2 and for allelic frequencies and we want 0,0.5, 1
#Dataset with all information and genomic data in the right format
data_frequencies_num_tot <- data.frame(data_frequencies[,c(1:3)],data_frequencies_num)
#Calculation of allelic frequencies
allelic_frequencies <-data_frequencies_num_tot %>% dplyr::select(-c("VCF_ID","Country","Population")) %>%
group_by(data_frequencies_num_tot$Population) %>%
summarise_at(vars(everything()),funs(mean),na.rm=T) %>% #Calculate the mean for each SNP per population
ungroup() %>%
as.data.frame()
Save the allelic frequencies dataset
#Move population level to row.names
data_allelic_frequencies_29pop_adapcon_gentree_475_8616 <- allelic_frequencies %>% dplyr::select(-c("data_frequencies_num_tot$Population")); row.names(data_allelic_frequencies_29pop_adapcon_gentree_475_8616) <- allelic_frequencies$'data_frequencies_num_tot$Population'
#Save
save(data_allelic_frequencies_29pop_adapcon_gentree_475_8616,file="Data/Species/Taxus_baccata/genetic_new/data_allelic_frequencies_29pop_adapcon_gentree_475_8616.Rdata")
Session info
packages <- c(
"vcfR", "reshape2", "ggplot2", "SNPfiltR",
"RColorBrewer", "stringr", "dplyr",
"writexl", "tidyr", "tibble"
)
info <- sessionInfo()
packages_used <- info$otherPkgs[packages]
data.frame(
Package = names(packages_used),
Version = sapply(packages_used, function(x) x$Version)
)
## Package Version
## vcfR vcfR 1.15.0
## reshape2 reshape2 1.4.4
## ggplot2 ggplot2 3.5.2
## SNPfiltR SNPfiltR 1.0.1
## RColorBrewer RColorBrewer 1.1-3
## stringr stringr 1.5.1
## dplyr dplyr 1.1.4
## writexl writexl 1.5.0
## tidyr tidyr 1.3.1
## tibble tibble 3.2.1