rm(list = ls())
knitr::opts_chunk$set(
    message = FALSE,
    warning = FALSE,
    cache = FALSE
)

1 Introduction

In this script, we analysed the population genetic structure of T. baccata using the population structure dataset, which contains 8,252 SNPs, 452 trees, and 4.37% missing data. Population structure was assessed using principal component analysis (PCA) and STRUCTURE software. This script was also applied to investigate the genetic structure of the other species.

#Load metadata
meta_data_vcf <- read.csv("Data/Species/Taxus_baccata/Samples/samples_taxus_baccata_adapcon_gentree.csv",h=T,sep=";",dec=",")
meta_data_pop <- read.csv("Data/Species/Taxus_baccata/Populations/taxus_sample_29pop.csv",h=T,sep=";",dec=",")
meta_data_pop_order <- meta_data_pop[order(meta_data_pop$Population),]

2 PCA

First, we performed a PCA to examine population structure at both the individual and population levels. We also assessed whether the structure of certain individuals or populations was influenced by the percentage of missing data. To do this, point sizes on the PCA plots were scaled according to the proportion of missing data, while point positions reflected genetic composition. PCA at the individual level was conducted using the PCADAPT function, and at the population level using RDA without covariables (effectively performing PCA).

2.1 Individual-level

We first performed a scree plot analysis to determine how much of the genetic variation is explained by the PCs axis.

#Selection number of axis => screeplot
#Initial parameters
load(file = "Data/Species/Taxus_baccata/genetic_new/structure/Dataset_PCA_8252SNP_452IND.Rdata")
gen_data_new <- Dataset_PCA_8252SNP_452IND[,-1]; row.names(gen_data_new) <- Dataset_PCA_8252SNP_452IND$VCF_ID
data <- gen_data_new
k <- 10 

#Function
screeplot_genetic <- function(data,k){
  
  data_PCa <- data  %>% dplyr::select(-c("na_percentage_indiv")) %>% t() %>% data.frame()
#Format for pcadapt
data_pcadapt <- read.pcadapt(data_PCa)

#Perform the Pca
Pcadapt_results <-pcadapt(data_pcadapt,K=k,method = "mahalanobis")

#Choose the number of axis
 screeplot_data <- data.frame(
    PC_axis = seq_along(Pcadapt_results$singular.values),
    Explained_variance = Pcadapt_results$singular.values^2*100
  )
 
 screeplot_data$PC_axis <- as.factor(screeplot_data$PC_axis)

  gg_screeplot <- ggplot(screeplot_data, aes(x = PC_axis, y = Explained_variance)) +
    geom_bar(stat = "identity", fill = "darkgrey") +
    labs(title = "PCA Scree Plot", x = "Principal Component", y = "Explained Variance (%)") +
    theme_minimal(base_size = 14) +
    theme(plot.title = element_text(hjust = 0.5, face = "bold"))
  
  #  geom_bar(stat = "identity") +
  # labs(x = "PC axis", y = "Explained Variance (%)") +
   # theme_minimal()
  
  #Display the screeplot
  gg_screeplot
  print(gg_screeplot)
  
  #Save
pdf("Results/species/taxus/STRUCTURE/PCA/screeplot_PCA_indiv_level.pdf");print(gg_screeplot);dev.off()
  return(Pcadapt_results)
}
Pcadapt_object <- screeplot_genetic(data,k)

Plot of the PCA.

#Meta_data
meta_data_vcf_452 <- meta_data_vcf[meta_data_vcf$VCF_ID %in% row.names(gen_data_new),]

#Initial parameters
data <-Pcadapt_object 
names <- gen_data_new
axis <- 2 
meta_data <- meta_data_vcf_452

####Color per country

#For loop to do it for multiple PC axis like 1-2 but also 1-3, 2-3.
 Pca_genetic <- function(data, names, num_axes, meta_data) {
   #We can create 2 loops that would perform the Pca_function for each combination of axis retained: 
  for (i in 1:(num_axes - 1)) { #Here, the first loop will select axis between 1 and numb_axis-1 so basically, if we take 3 axis, i will take values of 1 then 2, so PCx will ether be 1 or 2.
    for (j in (i + 1):num_axes) { #Here, the second loop within the first one will take the values of j from i +1 to num axis so if num axis=3, we will have j ranging from 2 to 3 when i=1 and j=3 when i =2
      PCx <- i
      PCy <- j
      
      #Calculate explained variance for chosen PCs
      explained_variance <- data.frame(t(round((data$singular.values[PCx:PCy]^2) * 100, digits = 1)))
      colnames(explained_variance) <- c("PCx", "PCy")
      
      #store the scores along the retained PC axis in a dataframe
      score_Pca_imputed <- data.frame(data$scores[, c(PCx,PCy)], row.names (names))
      colnames(score_Pca_imputed) <- c(paste0("PC", PCx), paste0("PC", PCy), "VCF_ID")
      
      #Add country, population information
      PCa_df_imputed <- merge(score_Pca_imputed, meta_data, "VCF_ID")
      
      #Genetic PCA
      ggplot_representation <- ggplot() +
        geom_point(data = PCa_df_imputed, aes(PCa_df_imputed[,2],PCa_df_imputed[,3],size = names$na_percentage, color = Country)) +
        scale_colour_manual(name = "Country",
                            values = c("orangered3", "gold2", "darkorchid3", "navyblue", "turquoise2", "green3", "blue", "red", "black", "gray", "orange", "darkgreen"),guide = guide_legend(override.aes = list(size = 4))) +
        scale_size(name = "Missing data (%)", breaks = c(0, 5, 10, 14), labels = c("0", "5", "10", "15")) +
        xlab(paste0("PC", PCx, " ", "(", explained_variance$PCx, "%", ")")) + 
        ylab(paste0("PC", PCy, " ", "(", explained_variance$PCy, "%", ")"))+
        theme_bw(base_size = 11) +
        theme(legend.position = "right", 
              legend.title.align = 0.5,
              panel.grid = element_blank(), 
              strip.text = element_text(size = 11),
              plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
              axis.title.x = element_text(size = 14),
              axis.title.y = element_text(size = 14),
              axis.text.x = element_text(size = 12),
              axis.text.y = element_text(size = 12),
              legend.title = element_text(size = 14),
              legend.text = element_text(size = 12),
              legend.key = element_rect(fill = "transparent", color = NA))
      
      print(ggplot_representation)
#Save
pdf("Results/species/taxus/STRUCTURE/PCA/PCA_indiv_level_color_countries.pdf");print(ggplot_representation);dev.off()      
    }
  }
}
 
#Run the function 
Pca_genetic(data,names,axis,meta_data)

####Color per population

Pca_genetic <- function(data, names, num_axes, meta_data) {
   
  for (i in 1:(num_axes - 1)) { 
    for (j in (i + 1):num_axes) { 
      
      PCx <- i
      PCy <- j
      
      explained_variance <- data.frame(t(round((data$singular.values[PCx:PCy]^2) * 100, digits = 1)))
      colnames(explained_variance) <- c("PCx", "PCy")
      
      score_Pca_imputed <- data.frame(data$scores[, c(PCx,PCy)], row.names (names))
      colnames(score_Pca_imputed) <- c(paste0("PC", PCx), paste0("PC", PCy), "VCF_ID")
      
      PCa_df_imputed <- merge(score_Pca_imputed, meta_data, "VCF_ID")
      
      ggplot_representation <- ggplot() +
        geom_point(data = PCa_df_imputed, aes(PCa_df_imputed[,2],PCa_df_imputed[,3],size = names$na_percentage, color = Population)) +
        scale_color_manual(name="Population", values=c("#1f77b4","navyblue", "#2ca02c", "#d62728", "#9467bd",
            "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf",
            "blue", "#ffbb78", "#98df8a", "#ff9896", "#c5b0d5",
            "#c49c94", "#f7b6d2", "black", "#dbdb8d", "#9edae5",
            "darkgreen", "#c49c94", "#c7c7c7","orangered3","green3" ,
            "gold2","white", "#ff7f0e","darkorchid3" )
            ,guide = guide_legend(override.aes = list(size = 4))) +
        scale_size(name = "Missing data (%)", breaks = c(0, 5, 10, 14), labels = c("0", "5", "10", "15")) +
        xlab(paste0("PC", PCx, " ", "(", explained_variance$PCx, "%", ")")) + 
        ylab(paste0("PC", PCy, " ", "(", explained_variance$PCy, "%", ")"))+
        theme_bw(base_size = 11) +
        theme(legend.position = "right", 
              legend.title.align = 0.5,  # Center the title above the legend
              panel.grid = element_blank(), 
              strip.text = element_text(size = 11),
              plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
              axis.title.x = element_text(size = 14),
              axis.title.y = element_text(size = 14),
              axis.text.x = element_text(size = 12),
              axis.text.y = element_text(size = 12),
              legend.title = element_text(size = 14),
              legend.text = element_text(size = 12),
              legend.key = element_rect(fill = "transparent", color = NA))
      
      print(ggplot_representation)
      #Save
pdf("Results/species/taxus/STRUCTURE/PCA/PCA_indiv_level_color_populations.pdf");print(ggplot_representation);dev.off()    
    }
  }
}

#run the function 
Pca_genetic(data,names,axis,meta_data)

Without the percentage of missing data

#Meta_data
meta_data_vcf_452 <- meta_data_vcf[meta_data_vcf$VCF_ID %in% row.names(gen_data_new),]

#Initial parameters
data <-Pcadapt_object
names <- gen_data_new
axis <- 2
meta_data <- meta_data_vcf_452


#Color per country
 Pca_genetic <- function(data, names, num_axes, meta_data) {
  for (i in 1:(num_axes - 1)) { 
    for (j in (i + 1):num_axes) { 
      PCx <- i
      PCy <- j
      
      #Calculate explained variance for chosen PCs
      explained_variance <- data.frame(t(round((data$singular.values[PCx:PCy]^2) * 100, digits = 1)))
      colnames(explained_variance) <- c("PCx", "PCy")
      
      #Store the scores along the retained PC axis in a dataframe
      score_Pca_imputed <- data.frame(data$scores[, c(PCx,PCy)], row.names (names))
      colnames(score_Pca_imputed) <- c(paste0("PC", PCx), paste0("PC", PCy), "VCF_ID")
      
      #Add country, population information
      PCa_df_imputed <- merge(score_Pca_imputed, meta_data, "VCF_ID")
      
      #Genetic PCA
      ggplot_representation <- ggplot() +
        geom_point(data = PCa_df_imputed, aes(PCa_df_imputed[,2],PCa_df_imputed[,3], color = Country,size=2.5)) +
        scale_size_identity(guide = "none") +  # Remove size legend
        scale_colour_manual(name = "Country",
                            values = c("orangered3", "gold2", "darkorchid3", "navyblue", "turquoise2", "green3", "blue", "red", "black", "gray", "orange", "darkgreen"),
                            guide = guide_legend(override.aes = list(size = 4))) +
        xlab(paste0("PC", PCx, " ", "(", explained_variance$PCx, "%", ")")) + 
        ylab(paste0("PC", PCy, " ", "(", explained_variance$PCy, "%", ")"))+
        theme_bw(base_size = 11) +
        theme(legend.position = "right", 
              legend.title.align = 0.5,
              panel.grid = element_blank(), 
              strip.text = element_text(size = 11),
              plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
              axis.title.x = element_text(size = 14),
              axis.title.y = element_text(size = 14),
              axis.text.x = element_text(size = 12),
              axis.text.y = element_text(size = 12),
              legend.title = element_text(size = 14),
              legend.text = element_text(size = 12),
              legend.key = element_rect(fill = "transparent", color = NA))
      
      print(ggplot_representation)
#Save
pdf("Results/species/taxus/STRUCTURE/PCA/PCA_indiv_level_color_countries_wo_label_NA.pdf");print(ggplot_representation);dev.off()      
    }
  }
}
#Run the function 
Pca_genetic(data,names,axis,meta_data)

2.2 Population-level

Finally, we calculated a PCA at the population level to obtain population scores, which were used to account for population structure in pRDA-based GEA analyses. Because allele frequencies were used, the pcadapt function could not be applied, as it does not support allele frequency data. Instead, we used redundancy analysis (RDA), which is equivalent to principal component analysis (PCA) when no explanatory variables are included.

As a first step, we checked the number of principal components (PCs) to retain.

load("Data/Species/Taxus_baccata/genetic_new/data_allelic_frequencies_non_imp_T_adapcon_gentree_452_8252snps.Rdata")
gen_data_new <- data_allelic_frequencies_non_imp_T_adapcon_gentree_452_8252snps
data <- gen_data_new
k <- 10

screeplot_genetic <- function(data, k){
  
  #Perform PCA
  RDA_structure_genetic <- rda(data, scale = TRUE)
  #Compute explained variance
  explained_variance <- (RDA_structure_genetic$CA$eig / sum(RDA_structure_genetic$CA$eig)) * 100
  explained_variance_df <- data.frame(PC = seq_along(explained_variance), Variance = explained_variance)
  
  #Create scree plot using explained variance
  screeplot_graph <- ggplot(explained_variance_df[1:k, ], aes(x = factor(PC), y = Variance)) +
    geom_bar(stat = "identity", fill = "darkgrey") +
    labs(title = "PCA Scree Plot", x = "Principal Component", y = "Explained Variance (%)") +
    theme_minimal(base_size = 14) +
    theme(plot.title = element_text(hjust = 0.5, face = "bold"))
  
  #Save the plot
  ggsave("Results/species/taxus/STRUCTURE/PCA/screeplot_PCA_Pop_level.pdf",
         plot = screeplot_graph, width = 6, height = 4)
  
  return(screeplot_graph)
}

#Run the function and save the object
Pcadapt_object <- screeplot_genetic(data, 10)

#Display the plot
print(Pcadapt_object)

RDA_structure_genetic <- rda(data, scale=T)

PCA at the population level

meta_data_vcf_452 <- meta_data_vcf[meta_data_vcf$Population %in% row.names(gen_data_new),]

axis <- 2
meta_data <- meta_data_vcf_452

 Pca_genetic <- function(data, names, num_axes, meta_data) {
  for (i in 1:(num_axes - 1)) { 
    for (j in (i + 1):num_axes) { 
      
      PCx <- i
      PCy <- j
      #Calculate explained variance for chosen PCs
      explained_variance <- data.frame(eigenvals(RDA_structure_genetic))
      proportion_variance <- data.frame(t(explained_variance[c(PCx,PCy),] / sum(explained_variance) *100)) %>% round(digit=1)
      colnames(proportion_variance) <- c("PCx", "PCy")
      
      #Store the scores along the retained PC axis in a dataframe
      store_score_rda <- scores(RDA_structure_genetic, axes = c(PCx, PCy),display="sites" ) %>% data.frame(row.names(data))
      colnames(store_score_rda) <- c(paste0("PC", PCx), paste0("PC", PCy), "Population")
      
      #Add country, population information
      PCa_score_T_adapcon_gentree <- merge(store_score_rda, meta_data, "Population") %>% group_by(Population) %>% slice_head(n = 1) %>% ungroup() %>% as.data.frame()

      #Genetic PCA
      ggplot_representation <- ggplot() +
        geom_point(data = PCa_score_T_adapcon_gentree,   
                   aes(PCa_score_T_adapcon_gentree[,2], 
                       PCa_score_T_adapcon_gentree[,3], 
                       size = 4, 
                       color = PCa_score_T_adapcon_gentree$Country)) +
        scale_size_identity(guide = "none") +  #Remove size legend
        scale_colour_manual(name = "Country",
                            values = c("orangered3", "gold2", "darkorchid3", "navyblue", 
                                       "turquoise2", "green3", "blue", "red", "black", 
                                       "gray", "orange", "darkgreen"),
                            guide = guide_legend(override.aes = list(size = 4))) +
        xlab(paste0("PC", PCx, " ", "(", proportion_variance$PCx, "%", ")")) + 
        ylab(paste0("PC", PCy, " ", "(", proportion_variance$PCy, "%", ")")) +
        theme_bw(base_size = 11) +
        theme(legend.position = "right", 
              legend.title.align = 0.5,
              panel.grid = element_blank(), 
              strip.text = element_text(size = 11),
              plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
              axis.title.x = element_text(size = 14),
              axis.title.y = element_text(size = 14),
              axis.text.x = element_text(size = 12),
              axis.text.y = element_text(size = 12),
              legend.title = element_text(size = 14),
              legend.text = element_text(size = 12),
              legend.key = element_rect(fill = "transparent", color = NA))

print(ggplot_representation)

 #Save
pdf("Results/species/taxus/STRUCTURE/PCA/PCA_Pop_level.pdf");print(ggplot_representation);dev.off()
      return(PCa_score_T_adapcon_gentree)
    }
  }
}
#Run the function 
PCa_score_T_adapcon_gentree<-Pca_genetic(data,names,axis,meta_data)

Finally, the df of population scores along the PCA axes was saved for use in variance partitioning and GEA pRDA analyses.

3 STRUCTURE software

The next step involved analysing population structure using STRUCTURE, a Bayesian clustering approach that employs Markov Chain Monte Carlo (MCMC) iterations. Initially, individuals are randomly assigned to K groups, where K is the number of clusters chosen. Allele frequencies are then estimated for each group, and individual membership coefficients (Q-values) are calculated. Individuals are re-assigned to groups based on their Q-values, and each MCMC iteration updates allele frequencies and Q-values to maximize the likelihood of the data.

Because the initial assignments are random, a burn-in period is applied to allow the MCMC to stabilise before calculating final Q-values. Subsequent iterations after burn-in are used to compute the mean Q-values for each individual. Multiple runs for each K can be performed to ensure convergence and consistency of results across runs.

We performed the analysis in Genotoul cluster using:
- 8,252 SNPs
- 452 indiv
- K between 2 and 10
- 100,000 burning
- 500,000 iterations (after the burning)
- 10 MCMC per K

Subsequently, we used STRUCTURE Harvester on Linux to estimate the optimal number of clusters (K) using the Evanno method and log-likelihood [ln(K)] values, and to output results in CLUMPP format. Because group labels (1, 2, etc.) may vary across runs for the same K, we used CLUMPP to reassign individuals to consistent group labels, enabling calculation of mean Q-values per individual across runs.

Finally, we visualised the results using the CLUMPAK web interface, which allowed plotting of delta K (Evanno method) and ln(K) values.

The first step involved loading the order of individuals from the STRUCTURE analysis to correctly assign their Q-values.

name <- read.table("Results/species/taxus/STRUCTURE/clumpp_output/data_split_try.txt")

Next, we calculated the mean Q-value per population for each K and visualised the results on a map.

for(i in 2:10){
  input <- paste0("K",i)
  data <- read.table(paste0("Results/species/taxus/STRUCTURE/clumpp_output/",input,"_mean.txt"))
  
  k_full <- data.frame(name[,c(1)],data[,-c(1:5)])
  colnames(k_full) <- c("VCF_ID", paste0("Group",6:ncol(data)-5))
  
  sorted_columns <- names(k_full[,-1])[order(-apply(k_full[,-1], 2, function(x) x[1]))]

k_full_bis <- k_full[, sorted_columns, drop = FALSE]

#add ID again
k_full_final <- data.frame(k_full[,1],k_full_bis);colnames(k_full_final) <- c("VCF_ID", paste0("Group",6:ncol(data)-5))
  
  k_with_pop <- merge(k_full_final,meta_data_vcf,"VCF_ID") %>% merge(meta_data_pop,"Population")
  
  k_final <- k_with_pop %>% dplyr::select(-c("Country.y","N","Elevation.DEM_90m."))
  
  mean_admixture_pop <- k_final %>% 
  group_by(Population) %>% 
  dplyr::mutate(across(where(is.numeric), mean)) %>% 
  slice_head(n = 1) %>% 
  ungroup()
  
  mean_admixture_pop$Longitude=as.numeric(mean_admixture_pop$Longitude);mean_admixture_pop$Latitude=as.numeric(mean_admixture_pop$Latitude)
  
  mean_admixture_pop$Population_bis <- c("Anc","Blind","Bohin","Brank","Buja_A","Buja_G","Cardo","Castle","Fonte","Foresta","Gorenj","Harm","Horn","Jura","Cholo","Olym","Vour","Pater","Rasc","Rouds","Baume","Saro","Serra","Sueve","Unska","Valdi","Vise","Wal","Yew")
  
  groups <- paste0("Group",6:ncol(data)-5)
  admin <- ne_countries(scale = "medium", returnclass = "sf")
my.colors<- c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd","#7f7f7f","#bcbd22", "#c49c94", "#f7b6d2", "darkgreen")

#Plot the map with scatterpie
map_pop <- ggplot() + 
  geom_sf(data = admin, fill = gray(0.92), size = 0) +
  geom_scatterpie(
    data = mean_admixture_pop,
    aes(x = Longitude, y = Latitude),
    cols = groups,
    size = 0,
    alpha = 1,
  ) +
  geom_text(
   data = mean_admixture_pop,
    aes(x = Longitude, y = Latitude, label = meta_data_pop_order$pop_abbrev,fontface = "bold"),
    size = 3,
    vjust = 2
  ) +
  scale_fill_manual(values=my.colors)+
  coord_sf(xlim = c(-10, 30), ylim = c(35, 62), expand = FALSE) +
  annotation_scale(location = "br", width_hint = 0.3) +
  annotation_north_arrow(location = "tl", which_north = "true", 
                         pad_x = unit(-0.2, "cm"), pad_y = unit(0.08, "cm"),
                         style = north_arrow_fancy_orienteering) +
  theme_minimal() +
  labs(title = paste0("Genetic Structure for"," ", input," ", "groups"),fill="STRUCTURE groups")

print(map_pop)
#Save
pdf(paste0("Results/species/taxus/GEA_new_var/Structure/Map_",input,"_STRUCTURE.pdf"));print(map_pop);dev.off()
}

#Target projection (ETRS89 / LAEA Europe - EPSG:3035)
lambert_crs <- 3035

#Project background map
admin_proj <- st_transform(admin, crs = lambert_crs)

#Prepare STRUCTURE data
K <- 2
input <- paste0("K", K)
data <- read.table(paste0("Results/species/taxus/STRUCTURE/clumpp_output/", input, "_mean.txt"))

k_full <- data.frame(name[, 1], data[, -c(1:5)])
colnames(k_full) <- c("VCF_ID", paste0("Group", 1:(ncol(k_full) - 1)))
sorted_columns <- names(k_full[, -1])[order(-apply(k_full[, -1], 2, function(x) x[1]))]
k_full_bis <- k_full[, sorted_columns, drop = FALSE]
k_full_final <- data.frame(k_full[, 1], k_full_bis)
colnames(k_full_final) <- c("VCF_ID", paste0("Group", 1:(ncol(k_full_final) - 1)))

k_with_pop <- merge(k_full_final, meta_data_vcf, by = "VCF_ID") %>%
  merge(meta_data_pop, by = "Population") %>%
  dplyr::select(-c("Country.y", "N", "Elevation.DEM_90m."))

mean_admixture_pop <- k_with_pop %>%
  group_by(Population) %>%
  dplyr::mutate(across(where(is.numeric), mean)) %>%
  slice_head(n = 1) %>%
  ungroup()

mean_admixture_pop$Longitude <- as.numeric(mean_admixture_pop$Longitude)
mean_admixture_pop$Latitude <- as.numeric(mean_admixture_pop$Latitude)

#label cleanup
mean_admixture_pop$Population_bis <- c("Anc", "Blind", "Bohin", "Brank", "Buja_A", "Buja_G",
                                       "Cardo", "Castle", "Fonte", "Foresta", "Gorenj", "Harm",
                                       "Horn", "Jura", "Cholo", "Olym", "Vour", "Pater", "Rasc",
                                       "Rouds", "Baume", "Saro", "Serra", "Sueve", "Unska", "Valdi",
                                       "Vise", "Wal", "Yew")[1:nrow(mean_admixture_pop)]

#Convert to sf and project
pies_sf <- st_as_sf(mean_admixture_pop, coords = c("Longitude", "Latitude"), crs = 4326)
pies_proj <- st_transform(pies_sf, crs = lambert_crs)
coords <- st_coordinates(pies_proj)

mean_admixture_pop$x <- coords[, 1]
mean_admixture_pop$y <- coords[, 2]

#Group names
groups <- grep("^Group", colnames(mean_admixture_pop), value = TRUE)

#Colors
my.colors <- c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
               "#7f7f7f", "#bcbd22", "#c49c94", "#f7b6d2", "darkgreen")

plot <- ggplot() +
  geom_sf(data = admin_proj, fill = gray(0.90), color = "white", size = 0.2) +
  geom_sf(data = shapefile, fill = "#d9f2d9", color = "gray60", size = 0.2) + 
   geom_scatterpie(
    data = mean_admixture_pop,
    aes(x = x, y = y),
    cols = groups,
    pie_scale = 1.2  
  ) +
  scale_fill_manual(values = my.colors) +
 coord_sf(crs = lambert_crs,
         xlim = c(2379349, 7002838),
         ylim = c(1029721, 4519987),
         expand = FALSE)+
  theme_minimal() +
  theme(
    panel.background = element_rect(fill = "azure", color = NA),
    plot.title = element_text(face = "bold", hjust = 0.5)
  ) +
  labs(
    title = paste0("Genetic Structure for ", input, " groups"),
    fill = "STRUCTURE groups"
  )
print(plot)

pdf(paste0("Results/species/taxus/GEA_new_var/Structure/Map_chapter2_",input,"_STRUCTURE.pdf"));print(plot);dev.off()

## png 
##   2

We also generated STRUCTURE barplots to visualise the contribution of each individual to the inferred clusters within populations.

meta_data_pop_order$pop_abbrev <- c("ANC","BLI","BOM","BRA","BUJA","BUJG","CAR","CED","PDT","UMB","GOR","HAR","HHR","JUR","CHO","OLY","VOU","PAT","RAS","RWM","BAU","SAV","SDF","SUE","UNS","VAL","VIS","WAL","YEW")

for(i in 2:3){
  input <- paste0("K", i)

  #Load STRUCTURE results
  data <- read.table(paste0("Results/species/taxus/STRUCTURE/clumpp_output/", input, "_mean.txt"))
  
  k_full <- data.frame(name[,c(1)], data[,-c(1:5)])
  colnames(k_full) <- c("VCF_ID", paste0("Group", 1:(ncol(k_full)-1)))  #Adjust for group numbering
  
  #Reorder groups based on first row's values for color consistency
  sorted_columns <- names(k_full[,-1])[order(-apply(k_full[,-1], 2, function(x) x[1]))]
  k_full_bis <- k_full[, sorted_columns, drop = FALSE]

  #Add ID again
  k_full_final <- data.frame(k_full[,1], k_full_bis)
  colnames(k_full_final) <- c("VCF_ID", paste0("Group", 1:(ncol(data)-5)))
  
  #Merge with metadata, using population abbreviations from meta_data_pop_order
  k_with_pop <- merge(k_full_final, meta_data_vcf, by = "VCF_ID") %>% 
    merge(meta_data_pop, by = "Population") %>%
    merge(meta_data_pop_order[,c(2,7)], by = "Population")

  k_final <- k_with_pop %>% 
    dplyr::select(-c("Country.y", "N", "Elevation.DEM_90m."))

  #Identify the major group for each individual (the group with highest ancestry)
  k_final$Assigned_Group <- apply(k_final %>% dplyr::select(starts_with("Group")), 1, function(x) {
    paste0("Group", which.max(x))  #Assign the group with the highest proportion
  })

  #Order individuals by Assigned Group, then by Population
  k_final <- k_final %>%
    arrange(Assigned_Group, Population)

  #Create a population label that appears only once per group
  k_final <- k_final %>%
    group_by(Assigned_Group, Population) %>%
   dplyr::mutate(Population_Label = ifelse(row_number() == 1, as.character(pop_abbrev), "")) %>%  #Use abbreviation
    ungroup()

  #Convert to long format for ggplot
  k_long <- k_final %>%
    pivot_longer(cols = starts_with("Group"), names_to = "Group", values_to = "Admixture") %>%
   dplyr::mutate(Population = factor(Population, levels = unique(k_final$Population)))  #Ensure correct order
  
  my.colors <- c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#7f7f7f", "#bcbd22", "#c49c94", "#f7b6d2", "darkgreen")

  #Create the STRUCTURE bar plot
  structure_plot <- ggplot(k_long, aes(x = factor(VCF_ID, levels = k_final$VCF_ID), y = Admixture, fill = Group)) +
    geom_bar(stat = "identity", width = 1) +
    scale_fill_manual(values = my.colors) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),  
          axis.ticks.x = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank()) +
    scale_x_discrete(labels = k_final$Population_Label) +
    labs(title = paste0("STRUCTURE Barplot for ", input, " Groups"),
         x = "Populations", y = "Admixture Proportion", fill = "STRUCTURE Groups")

  print(structure_plot)
#Save
pdf(paste0("Results/species/taxus/STRUCTURE/figures/STRUCTURE_barplotK",i,".pdf"));print(structure_plot);dev.off()
}

Finally, we saved the STRUCTURE scores for K = 2 for T. baccata, see Main text for the other species.

#Run the function only for K=3 and save the resulting dataframe
data <- read.table("Results/species/taxus/STRUCTURE/clumpp_output/K2_mean.txt")
  
  k_full <- data.frame(name[,c(1)],data[,-c(1:5)])
  colnames(k_full) <- c("VCF_ID", paste0("Group",6:ncol(data)-5))
  
  sorted_columns <- names(k_full[,-1])[order(-apply(k_full[,-1], 2, function(x) x[1]))]

#Reorder columns based on the maximum value to keep consistency with the color of the groups
k_full_bis <- k_full[, sorted_columns, drop = FALSE]

#Add ID again
k_full_final <- data.frame(k_full[,1],k_full_bis);colnames(k_full_final) <- c("VCF_ID", paste0("Group",6:ncol(data)-5))
  k_with_pop <- merge(k_full_final,meta_data_vcf,"VCF_ID") %>% merge(meta_data_pop,"Population")
  k_final <- k_with_pop %>% dplyr::select(-c("Country.y","N","Elevation.DEM_90m."))
  
  mean_admixture_pop <- k_final %>% 
   group_by(Population) %>% 
 dplyr::mutate(across(where(is.numeric), mean)) %>% 
  slice_head(n = 1) %>% 
  ungroup() %>% 
 dplyr::mutate(across(where(is.numeric),scale))
  
  Cluster_score_STRUCTURE_T_Adapcon_gentreeK2 <- mean_admixture_pop[,-c(2,5:7)]
  
  save(Cluster_score_STRUCTURE_T_Adapcon_gentreeK2,file="Data/Species/Taxus_baccata/GEA/Structure_proxy/Cluster_score_STRUCTURE_T_Adapcon_gentreeK2.Rdata")
  
write_xlsx(k_final,"Data/Species/Taxus_baccata/GEA/Structure_proxy/k2_final.xlsx")

packages <- c(
  "LEA", "ggplot2", "pcadapt", "vegan",
  "dplyr", "RColorBrewer", "rnaturalearth",
  "scatterpie", "writexl", "sf",
  "ggstar", "ggspatial", "tidyr",
  "ggrepel", "rnaturalearthdata"
)

info <- sessionInfo()
packages_used <- info$otherPkgs[packages]
data.frame(
  Package = names(packages_used),
  Version = sapply(packages_used, function(x) x$Version)
)

##                             Package Version
## LEA                             LEA  3.14.0
## ggplot2                     ggplot2   3.5.2
## pcadapt                     pcadapt   4.3.5
## vegan                         vegan   2.6-4
## dplyr                         dplyr   1.1.4
## RColorBrewer           RColorBrewer   1.1-3
## rnaturalearth         rnaturalearth   1.0.1
## scatterpie               scatterpie   0.2.1
## writexl                     writexl   1.5.0
## sf                               sf  1.0-15
## ggstar                       ggstar   1.0.4
## ggspatial                 ggspatial   1.1.9
## tidyr                         tidyr   1.3.1
## ggrepel                     ggrepel   0.9.5
## rnaturalearthdata rnaturalearthdata   1.0.0

Population_structure

Thomas Francisco

2024-03-14

1 Introduction

2 PCA

2.1 Individual-level

2.2 Population-level

3 STRUCTURE software