Load libraries

library("DESeq2")
library("tidyverse")
library("SummarizedExperiment")
library("ggplot2")
library("ggrepel")

All samples

File retrieval and preparation

Get the metadata, which includes sample name, rna extraction batch, clean up batch, behavioural test, viral load, colony location, and colony ID

metadata <- read.csv("./metadata.csv", row.names=1)

Retrieve first row of the counts file, which will be the sample ID

rawcounts <- read.table("./counts.tsv", nrows=1)
colnames <- as.character(rawcounts)

Get rawcounts

  • row.names: use the first column as the row names
  • col.names: use the column names we retrieved above as column names
  • check.names: in this instance prevents R from prepending an “X” to the column names
  • skip: skip the first row because it’s just the sample ID
rawcounts <- read.table("./counts.tsv", row.names=1, col.names=colnames,
                        check.names=FALSE,skip=1)

This has the effect of putting the rawcounts columns in the same order as the metadata rows. This is required by DESeq2

rawcounts <- select(rawcounts, rownames(metadata))
rawcounts <- as.matrix(rawcounts)

Check that they are now in the same order

all(rownames(metadata) == colnames(rawcounts))
## [1] TRUE

SummarizedExperiment

Create SummarizedExperiment object

sum_ex <- SummarizedExperiment(assay = list(count=rawcounts), colData = metadata)

Remove rows/genes with less than 4 reads mapped for any of the samples

sum_ex <- subset(sum_ex, rowSums(assay(sum_ex)) >= 4)

DESeq

Two objects, one to examine viral load and the other to examine behavioural rank each with colony location as a covariate and

Run DESeq

deseq2_viral <- DESeq(deseq2_viral)
deseq2_behavioural <- DESeq(deseq2_behavioural)

Extract results

results_viral <- results(deseq2_viral, contrast=c("viral_load", "high", "low"))
results_behavioural <- results(deseq2_behavioural, contrast=c("behavioural_test", "rank 0", "rank 21"))

Volcano plots

Viral load

Lines of significance shown at alpha = 0.05 and log2 fold changes of 1 and -1

ggplot(as.data.frame(results_viral), 
       aes(x = log2FoldChange, y = -log10(padj))) + 
    geom_point() +
    geom_hline(yintercept = -log10(0.05), col = "red") +
    geom_vline(xintercept = -1, col = "red") +
    geom_vline(xintercept = 1, col = "red")

Behavioural test

Lines of significance shown at alpha = 0.05 and log2 fold changes of 1 and -1

Detailed volcano plot (viral load)

df <- as.data.frame(results_viral)

Find the greatest of the minimum and maximum log2FoldChange column

# so that we can make the plot symmetrical
xlimit <-
    max(
        abs(floor(min(na.omit(df$log2FoldChange)))),
        ceiling(max(na.omit(df$log2FoldChange)))
)

Find greatest y-limit

ylimit <- ceiling(max(na.omit(-log10(df$padj))))

Remove missing values

df <- na.omit(df)

Add gene type column

df <- df %>%
    mutate(gene_type = case_when(log2FoldChange >= 1 & padj <= 0.05 ~ "up",
                                 log2FoldChange <= -1 & padj <= 0.05 ~ "down",
                                 TRUE ~ "ns")) 

Add gene_id as its own column to the dataframe

df$gene_id <- rownames(df)

Add -log10 calculation to the dataframe as a column to make it clearer what we’re plotting

df$minusLog10padj <- -log10(df$padj)

Specify colour, alphas and sizes for the plot

cols <- c("up" = "#ffbd83", "down" = "#36c3ff", "ns" = "grey") 
alphas <- c("up" = 1, "down" = 1, "ns" = 0.5)
sizes <- c("up" = 2, "down" = 2, "ns" = 1) 

Subset significant genes

sig_genes <- subset(df, gene_type == "down" | gene_type == "up")

The plot

volcano_plot_viral_load <- 
    ggplot(df,
       aes(x = log2FoldChange, 
           y = minusLog10padj,
           fill = gene_type,
           size = gene_type,
           alpha = gene_type)
       ) + 
    geom_point(shape=21) +
    geom_hline(yintercept = -log10(0.05), col = "red", linetype = "dashed") +
    geom_vline(xintercept = c(-1, 1), col = "red", linetype = "dashed") +
    geom_label_repel(data = sig_genes,
                     size = 3,
                     aes(label = gene_id),
                     force = 2,
                     nudge_y = 1) +    
    scale_fill_manual(values = cols) +
    scale_size_manual(values = sizes) +
    scale_alpha_manual(values = alphas) + 
    scale_x_continuous(breaks = c(seq(-xlimit, xlimit, 1)),
                       limits = c(-xlimit, xlimit)) +
    scale_y_continuous(breaks = c(seq(0, ylimit, 2))) +
    labs(title = "Gene expression changes in bees with high viral load vs low viral load",
         x = "log2(fold change)",
         y = "-log10(adjusted P-value)",
         colour = "Expression \nchange") +    
    theme_bw() + # Select theme with a white background  
    theme(panel.border = element_rect(colour = "black", fill = NA, size= 0.5),    
          panel.grid = element_line(color="#eeeef5",
                                    size=0.3)
          ) +
    guides(fill = guide_legend(override.aes = aes(label = "")))

volcano_plot_viral_load

Genes of interest

ID Direction (high) Biotype Description Gene length Chromosome L2FC
LOC113218947 Up lncRNA uncharacterized LOC113218947 1255 LG8 2.0

MA Plots

Viral load

DESeq2::plotMA(results_viral)