Load libraries

library("DESeq2")
library("tidyverse")
library("SummarizedExperiment")
library("ggplot2")
library("ggrepel")

All samples

File retrieval and preparation

Get the metadata, which includes sample name, rna extraction batch, clean up batch, behavioural test, viral load, colony location, and colony ID

metadata <- read.csv("./metadata.csv", row.names=1)

Retrieve first row of the counts file, which will be the sample ID

rawcounts <- read.table("./counts.tsv", nrows=1)
colnames <- as.character(rawcounts)

Get rawcounts

  • row.names: use the first column as the row names
  • col.names: use the column names we retrieved above as column names
  • check.names: in this instance prevents R from prepending an “X” to the column names
  • skip: skip the first row because it’s just the sample ID
rawcounts <- read.table("./counts.tsv", row.names=1, col.names=colnames,
                        check.names=FALSE,skip=1)

This has the effect of putting the rawcounts columns in the same order as the metadata rows. This is required by DESeq2

rawcounts <- select(rawcounts, rownames(metadata))
rawcounts <- as.matrix(rawcounts)

Check that they are now in the same order

all(rownames(metadata) == colnames(rawcounts))
## [1] TRUE

SummarizedExperiment

Create SummarizedExperiment object

sum_ex <- SummarizedExperiment(assay = list(count=rawcounts), colData = metadata)

Remove rows/genes with less than 4 reads mapped for any of the samples

sum_ex <- subset(sum_ex, rowSums(assay(sum_ex)) >= 4)

DESeq

Two objects, one to examine viral load and the other to examine behavioural rank each with colony location as a covariate and

Run DESeq

deseq2_viral <- DESeq(deseq2_viral)
deseq2_behavioural <- DESeq(deseq2_behavioural)

Extract results

results_viral <- results(deseq2_viral, contrast=c("viral_load", "high", "low"))
results_behavioural <- results(deseq2_behavioural, contrast=c("behavioural_test", "rank 0", "rank 21"))

Volcano plots

Viral load

Lines of significance shown at alpha = 0.05 and log2 fold changes of 1 and -1

ggplot(as.data.frame(results_viral), 
       aes(x = log2FoldChange, y = -log10(padj))) + 
    geom_point() +
    geom_hline(yintercept = -log10(0.05), col = "red") +
    geom_vline(xintercept = -1, col = "red") +
    geom_vline(xintercept = 1, col = "red")

Behavioural test

Lines of significance shown at alpha = 0.05 and log2 fold changes of 1 and -1

Detailed volcano plot (viral load)

df <- as.data.frame(results_viral)

Find the greatest of the minimum and maximum log2FoldChange column

# so that we can make the plot symmetrical
xlimit <-
    max(
        abs(floor(min(na.omit(df$log2FoldChange)))),
        ceiling(max(na.omit(df$log2FoldChange)))
)

Find greatest y-limit

ylimit <- ceiling(max(na.omit(-log10(df$padj))))

Remove missing values

df <- na.omit(df)

Add gene type column

df <- df %>%
    mutate(gene_type = case_when(log2FoldChange >= 1 & padj <= 0.05 ~ "up",
                                 log2FoldChange <= -1 & padj <= 0.05 ~ "down",
                                 TRUE ~ "ns")) 

Add gene_id as its own column to the dataframe

df$gene_id <- rownames(df)

Add -log10 calculation to the dataframe as a column to make it clearer what we’re plotting

df$minusLog10padj <- -log10(df$padj)

Specify colour, alphas and sizes for the plot

cols <- c("up" = "#ffbd83", "down" = "#36c3ff", "ns" = "grey") 
alphas <- c("up" = 1, "down" = 1, "ns" = 0.5)
sizes <- c("up" = 2, "down" = 2, "ns" = 1) 

Subset significant genes

sig_genes <- subset(df, gene_type == "down" | gene_type == "up")

The plot

volcano_plot_viral_load <- 
    ggplot(df,
       aes(x = log2FoldChange, 
           y = minusLog10padj,
           fill = gene_type,
           size = gene_type,
           alpha = gene_type)
       ) + 
    geom_point(shape=21) +
    geom_hline(yintercept = -log10(0.05), col = "red", linetype = "dashed") +
    geom_vline(xintercept = c(-1, 1), col = "red", linetype = "dashed") +
    geom_label_repel(data = sig_genes,
                     size = 3,
                     aes(label = gene_id),
                     force = 2,
                     nudge_y = 1) +    
    scale_fill_manual(values = cols) +
    scale_size_manual(values = sizes) +
    scale_alpha_manual(values = alphas) + 
    scale_x_continuous(breaks = c(seq(-xlimit, xlimit, 1)),
                       limits = c(-xlimit, xlimit)) +
    scale_y_continuous(breaks = c(seq(0, ylimit, 2))) +
    labs(title = "Gene expression changes in bees with high viral load vs low viral load",
         x = "log2(fold change)",
         y = "-log10(adjusted P-value)",
         colour = "Expression \nchange") +    
    theme_bw() + # Select theme with a white background  
    theme(panel.border = element_rect(colour = "black", fill = NA, size= 0.5),    
          panel.grid = element_line(color="#eeeef5",
                                    size=0.3)
          ) +
    guides(fill = guide_legend(override.aes = aes(label = "")))

volcano_plot_viral_load

Genes of interest

ID Direction (high) Biotype Description Gene length Chromosome L2FC
LOC113218947 Up lncRNA uncharacterized LOC113218947 1255 LG8 2.0

MA Plots

Viral load

DESeq2::plotMA(results_viral)

Behavioural test
DESeq2::plotMA(results_behavioural)

Cruickshank

Preparation

Subset colony location = Cruickshank

sum_ex_cruickshank <- subset(sum_ex, , colony_location == "cruickshank")

DESeq

Two objects, one to examine viral load and the other to examine behavioural rank each with colony location as a covariate and

Run DESeq

deseq2_viral_cruickshank <- DESeq(deseq2_viral_cruickshank)
deseq2_behavioural_cruickshank <- DESeq(deseq2_behavioural_cruickshank)

Extract results

results_viral_cruickshank <- results(deseq2_viral_cruickshank, contrast=c("viral_load", "high", "low"))
results_behavioural_cruickshank <- results(deseq2_behavioural_cruickshank, contrast=c("behavioural_test", "rank 0", "rank 21"))

Volcano plots

Viral load

Lines of significance shown at alpha = 0.05 and log2 fold changes of 1 and -1

Behavioural test

Lines of significance shown at alpha = 0.05 and log2 fold changes of 1 and -1

Detailed volcano plot behavioural test

Genes of interest

ID Direction (high) Biotype Description Gene length Chromosome L2FC
Apid1 Down protein_coding apidaecin 1 1712 LG16 -4.6
LOC552073 Down protein_coding arylsulfatase B 7141 LG2 -1.1

MA Plots

Viral load

DESeq2::plotMA(results_viral_cruickshank)

Behavioural test
DESeq2::plotMA(results_behavioural_cruickshank)

Newburgh

Preparation

Subset colony location = Newburgh

sum_ex_newburgh <- subset(sum_ex, , colony_location == "newburgh")

DESeq

Two objects, one to examine viral load and the other to examine behavioural rank each with colony location as a covariate and

Run DESeq

deseq2_viral_newburgh <- DESeq(deseq2_viral_newburgh)
deseq2_behavioural_newburgh <- DESeq(deseq2_behavioural_newburgh)

Extract results

results_viral_newburgh <- results(deseq2_viral_newburgh, contrast=c("viral_load", "high", "low"))
results_behavioural_newburgh <- results(deseq2_behavioural_newburgh, contrast=c("behavioural_test", "rank 0", "rank 21"))

Volcano plots

Viral load

Lines of significance shown at alpha = 0.05 and log2 fold changes of 1 and -1

Behavioural test

Lines of significance shown at alpha = 0.05 and log2 fold changes of 1 and -1

Detailed volcano plot viral load

Genes of interest

ID Direction (high) Biotype Description Gene length Chromosome L2FC
Apid1 Up protein_coding apidaecin 1 1712 LG16 5.8
Hbg2 Up protein_coding alpha-glucosidase 1743 LG8 5.3
LOC100578156 Up protein_coding uncharacterized LOC100578156, 2289 LG5 1.3
LOC102654076 Up lncRNA uncharacterized LOC102654076 21364 LG9 1.3
LOC113218549 Up lncRNA uncharacterized LOC113218549 37214 LG4 5.7
LOC113218947 Up lncRNA uncharacterized LOC113218947 1255 LG8 2.3
LOC406142 Up protein_coding hymenoptaecin 1640 LG6 3.8
LOC406144 Up protein_coding abaecin 866 LG10 3.0
LOC551263 Up protein_coding monocarboxylate transporter 13, 11240 LG8 1.2
LOC724899 Down protein_coding lysozyme 1516 LG13 -1.8
PPO Down protein_coding phenoloxidase subunit A3 3665 LG14 -1.8
LOC102655375 Down lncRNA uncharacterized LOC102655375 3167 LG8 -1.3
LOC102655319 Down protein_coding inner centromere protein A 5515 LG9 -1.3
LOC408544 Down protein_coding lysyl oxidase homolog 4, 10351 LG15 -1.1

MA Plots

Viral load

DESeq2::plotMA(results_viral_newburgh)

Behavioural test
DESeq2::plotMA(results_behavioural_newburgh)