Fast Gene Set Enrichment Analysis

The primary focus of hypeR is on downstream enrichment analysis workflows. As long as enrichment results are formatted into a hyp object, they are compatible. To wrap third-party packages, we create modified version of the hypeR() function that calls external packages when performing the enrichment analysis, and then coerces the results into a hyp or multihypobjects. Here we wrap fgsea - an R-package for fast preranked gene set enrichment analysis (GSEA).

fgsea is an R-package for fast preranked gene set enrichment analysis (GSEA). This package allows to quickly and accurately calculate arbitrarily low GSEA P-values for a collection of gene sets. P-value estimation is based on an adaptive multi-level split Monte-Carlo scheme.

suppressPackageStartupMessages(library(fgsea))

Here we can take a differential expression analysis and rank genes by their t-statstic or some other measure of differential expression between two groups.

data(limma)

signature <- limma %>% arrange(desc(t)) %>% select(symbol, t) %>% deframe()

head(signature)

LMAN2L SHKBP1  SPHK2  AJUBA   TJP1  TMCC3 
  6.59   6.56   6.37   6.26   6.26   6.11

tail(signature)

   PALD1 FRA10AC1      UNG     BET1  NKIRAS1   GPR146 
   -6.21    -6.28    -6.28    -6.52    -7.26    -8.48

We will also need some genesets…

genesets <- msigdb_gsets("Homo sapiens", "C2", "CP:KEGG", clean=TRUE)

Wrapper function

This is an experimental feature and not officially apart of the package.

The wrapper will take a signature and genesets as normally would be pass to hyper() and will instead feed them into fgsea::fgseaMultilevel(). Please refer to their documentation for specific parameters. This will test your preranked signature in both directions (e.g. enrichment in high and low ranking genes). The wrapper then separates each direction into and up and down signature respectively and formats them within a hyp object. Thus the wrapper will return a multihyp object.

.handle.genesets <- function(genesets) {
    if (is(genesets, "list")) {
        gsets.obj <- gsets$new(genesets, quiet=TRUE)
    }
    else if (is(genesets, "gsets") | is(genesets, "rgsets")) {
        gsets.obj <- genesets
    } 
    else {
        stop("Genesets must be gsets/rgsets object or named list of genesets")
    }
    return(gsets.obj)
}

fgsea.wrapper <- function(signature, genesets, sample.size=101, min.size=1, max.size=Inf, ...) {
    # Save original arguments
    args <- as.list(environment())
    
    # Save gsets object
    gsets.obj <- .handle.genesets(genesets)
    args$genesets <- gsets.obj
    
    # Run fgsea
    results <- fgsea::fgseaMultilevel(stats=signature, 
                                      pathways=gsets.obj$genesets, 
                                      sampleSize=sample.size, 
                                      minSize=min.size, 
                                      maxSize=max.size, 
                                      ...)
    
    data <- results %>%
        data.frame() %>%
        plyr::rename(c("pathway"="label", "padj"="fdr", "log2err"="lte", "size"="overlap", "leadingEdge"="le")) %>%
        dplyr::rename_with(tolower) %>%
        mutate(pval=signif(pval, 2)) %>%
        mutate(fdr=signif(fdr, 2)) %>%
        mutate(le=sapply(le, function(x) paste(x, collapse=','))) %>%
        mutate(signature=length(signature)) %>%
        mutate(geneset=sapply(label, function(x) length(gsets.obj$genesets[[x]]))) %>%
        dplyr::select(c("label", "pval", "fdr", "lte", "es", "nes", "signature", "geneset", "overlap", "le"))
    
    data.up <- data %>%
        dplyr::filter(es > 0) %>%
        dplyr::arrange(pval, es)

    data.dn <- data %>%
        dplyr::filter(es < 0) %>%
        dplyr::arrange(pval, es)    

    # Reproducibility information
    info <- list(fgsea=paste("v", packageVersion("fgsea"), sep=""),
                 signature=length(signature), 
                 genesets=args$genesets$info())
    
    info <- c(info, args[c("sample.size", "min.size", "max.size")])
    info <- lapply(info, as.character)
    
    # Wrap dataframe in hyp object
    hyp.up <- hyp$new(data=data.up, args=args, info=info)
    hyp.dn <- hyp$new(data=data.dn, args=args, info=info)
    mhyp <- multihyp$new(data=list("up"=hyp.up, "dn"=hyp.dn))
    return(mhyp)
}

You’ll likely see some warnings from fgsea…

mhyp_obj <- fgsea.wrapper(signature, genesets)

Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (70.45% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.

Warning in fgsea::fgseaMultilevel(stats = signature, pathways =
gsets.obj$genesets, : For some pathways, in reality P-values are less than
1e-10. You can set the `eps` argument to zero for better estimation.

print(mhyp_obj)

(multihyp) 
  (hyp) up
        132 x 10
  (hyp) dn
        54 x 10

# The hyp object of the up signature
hyp.up <- mhyp_obj$data$up
print(hyp.up$info)

$fgsea
[1] "v1.15.0"

$signature
[1] "9682"

$genesets
[1] "C2.CP:KEGG v7.2.1"

$sample.size
[1] "101"

$min.size
[1] "1"

$max.size
[1] "Inf"

head(hyp.up$data[,1:6])

                           label    pval     fdr       lte        es      nes
1                     Proteasome 8.8e-06 0.00055 0.5933255 0.6790805 2.161802
2           Steroid Biosynthesis 2.2e-05 0.00084 0.5756103 0.8209958 2.107131
3 Neurotrophin Signaling Pathway 2.3e-05 0.00084 0.5756103 0.4861587 1.966820
4                 Tight Junction 3.6e-05 0.00110 0.5573322 0.4919114 1.958631
5                    Endocytosis 3.0e-04 0.00690 0.4984931 0.4103587 1.729304
6             Endometrial Cancer 3.0e-04 0.00690 0.4984931 0.5940982 1.940034

Now we can use it as we normally would…

hyp_dots(mhyp_obj, merge=TRUE)