The CaDrA package currently supports four scoring functions to search for subsets of genomic features that are likely associated with a specific outcome of interest (e.g., protein expression, pathway activity, etc.)
ks
)revealer
)wilcox
)custom
)Below, we run candidate_search()
over the top 3 starting
features using each of the four scoring functions described above.
Important Note:
topn_eval()
is
equivalent to the new and recommended candidate_search()
function
library(CaDrA)
library(pheatmap)
library(SummarizedExperiment)
binary features matrix
also known as
Feature Set
(such as somatic mutations, copy number
alterations, chromosomal translocations, etc.) The 1/0 row vectors
indicate the presence/absence of ‘omics’ features in the samples. The
Feature Set
can be a matrix or an object of class
SummarizedExperiment from
SummarizedExperiment package)Input Scores
)
representing a functional response of interest (such as protein
expression, pathway activity, etc.)The simulated dataset, sim_FS
, comprises of 1000 genomic
features and 100 sample profiles. There are 10 left-skewed (i.e. True
Positive or TP) and 990 uniformly-distributed (i.e. True Null or TN)
features simulated in the dataset. Below is a heatmap of the first 100
features.
See ?ks_rowscore
for more details
ks_topn_l <- CaDrA::candidate_search(
FS = sim_FS,
input_score = sim_Scores,
method = "ks_pval", # Use Kolmogorow-Smirnow scoring function
method_alternative = "less", # Use one-sided hypothesis testing
weights = NULL, # If weights is provided, perform a weighted-KS test
search_method = "both", # Apply both forward and backward search
top_N = 3, # Evaluate top 3 starting points for the search
max_size = 10, # Allow at most 10 features in meta-feature matrix
do_plot = FALSE, # We will plot it AFTER finding the best hits
best_score_only = FALSE # Return all results from the search
)
# Now we can fetch the feature set of top N features that corresponded to the best scores over the top N search
ks_topn_best_meta <- topn_best(ks_topn_l)
# Visualize best meta-feature result
meta_plot(topn_best_list = ks_topn_best_meta)
See ?wilcox_rowscore
for more details
wilcox_topn_l <- CaDrA::candidate_search(
FS = sim_FS,
input_score = sim_Scores,
method = "wilcox_pval", # Use Wilcoxon Rank-Sum scoring function
method_alternative = "less", # Use one-sided hypothesis testing
search_method = "both", # Apply both forward and backward search
top_N = 3, # Evaluate top 3 starting points for the search
max_size = 10, # Allow at most 10 features in meta-feature matrix
do_plot = FALSE, # We will plot it AFTER finding the best hits
best_score_only = FALSE # Return all results from the search
)
# Now we can fetch the feature set of top N feature that corresponded to the best scores over the top N search
wilcox_topn_best_meta <- topn_best(topn_list = wilcox_topn_l)
# Visualize best meta-feature result
meta_plot(topn_best_list = wilcox_topn_best_meta)
See ?revealer_rowscore
for more details
revealer_topn_l <- CaDrA::candidate_search(
FS = sim_FS,
input_score = sim_Scores,
method = "revealer", # Use REVEALER's CMI scoring function
search_method = "both", # Apply both forward and backward search
top_N = 3, # Evaluate top 3 starting points for the search
max_size = 10, # Allow at most 10 features in meta-feature matrix
do_plot = FALSE, # We will plot it AFTER finding the best hits
best_score_only = FALSE # Return all results from the search
)
# Now we can fetch the ESet of top feature that corresponded to the best scores over the top N search
revealer_topn_best_meta <- topn_best(topn_list = revealer_topn_l)
# Visualize best meta-feature result
meta_plot(topn_best_list = revealer_topn_best_meta)
See ?custom_rowscore
for more details
# A customized function using ks-test
customized_ks_rowscore <- function(FS, input_score, meta_feature=NULL, alternative="less", metric="pval"){
# Check if meta_feature is provided
if(!is.null(meta_feature)){
# Getting the position of the known meta features
locs <- match(meta_feature, row.names(FS))
# Taking the union across the known meta features
if(length(meta_feature) > 1) {
meta_vector <- as.numeric(ifelse(colSums(FS[meta_feature,]) == 0, 0, 1))
}else{
meta_vector <- as.numeric(FS[meta_feature,])
}
# Remove the meta features from the binary feature matrix
# and taking logical OR btw the remaining features with the meta vector
FS <- base::sweep(FS[-locs, , drop=FALSE], 2, meta_vector, `|`)*1
# Check if there are any features that are all 1s generated from
# taking the union between the matrix
# We cannot compute statistics for such features and thus they need
# to be filtered out
if(any(rowSums(FS) == ncol(FS))){
warning("Features with all 1s generated from taking the matrix union ",
"will be removed before progressing...\n")
FS <- FS[rowSums(FS) != ncol(FS), , drop=FALSE]
}
}
# KS is a ranked-based method
# So we need to sort input_score from highest to lowest values
input_score <- sort(input_score, decreasing=TRUE)
# Re-order the matrix based on the order of input_score
FS <- FS[, names(input_score), drop=FALSE]
# Compute the scores using the KS method
ks <- apply(FS, 1, function(r){
x = input_score[which(r==1)];
y = input_score[which(r==0)];
res <- ks.test(x, y, alternative=alternative)
return(c(res$statistic, res$p.value))
})
# Obtain score statistics
stat <- ks[1,]
# Obtain p-values and change values of 0 to the machine lowest value
# to avoid taking -log(0)
pval <- ks[2,]
pval[which(pval == 0)] <- .Machine$double.xmin
# Compute the -log(pval)
# Make sure scores has names that match the row names of FS object
pval <- -log(pval)
# Determine which metric to returned the scores
if(metric == "pval"){
scores <- pval
}else{
scores <- stat
}
names(scores) <- rownames(FS)
return(scores)
}
# Search for best features using a custom-defined function
custom_topn_l <- CaDrA::candidate_search(
FS = SummarizedExperiment::assay(sim_FS),
input_score = sim_Scores,
method = "custom", # Use custom scoring function
custom_function = customized_ks_rowscore, # Use a customized scoring function
custom_parameters = NULL, # Additional parameters to pass to custom_function
search_method = "both", # Apply both forward and backward search
top_N = 3, # Evaluate top 3 starting points for the search
max_size = 10, # Allow at most 10 features in meta-feature matrix
do_plot = FALSE, # We will plot it AFTER finding the best hits
best_score_only = FALSE # Return all results from the search
)
# Now we can fetch the feature set of top N feature that corresponded to the best scores over the top N search
custom_topn_best_meta <- topn_best(topn_list = custom_topn_l)
# Visualize best meta-feature result
meta_plot(topn_best_list = custom_topn_best_meta)
sessionInfo()
R version 4.3.3 (2024-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: UTC
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] CaDrA_1.0.2 testthat_3.2.1
[3] devtools_2.4.5 usethis_2.2.3
[5] pheatmap_1.0.12 SummarizedExperiment_1.32.0
[7] Biobase_2.62.0 GenomicRanges_1.54.1
[9] GenomeInfoDb_1.38.7 IRanges_2.36.0
[11] S4Vectors_0.40.2 BiocGenerics_0.48.1
[13] MatrixGenerics_1.14.0 matrixStats_1.2.0
loaded via a namespace (and not attached):
[1] bitops_1.0-7 tcltk_4.3.3 remotes_2.4.2.1
[4] rlang_1.1.3 magrittr_2.0.3 compiler_4.3.3
[7] reshape2_1.4.4 systemfonts_1.0.6 vctrs_0.6.5
[10] stringr_1.5.1 profvis_0.3.8 pkgconfig_2.0.3
[13] crayon_1.5.2 fastmap_1.1.1 XVector_0.42.0
[16] ellipsis_0.3.2 labeling_0.4.3 caTools_1.18.2
[19] utf8_1.2.4 promises_1.2.1 rmarkdown_2.26
[22] sessioninfo_1.2.2 ragg_1.2.7 purrr_1.0.2
[25] xfun_0.42 zlibbioc_1.48.0 cachem_1.0.8
[28] jsonlite_1.8.8 highr_0.10 later_1.3.2
[31] DelayedArray_0.28.0 parallel_4.3.3 R6_2.5.1
[34] bslib_0.6.1 stringi_1.8.3 RColorBrewer_1.1-3
[37] pkgload_1.3.4 brio_1.1.4 jquerylib_0.1.4
[40] Rcpp_1.0.12 iterators_1.0.14 knitr_1.45
[43] R.utils_2.12.3 R.cache_0.16.0 httpuv_1.6.14
[46] Matrix_1.6-5 rstudioapi_0.15.0 abind_1.4-5
[49] yaml_2.3.8 doParallel_1.0.17 gplots_3.1.3.1
[52] codetools_0.2-19 miniUI_0.1.1.1 misc3d_0.9-1
[55] pkgbuild_1.4.3 lattice_0.22-5 tibble_3.2.1
[58] plyr_1.8.9 shiny_1.8.0 withr_3.0.0
[61] evaluate_0.23 desc_1.4.3 urlchecker_1.0.1
[64] pillar_1.9.0 KernSmooth_2.23-22 foreach_1.5.2
[67] rprojroot_2.0.4 ggplot2_3.5.0 munsell_0.5.0
[70] scales_1.3.0 gtools_3.9.5 xtable_1.8-4
[73] glue_1.7.0 ppcor_1.1 tools_4.3.3
[76] fs_1.6.3 grid_4.3.3 colorspace_2.1-0
[79] GenomeInfoDbData_1.2.11 cli_3.6.2 textshaping_0.3.7
[82] fansi_1.0.6 S4Arrays_1.2.1 gtable_0.3.4
[85] R.methodsS3_1.8.2 sass_0.4.8 digest_0.6.35
[88] SparseArray_1.2.4 farver_2.1.1 htmlwidgets_1.6.4
[91] R.oo_1.26.0 memoise_2.0.1 htmltools_0.5.7
[94] pkgdown_2.0.7 lifecycle_1.0.4 mime_0.12
[97] MASS_7.3-60.0.1