Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Description: The SplicingFactory R package uses transcript-level expression
transcript isoform diversity within samples or between conditions.
Additionally, the package analyzes the isoform diversity data, looking for
significant changes between conditions.
RoxygenNote: 7.1.1
RoxygenNote: 7.3.3
Imports: SummarizedExperiment, methods, stats
Suggests:
testthat,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export(calculate_entropy)
export(calculate_gini)
export(calculate_inverse_simpson)
export(calculate_simpson)
export(calculate_tsallis_entropy)
import(methods)
import(stats)
importFrom(SummarizedExperiment,SummarizedExperiment)
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# SplicingFactory 1.3.2 (dev)

* Added Tsallis entropy as a diversity metric, with full documentation and examples.
* Users can now set the `q` parameter for Tsallis entropy in `calculate_diversity()` and `calculate_method()`.
* Documentation and vignette updated to reflect this new feature.

# SplicingFactory 1.3.1 (dev)

* Citation update
Expand Down
50 changes: 37 additions & 13 deletions R/calculate_diversity.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
#' input dataset with transcript-level expression values. The values in
#' \code{x} are grouped into genes based on this vector.
#' @param method Method to use for splicing diversity calculation, including
#' naive entropy (\code{naive}), Laplace entropy (\code{laplace}), Gini index
#' (\code{gini}), Simpson index (\code{simpson}) and inverse Simpson index
#' naive entropy (\code{naive}), Laplace entropy (\code{laplace}), Tsallis entropy (\code{tsallis}),
#' Gini index (\code{gini}), Simpson index (\code{simpson}) and inverse Simpson index
#' (\code{invsimpson}). The default method is Laplace entropy.
#' @param norm If \code{TRUE}, the entropy values are normalized to the number
#' of transcripts for each gene. The normalized entropy values are always
Expand All @@ -21,6 +21,7 @@
#' to use for diversity calculations.
#' @param verbose If \code{TRUE}, the function will print additional diagnostic
#' messages, besides the warnings and errors.
#' @param q Tsallis entropy parameter (q > 0). Only used if method = "tsallis". Default is 2. If q = 1, the function returns the Shannon entropy.
#' @return Gene-level splicing diversity values in a \code{SummarizedExperiment}
#' object.
#' @import methods
Expand All @@ -44,6 +45,8 @@
#' values mean a more diverse set of transcripts for a gene.
#' \item Laplace entropy: Shannon entropy where the transcript frequencies are
#' replaced by a Bayesian estimate, using Laplace's prior.
#' \item Tsallis entropy: A generalization of Shannon entropy, parameterized by q (q > 0).
#' For q → 1, Tsallis entropy converges to Shannon entropy. The default q is 2.
#' \item Gini index: a measure of statistical dispersion originally used in
#' economy. This measurement ranges from 0 (complete equality) to 1
#' (complete inequality). A value of 1 (complete inequality) means a single
Expand Down Expand Up @@ -73,7 +76,7 @@
#' # calculating normalized Laplace entropy
#' result <- calculate_diversity(x, gene, method = "laplace", norm = TRUE)
calculate_diversity <- function(x, genes = NULL, method = "laplace", norm = TRUE,
tpm = FALSE, assayno = 1, verbose = FALSE) {
tpm = FALSE, assayno = 1, verbose = FALSE, q = 2) {
if (!(is.matrix(x) || is.data.frame(x) || is.list(x) || is(x, "DGEList") ||
is(x, "RangedSummarizedExperiment") || is(x, "SummarizedExperiment"))) {
stop("Input data type is not supported! Please use `?calculate_diversity`
Expand Down Expand Up @@ -143,7 +146,7 @@ calculate_diversity <- function(x, genes = NULL, method = "laplace", norm = TRUE
stop("The number of rows is not equal to the given gene set.", call. = FALSE)
}

if (!(method %in% c("naive", "laplace", "gini", "simpson", "invsimpson"))) {
if (!(method %in% c("naive", "laplace", "tsallis", "gini", "simpson", "invsimpson"))) {
stop("Invalid method. Please use `?calculate_diversity` to see the possible
arguments and details.",
call. = FALSE
Expand All @@ -168,18 +171,39 @@ calculate_diversity <- function(x, genes = NULL, method = "laplace", norm = TRUE
have any effect on the calculation.", call. = FALSE)
}

result <- calculate_method(x, genes, method, norm, verbose = verbose)
result <- calculate_method(x, genes, method, norm, verbose = verbose, q = q)

# Prepare assay and row/col data
result_assay <- result[, -1, drop = FALSE]
rownames(result_assay) <- result[, 1]
result_rowData <- data.frame(genes = result[, 1], row.names = result[, 1])
result_colData <- data.frame(samples = colnames(x), row.names = colnames(x))
result_metadata <- list(method = method, norm = norm)

result <- SummarizedExperiment(assays = list(diversity = result_assay),
rowData = result_rowData,
colData = result_colData,
metadata = result_metadata)
if (method == "tsallis" && length(q) > 1) {
col_split <- do.call(rbind, strsplit(colnames(result)[-1], "_q="))
col_ids <- paste0(col_split[,1], "_q=", col_split[,2])
row_ids <- as.character(result[, 1])
result_colData <- data.frame(
samples = as.character(col_split[, 1]),
q = as.numeric(col_split[, 2]),
row.names = col_ids,
stringsAsFactors = FALSE
)
colnames(result_assay) <- col_ids
rownames(result_assay) <- row_ids
} else {
col_ids <- colnames(x)
row_ids <- as.character(result[, 1])
result_colData <- data.frame(samples = col_ids, row.names = col_ids)
colnames(result_assay) <- col_ids
rownames(result_assay) <- row_ids
}

result_metadata <- list(method = method, norm = norm)
if (method == "tsallis") result_metadata$q <- q

return(result)
SummarizedExperiment(
assays = list(diversity = result_assay),
rowData = result_rowData,
colData = result_colData,
metadata = result_metadata
)
}
35 changes: 32 additions & 3 deletions R/calculate_method.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@
#' input dataset with transcript-level expression values. The values in
#' \code{x} are grouped into genes based on this vector.
#' @param method Method to use for splicing diversity calculation, including
#' naive entropy (\code{naive}), Laplace entropy (\code{laplace}), Gini index
#' (\code{gini}), Simpson index (\code{simpson}) and inverse Simpson index
#' naive entropy (\code{naive}), Laplace entropy (\code{laplace}), Tsallis entropy (\code{tsallis}),
#' Gini index (\code{gini}), Simpson index (\code{simpson}) and inverse Simpson index
#' (\code{invsimpson}). The default method is Laplace entropy.
#' @param norm If \code{TRUE}, the entropy values are normalized to the number
#' of transcripts for each gene. The normalized entropy values are always
#' between 0 and 1. If \code{FALSE}, genes cannot be compared to each other,
#' due to possibly different maximum entropy values.
#' @param q Tsallis entropy parameter (q > 0). Only used if method = "tsallis". Default is 2. If q = 1, the function returns the Shannon entropy.
#' @param verbose If \code{TRUE}, the function will print additional diagnostic
#' messages, besides the warnings and errors.
#' @return Gene-level splicing diversity values in a \code{data.frame}, where
Expand All @@ -23,7 +24,8 @@
#' transcript-level expression values, aggregated by the genes defined in the
#' \code{genes} parameter.
#' @import stats
calculate_method <- function(x, genes, method, norm = TRUE, verbose = FALSE) {
calculate_method <- function(x, genes, method, norm = TRUE, verbose = FALSE, q = 2) {

if (method == "naive") {
x <- aggregate(x, by = list(genes), calculate_entropy, norm = norm)
}
Expand All @@ -33,6 +35,33 @@ calculate_method <- function(x, genes, method, norm = TRUE, verbose = FALSE) {
pseudocount = 1)
}

if (method == "tsallis") {
# It is not possible to use aggregate here, because it expect to return
# a single value per group, but calculate_tsallis_entropy can return
# multiple values (if length(q) > 1)
gene_levels <- unique(genes)
coln <- as.vector(outer(colnames(x), q, function(s, qq) paste0(s, "_q=", qq)))
rown <- gene_levels
tsallis_row <- function(gene) {
idx <- which(genes == gene)
unlist(lapply(seq_len(ncol(x)), function(j) {
v <- calculate_tsallis_entropy(x[idx, j], q = q, norm = norm)
if (length(v) == length(q) && all(is.finite(v) | is.na(v))) v else setNames(rep(NA_real_, length(q)), paste0("q=", q))
}))
}
result_mat <- t(vapply(gene_levels, tsallis_row, FUN.VALUE = setNames(numeric(length(coln)), coln)))
colnames(result_mat) <- coln
rownames(result_mat) <- rown
out_df <- data.frame(Gene = rown, result_mat, check.names = FALSE)
if (all(rowSums(!is.na(result_mat)) == 0)) {
out_df <- data.frame(Gene=character(0))
for (nm in coln) out_df[[nm]] <- numeric(0)
x <- out_df
return(x)
}
x <- out_df
}

if (method == "gini") {
x <- aggregate(x, by = list(genes), calculate_gini)
}
Expand Down
56 changes: 56 additions & 0 deletions R/diversity_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,59 @@ calculate_inverse_simpson <- function(x) {
}
return(x)
}

#' Calculate Tsallis entropy for a vector of transcript-level
#' expression values of one gene.
#'
#' @param x Vector of expression values.
#' @param q Tsallis entropy parameter (q > 0). Default is 2. If q = 1, the function returns the Shannon entropy.
#' @param norm If \code{TRUE}, the entropy values are normalized to the number
#' of transcripts for each gene. The normalized entropy values are always
#' between 0 and 1. If \code{FALSE}, genes cannot be compared to each other,
#' due to possibly different maximum entropy values.
#' @export
#' @return A single gene-level Tsallis entropy value.
#' @details
#' The function calculates the Tsallis entropy, a generalization of Shannon entropy.
#' For q → 1, Tsallis entropy converges to Shannon entropy.
#' If there is only a single transcript, the value will be NaN.
#' If the expression of the given gene is 0, the value will be NA.
##' @examples
##' x <- rnbinom(5, size = 10, prob = 0.4)
##' tsallis <- calculate_tsallis_entropy(x, q = 2)
##'
##' @param q Tsallis entropy parameter (q > 0). Can be a single value or a numeric vector. Default is 2. If q = 1, the function returns the Shannon entropy.
calculate_tsallis_entropy <- function(x, q = 2, norm = TRUE) {
if (!is.numeric(q)) stop("q must be numeric.")
if (any(q <= 0)) stop("q must be greater than 0.")
if (sum(x) != 0 & length(x) > 1) {
p <- x / sum(x)
tsallis_vec <- vapply(q, function(qi) {
if (abs(qi - 1) < .Machine$double.eps^0.5) {
# q == 1, return Shannon entropy
shannon <- -sum(ifelse(p > 0, p * log2(p), 0))
if (norm) {
shannon <- shannon / log2(length(x))
}
return(shannon)
} else {
tsallis <- (1 - sum(p^qi)) / (qi - 1)
if (norm) {
max_tsallis <- (1 - length(x)^(1 - qi)) / (qi - 1)
tsallis <- tsallis / max_tsallis
}
return(tsallis)
}
}, numeric(1))
if (length(q) == 1) {
return(unname(tsallis_vec))
} else {
names(tsallis_vec) <- paste0("q=", q)
return(tsallis_vec)
}
} else if (length(x) == 1) {
return(rep(NaN, length(q)))
} else {
return(rep(NA_real_, length(q)))
}
}
11 changes: 8 additions & 3 deletions man/calculate_diversity.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 5 additions & 3 deletions man/calculate_method.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

37 changes: 37 additions & 0 deletions man/calculate_tsallis_entropy.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 32 additions & 0 deletions tests/testthat/test-calculate_diversity.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,35 @@ test_that("Basic input error handling is working.", {
}
}
})

test_that("calculate_diversity supports q as a vector and returns correct metadata", {
x <- matrix(c(0, 0, 5, 4, 1, 2, 2, 2, 2, 2), ncol = 2)
colnames(x) <- c("Sample1", "Sample2")
gene <- c("Gene1", "Gene1", "Gene1", "Gene1", "Gene1")
qvec <- c(1.1, 1.5, 2)
result <- calculate_diversity(x, gene, method = "tsallis", norm = TRUE, q = qvec)
# Assay should have columns for each q and sample
assay_names <- colnames(assay(result))
for (qi in qvec) {
expect_true(any(grepl(paste0("q=", qi), assay_names)))
}
# Metadata should contain the q vector
expect_equal(S4Vectors::metadata(result)$q, qvec)
})
test_that("calculate_diversity passes q parameter for Tsallis entropy", {
x <- matrix(c(0, 0, 5, 4, 1, 2, 2, 2, 2, 2), ncol = 2)
colnames(x) <- c("Sample1", "Sample2")
gene <- c("Gene1", "Gene1", "Gene1", "Gene1", "Gene1")
# Calculate with q = 2
result_q2 <- calculate_diversity(x, gene, method = "tsallis", norm = TRUE, q = 2)
# Calculate with q = 1.5
result_q15 <- calculate_diversity(x, gene, method = "tsallis", norm = TRUE, q = 1.5)
# Extract values
val_q2 <- assay(result_q2)[1, 1]
val_q15 <- assay(result_q15)[1, 1]
expect_true(is.numeric(val_q2))
expect_true(is.numeric(val_q15))
expect_false(is.na(val_q2))
expect_false(is.na(val_q15))
expect_false(abs(val_q2 - val_q15) < 1e-8) # Should be different for different q
})
Loading