Source code for python_bioinformagicks.tools._tf_idf_markers

"""
Code for identifying marker genes based on
term-frequency, inverse-document frequency
testing (TF-IDF).
"""

import scanpy as sc
import anndata as ad

import pandas as pd
import numpy as np

from scipy.stats import (
    hypergeom,
    rankdata
)


[docs]
def tf_idf_markers(
    adata: ad.AnnData, 
    groupby: str, 
    n_genes: int = 10, 
    n_genes_to_plot: int = 10, 
    redo_threshold: bool = False,
    dendrogram: bool = False, 
    save_markers: bool = False, 
    return_fig: bool = False,
    var_key_to_ignore: str = None,
    **kwargs
):
    """
    Performs term-frequency, inverse-document frequency 
    calculation in order to rank marker genes across
    groups in groupby. This is an alternative to mean expression
    based DEG testing that prioritizes uniqueness of genes 
    over fold-change in expression across groups. 

    TODO: make compatible with :code:`scanpy.tl.rank_genes_groups`
    
    Parameters
    ---------
    adata: ad.AnnData
        The anndata object for this experiment
    
    groupby: str
        The column in adata.obs that we will split the 
        data on for the TF-IDF test. 
        
    n_genes: int > 0 (default: 10)
        The number of top genes per group to return 
        in the marker gene dictionary
    
    n_genes_to_plot: int in range (0, n_genes] (default: 10)
        The number of genes per group to plot in the 
        resulting dotplot. Set to 0 to disable plot generation.
    
    redo_threshold: bool (default: False)
        If :code:`adata.layers["binary"]` exists, redo the 
        calculation of the binary matrix. Possibly useful 
        when performing TF-IDF ranking on a subset of adata.
        
    save_markers: bool (default: False)
        If True, store the marker gene matrix in the adata
        object as :code:`adata.uns["TF_IDF_" + groupby]`
            
    return_fig: bool (default: False)
        If True, output of this function changes to a tuple
        of dict, dpl (dotplot object from scanpy) instead
        of just dict. Otherwise, dotplot is generated and
        shown but not returned from this function.
    
    var_key_to_ignore: str (default: None)
        The name of a boolean column in adata.var which
        indicates genes that should be ignored during marker
        gene testing (often mitochondrial genes, pseudogenes,
        ribosomal genes, etc.). If `None`, no gene masking
        is performed.
        
    kwargs:
        Arguments passed to :code:`sc.pl.dotplot` 
        (if `n_genes_to_plot>0`)

    Returns
    -------
    marker_dict: dict
        The marker gene dictionary. 
        Keys are :code:`adata.obs[groupby].unique()`
    
    dpl: sc.pl.DotPlot (optional)
        If :code:`plot is True` and :code:`return_fig is True`, 
        scanpy DotPlot object is also returned for 
        further processing/saving.
    
    References
    ----------

    * https://github.com/constantAmateur/SoupX
    * https://doi.org/10.1093/gigascience/giaa151
    * https://constantamateur.github.io/2020-04-10-scDE
    """

    # binarize the gene expression data
    if ((redo_threshold) 
    or ("binary" not in adata.layers.keys())):
        threshold = np.percentile(adata.X.data, 5)
        X = adata.X.copy()
        X.data = np.where(adata.X.data > threshold, 1, 0)
        adata.layers["binary"] = X.copy().astype(np.bool_)
        del X
    
    # get gene (var) mask
    if (var_key_to_ignore in adata.var.columns):
        gene_mask = ~adata.var[var_key_to_ignore].astype(np.bool_)
    else:
        gene_mask = np.array([True for x in adata.var.index])
    
    # calculate the TF matrix
    clusters = adata.obs[groupby].astype("category").cat.categories
    X_clust  = []
    for c in clusters:
        cell_mask = adata.obs[groupby] == c
        expression = np.squeeze(
            np.asarray(
                np.sum(adata[cell_mask, gene_mask].layers["binary"], axis=0)
            )
        )
        X_clust.append(expression)
    cluster_max = np.max(X_clust, axis=1)
    TF = np.asarray(X_clust).T / cluster_max
    TF = TF.T
    
    # calculate the IDF vector
    n_cells = len(adata.obs.index)
    n_cells_expressing_gene = adata[:, gene_mask].layers["binary"].getnnz(axis=0)
    IDF = (np.log2((n_cells + 1) / (n_cells_expressing_gene + 1)))
    
    # calculate the TF-IDF matrix (marker gene scores)
    TF_IDF = TF * IDF
    TF_IDF = pd.DataFrame(
        data=TF_IDF, 
        index=clusters, 
        columns=adata[:, gene_mask].var.index.tolist()
    ).T
    if (save_markers):
        adata.uns["TF_IDF_" + groupby] = TF_IDF
    
    """
    Do hypergeometric test to assign p-values to this
    distribution of gene-cluster frequencies.
    M is the total number of objects, 
    n is total number of Type I objects. 
    N is the number of items selected/draws
    k is the observed number of Type I objects selected/drawn

    Suppose we have a collection of 20 genes, of which 7 are DEGs. 
    Then if we want to know the probability of finding a given number 
    of DEGs if we choose at random 12 of the 20 genes, we can 
    initialize a frozen distribution and plot the probability mass function:
    [M, n, N] = [20, 7, 12]
    """

    k = TF
    M = n_cells
    n = n_cells_expressing_gene
    N = cluster_max
    pvals = {}
    adjusted_pvals  = {}
    for i, c in enumerate(clusters):
        pvals[c] = hypergeom.pmf(k[i], M, n, N[i])
        adjusted_pvals[c]  = _bh_correct(pvals[c])
    adjusted_pvals = pd.DataFrame.from_dict(
        adjusted_pvals, 
        orient="index", 
        columns=adata[:, gene_mask].var.index.tolist()
    ).T
    
    # get the top N genes for each cluster
    target_fdr = 0.05
    top_genes = {}
    top_adjusted_pvals  = {}
    for i, c in enumerate(clusters):
        mask = adjusted_pvals[c] <= target_fdr
        df = TF_IDF[mask].sort_values(c, ascending=False).head(n_genes)
        genes = df.index.tolist()
        genes_idx = [adata[:, gene_mask].var.index.get_loc(g) for g in genes]
        top_genes[c] = genes
        top_adjusted_pvals[c]  = adjusted_pvals[c].iloc[genes_idx]
    
    if (n_genes_to_plot > 0):
        if (0 < n_genes_to_plot < n_genes):
            plot_me = {}
            for k in clusters:
                plot_me[k] = top_genes[k][0:n_genes_to_plot]
        else:
            plot_me = top_genes

        dpl = sc.pl.dotplot(
            adata, 
            plot_me, 
            groupby=groupby, 
            dendrogram=dendrogram,
            return_fig=True,
            **kwargs
        )
        dpl.add_totals(color="lightgray").style(cmap="Reds")
        
        if (return_fig):
            return top_genes, dpl
        else:
            dpl.show()
    
    return top_genes



def _bh_correct(
    pvals: np.array
):
    """
    Perform Benjamini-Hochberg p-value adjustment
    for multiple comparisons.

    Parameters
    ----------

    pvals: np.array (in range [0,1])
        The uncorrected p-values

    Returns
    -------

    adjusted_pvals: np.array (in range [0,1])
        The BH adjusted p-values
    """

    ranked_pvals = rankdata(pvals)
    adjusted_pvals = pvals * len(pvals) / ranked_pvals
    adjusted_pvals[adjusted_pvals > 1] = 1
    return adjusted_pvals