# Source code for python_bioinformagicks.plotting._plot_gprofiler_results

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from natsort import natsorted

from ..utilities._truncate_colormap import truncate_colormap


# [docs]
def plot_gprofiler_results(
    ora_df: pd.DataFrame,
    title: str = "",
    cmap: str = "Blues",
    n_terms: int = 15,
    sort_by: str = "fold_enrichment",
    min_FE: float = 0,
    fontsize: int = 20
):
    """
    Generates a barplot representing gProfiler
    ORA results, with:
    `x = -log10(FDR)`
    `y = term name`
    `color = term fold enrichment`

    Parameters
    ----------

    ora_df: pandas.DataFrame
        The results of a gProfiler ORA. The columns
        read here are `source`, `name`, `p_value`,
        `intersection_size`, `query_size`, `term_size`
        and `effective_domain_size`. The dataframe
        itself is not modified.

    title: str (default: "")
        The base title of the plot. Any `gProfiler_`
        substring is removed and underscores are
        replaced by spaces before display.

    cmap: str (default: "Blues")
        The matplotlib colormap to map to the
        color parameter (fold-enrichment, FE)

    n_terms: uint (default: 15)
        The number of over-represented terms
        to keep for plotting, after sorting
        according to `sort_by`.

    sort_by: str (default: 'fold_enrichment')
        One of `(fold_enrichment, FE)` or
        `(false_discovery_rate, pval, FDR)`.
        With the former, sort the terms by
        term fold enrichment (FE), i.e.
        `(n_in_term/n_expected_in_term)`
        before cutting off to top n_terms.
        With the latter, sort instead by statistical
        significance before cutting off.
        Any other value keeps the input row order.

    min_FE: float (default: 0)
        Ignore terms if their term fold enrichment
        is below this minimum value.

    fontsize: int (default: 20)
        Font size applied to the titles, axis labels
        and tick labels of each plot and its colorbar.

    Returns
    -------

    fig: matplotlib.Figure
        The resulting figures, one per source,
        aligned vertically. Sources with nothing to
        plot keep their slot but the axes are hidden.
        `None` is returned if `ora_df` contains no
        sources at all.
    """

    # Determine how many subplots to make
    sources = natsorted(ora_df["source"].unique())
    n_sources = len(sources)

    # This must happen before plt.subplots(), which rejects a zero-row grid
    if (n_sources == 0):
        print("No ORA result sources (GO:BP, GO:MF, ...) in dataframe.")
        return None

    # squeeze=False always yields a 2D array, so a single source needs
    # no special-casing
    fig, axs = plt.subplots(
        n_sources, 1,
        figsize=(10, 7 * n_sources),
        squeeze=False
    )
    axs = axs[:, 0]

    # generate a pleasant colormap to represent term fold enrichment
    # (identical for every source, so it is built once)
    new_cmap = truncate_colormap(
        matplotlib.colormaps.get_cmap(cmap),
        minval=0.4,
        maxval=0.9
    )
    color_facet = "FE"

    # width to which TF term names are right-justified
    max_len = 50

    for ax, source in zip(axs, sources):
        # filter to just this source
        d = ora_df[ora_df["source"] == source]
        if (len(d) < 2):
            ax.set_visible(False)
            continue

        # remove 'match class' terms, which are often duplicates of TF terms
        # .copy() so the column assignments below act on an independent
        # frame rather than on a slice of the caller's dataframe
        d = d[~d["name"].str.contains("match class")].copy()

        # calculate term fold enrichment
        d["n_in_term"] = d["intersection_size"].to_numpy()
        d["n_expected_in_term"] = (
            d["query_size"] * d["term_size"] / d["effective_domain_size"]
        )
        # the 0.05 pseudo-count keeps the ratio finite for tiny expectations
        d["FE"] = d["n_in_term"] / (0.05 + d["n_expected_in_term"])

        # calculate -log10(p_adj)
        # gProfiler automatically converts to FDR/p_adj, then re-labels as pvals
        d["FDR"] = d["p_value"]
        d["-log10(FDR)"] = -np.log10(d["FDR"])

        # clean up term names for compactness
        if (source == "TF"):
            d["name"] = d["name"].apply(lambda x: _shorten_TF_term(x, max_len))
        elif ("GO:" in source):
            d["name"] = d["name"].apply(_shorten_GO_term)

        # remove low term fold enrichment terms
        df = d[d["FE"] >= min_FE]

        # sort before cutting to top n_terms
        if (sort_by in ["fold_enrichment", "FE"]):
            df = df.sort_values("FE", ascending=False)
        elif (sort_by in ["false_discovery_rate", "pval", "FDR"]):
            df = df.sort_values("FDR", ascending=True)

        # barh draws the first row at the bottom, so order by descending FDR
        # to put the most significant term on top
        data = df.head(n_terms).sort_values("FDR", ascending=False)

        # nothing survived the filters: there is nothing to draw or scale
        if data.empty:
            ax.set_visible(False)
            continue

        # scale fold enrichment to [0, 1] for the colormap
        raw_vals = data[color_facet].to_numpy()
        fe_lo = raw_vals.min()
        fe_hi = raw_vals.max()
        if not (fe_hi - fe_lo > 0):
            # a single term, or all terms equally enriched: widen the range
            # so the scaling below does not divide by zero
            fe_lo, fe_hi = fe_lo - 0.5, fe_hi + 0.5
        vals = (raw_vals - fe_lo) / (fe_hi - fe_lo)
        cvals = [new_cmap(v) for v in vals]

        # draw the bars
        ax.barh(
            data["name"],
            data["-log10(FDR)"],
            color=cvals,
            edgecolor="black",
            linewidth=2
        )

        # format the graph
        ax.set_xlabel("-log10(FDR)")
        ax.set_ylabel(source)
        ax.set_title(title.replace("gProfiler_", "").replace("_", " "))
        ax.grid(False)

        # add the colorbar; it shares the normalization used for the bars so
        # its tick labels report the actual fold-enrichment values
        cbar = fig.colorbar(
            matplotlib.cm.ScalarMappable(
                norm=matplotlib.colors.Normalize(vmin=fe_lo, vmax=fe_hi),
                cmap=new_cmap
            ),
            ticks=np.linspace(fe_lo, fe_hi, 3),
            format="%.1f",
            ax=ax,
        )
        cbar.ax.set_ylabel("Term fold enrichment")

        # set the font sizes on both the plot and its colorbar
        for item in (
            [ax.title, ax.xaxis.label, ax.yaxis.label]
            + [cbar.ax.title, cbar.ax.xaxis.label, cbar.ax.yaxis.label]
        ):
            item.set_fontsize(fontsize)
        # tick_params also covers ticks that matplotlib regenerates at draw time
        ax.tick_params(axis="both", labelsize=fontsize)
        cbar.ax.tick_params(axis="both", labelsize=fontsize)

    return fig



def _shorten_GO_term(x):
    ret = x.replace("ositive", "os.")
    ret = ret.replace("egative", "eg.")
    ret = ret.replace("egulation", "eg.")
    return ret

def _shorten_TF_term(x, max_len):
    ret = x.split("Factor: ")[1]
    ret = ret.replace(";","")
    ret = ret.rjust(max_len)
    return ret