Source code for python_bioinformagicks.tools._do_gprofiler_analysis

import requests
import pandas as pd

def do_gprofiler_analysis(
    genes: list[str],
    organism: str = "hsapiens",
    max_term_size: int = 10000,
    ordered: bool = False,
    sources: list[str] = ["GO:BP", "GO:MF", "REAC", "KEGG", "TF"],
    highlight: bool = True,
):
    """
    Given a list of genes, performs an over-representation
    analysis (ORA) using the gProfiler API.

    Parameters
    ----------

    genes: list of str
        List of genes to query, optionally sorted by user based
        on FDR or another significance metric.

    organism: str (default: "hsapiens")
        The organism ID to use for the query. See
        https://biit.cs.ut.ee/gprofiler/page/organism-list
        for a list of organism IDs supported by gProfiler.
        Commonly one of: ["hsapiens", "mmusculus"]

    max_term_size: int or None (default: 10000)
        Filter results table to only include those results with
        term sizes strictly less than this maximum. Set to None
        (or any value <= 0) to disable filtering.

    ordered: bool (default: False)
        If the list of genes is ordered by descending significance,
        set to True, otherwise set to False. A slightly different
        ORA is performed on ordered gene lists.

    sources: list of str (default: ["GO\\:BP", "GO\\:MF", "REAC", "KEGG", "TF"])
        The list of source databases to consider for gProfiler ORA.
        Some may only be available for certain organisms; see
        organism list page in the gProfiler documentation for more
        information.

    highlight: bool (default: True)
        If True, request that gProfiler add a 'highlighted' column
        to the resulting dataframe indicating if a given term is
        a 'driver' term. Forced to False when ``ordered`` is True.
        See: https://biit.cs.ut.ee/gprofiler/page/docs#highlight_go

    Returns
    -------

    ora_df: pandas.DataFrame
        The resulting gProfiler results dataframe. May be empty
        (with no columns) when the query yields no results.

    Raises
    ------

    requests.HTTPError
        If the gProfiler API responds with an HTTP error status.

    References
    ----------

    * https://biit.cs.ut.ee/gprofiler/gost
    * https://doi.org/10.1093/nar/gkad347

    Usage
    -----
    >>> diff_exp_genes = ["SFTPC", "LAMP3", "NAPSA", "SFTPB", "EPCAM", "COL1A1"]
    >>> ora_df = do_gprofiler_analysis(diff_exp_genes, ordered=False)
    >>> print(ora_df.head(1)[["source", "name", "p_value", "intersections"]])
      source                   name   p_value          intersections
    0   REAC  Surfactant metabolism  0.000033  [SFTPC, NAPSA, SFTPB]
    """

    # check if this is an ordered query; if it is, gProfiler
    # API cannot perform highlighting (interesting)
    if ordered and highlight:
        highlight = False

    res = requests.post(
        url="https://biit.cs.ut.ee/gprofiler/api/gost/profile/",
        json={
            "organism": organism,
            "query": genes,
            "sources": sources,
            "highlight": highlight,
            "ordered": ordered,
            "no_evidences": True,
            "no_iea": False,
        },
        headers={
            "User-Agent": "FullPythonRequest",
        },
    )

    # Surface HTTP failures directly instead of an opaque
    # KeyError / JSON decode error further down.
    res.raise_for_status()

    ora_df = pd.DataFrame.from_dict(res.json()["result"])

    # NOTE: this must be a logical `and`, not bitwise `&`. With `&`,
    # any even max_term_size (e.g. the default 10000) evaluates
    # `10000 & True == 0` and the filter is silently skipped, while
    # None raises a TypeError on the `> 0` comparison.
    apply_filter = max_term_size is not None and max_term_size > 0

    # An empty result set has no "term_size" column, so skip the
    # filter rather than raising a KeyError.
    if apply_filter and "term_size" in ora_df.columns:
        ora_df = ora_df[ora_df["term_size"] < max_term_size]

    return ora_df