Source code for python_bioinformagicks.tools._do_gprofiler_analysis
import requests
import pandas as pd
[docs]
def do_gprofiler_analysis(
    genes: list[str],
    organism: str = "hsapiens",
    max_term_size: int = 10000,
    ordered: bool = False,
    # NOTE: list default kept for interface compatibility; it is only read
    # (serialized into the request payload), never mutated.
    sources: list[str] = ["GO:BP", "GO:MF", "REAC", "KEGG", "TF"],
    highlight: bool = True,
):
    """
    Given a list of genes, performs an over-representation
    analysis (ORA) using the gProfiler API.

    Parameters
    ----------

    genes: list of str
        List of genes to query, optionally sorted by user
        based on FDR or another significance metric.

    organism: str (default: "hsapiens")
        The organism ID to use for the query.
        See https://biit.cs.ut.ee/gprofiler/page/organism-list
        for a list of organism IDs supported
        by gProfiler.
        Commonly one of: ["hsapiens", "mmusculus"]

    max_term_size: uint (default: 10000)
        Filter results table to only include those
        results with term sizes strictly less than this
        maximum. Set to None (or a non-positive value)
        to disable the filter.

    ordered: bool (default: False)
        If the list of genes is ordered by
        descending significance, set to True,
        otherwise set to False. A slightly different
        ORA is performed on ordered gene lists.

    sources: list of str (default: ["GO\\:BP", "GO\\:MF", "REAC", "KEGG", "TF"])
        The list of source databases to consider for
        gProfiler ORA. Some may only be available for
        certain organisms; see organism list page in
        the gProfiler documentation for more
        information.

    highlight: bool (default: True)
        If True, request that gProfiler add a 'highlighted'
        column to the resulting dataframe indicating if a
        given term is a 'driver' term. Forced to False
        when `ordered` is True.
        See: https://biit.cs.ut.ee/gprofiler/page/docs#highlight_go

    Returns
    -------

    ora_df: pandas.DataFrame
        The resulting gProfiler results dataframe.

    Raises
    ------

    requests.HTTPError
        If the gProfiler API responds with an HTTP error status.

    References
    ----------

    * https://biit.cs.ut.ee/gprofiler/gost
    * https://doi.org/10.1093/nar/gkad347

    Usage
    -----
    >>> diff_exp_genes = ["SFTPC", "LAMP3", "NAPSA", "SFTPB", "EPCAM", "COL1A1"]
    >>> ora_df = do_gprofiler_analysis(diff_exp_genes, ordered=False)
    >>> print(ora_df.head(1)[["source", "name", "p_value", "intersections"]])
      source                   name   p_value          intersections
    0   REAC  Surfactant metabolism  0.000033  [SFTPC, NAPSA, SFTPB]
    """
    # check if this is an ordered query; if it is, gProfiler
    # API cannot perform highlighting (interesting)
    if ordered and highlight:
        highlight = False

    res = requests.post(
        url="https://biit.cs.ut.ee/gprofiler/api/gost/profile/",
        json={
            "organism": organism,
            "query": genes,
            "sources": sources,
            "highlight": highlight,
            "ordered": ordered,
            "no_evidences": True,
            "no_iea": False,
        },
        headers={
            "User-Agent": "FullPythonRequest",
        },
    )
    # Fail loudly on HTTP errors instead of hitting an opaque JSON/KeyError
    # while parsing an error payload below.
    res.raise_for_status()

    ora_df = pd.DataFrame.from_dict(res.json()["result"])

    # Logical `and` is required here: the previous bitwise `&` between the
    # int and the bool only tested the low bit of max_term_size, so even
    # values (including the default 10000) skipped the filter and None
    # raised TypeError. The column check keeps an empty result (a frame
    # without a "term_size" column) from raising KeyError.
    if (
        max_term_size is not None
        and max_term_size > 0
        and "term_size" in ora_df.columns
    ):
        ora_df = ora_df[ora_df["term_size"] < max_term_size]

    return ora_df