import requests
import numpy as np, pandas as pd
= "http://biit.cs.ut.ee/gprofiler/"
BASE_URL = {'User-Agent': 'python-gprofiler'}
HEADERS
def gprofiler(
=20, organism='hsapiens', ordered_query=False, significant=True,
query, topk=False, region_query=False, max_p_value=1.0, max_set_size=0,
exclude_iea='analytical', hier_filtering='none',
correction_method='annotated', custom_bg=[], numeric_ns='', no_isects=False,
domain_size=None, include_graph=False, src_filter=None, mode='enrich'
png_fn
):''' Annotate gene list functionally
Interface to the g:Profiler tool for finding enrichments in gene lists. Organism
names are constructed by concatenating the first letter of the name and the
family name. Example: human - 'hsapiens', mouse - 'mmusculus'. If requesting PNG
output, the request is directed to the g:GOSt tool in case 'query' is a vector
and the g:Cocoa (compact view of multiple queries) tool in case 'query' is a
list. PNG output can fail (return FALSE) in case the input query is too large.
In such case, it is advisable to fall back to a non-image request.
Returns a pandas DataFrame with enrichment results or None
'''
= ''
query_url = 'enrich' # Currently gprofiler gconvert gives an HTTP 405 error.
mode if mode == 'enrich':
= BASE_URL + 'gcocoa.cgi'
my_url elif mode == 'lookup':
= BASE_URL + 'convert.cgi'
my_url = True if png_fn else False
wantpng = 'mini_png' if wantpng else 'mini'
output_type if wantpng:
raise NotImplementedError('PNG Output not implemented')
return
if include_graph:
raise NotImplementedError('Biogrid Interactions not implemented (include_graph)')
return
# Query
= list(query)
qnames if not qnames:
raise ValueError('Missing query')
= ' '.join(qnames)
query_url
# Significance threshold
if correction_method == 'gSCS':
= 'analytical'
correction_method if correction_method not in ('analytical', 'fdr', 'bonferroni'):
raise ValueError("Multiple testing correction method not recognized (correction_method)")
# Hierarchical filtering
if hier_filtering not in ('none', 'moderate', 'strong'):
raise ValueError("hier_filtering must be one of \"none\", \"moderate\" or \"strong\"")
if hier_filtering == 'strong':
= 'compact_ccomp'
hier_filtering elif hier_filtering == 'moderate':
= 'compact_rgroups'
hier_filtering else:
= ''
hier_filtering
# Domain size
if domain_size not in ('annotated', 'known'):
raise ValueError("domain_size must be one of \"annotated\" or \"known\"")
# Custom background
if isinstance(custom_bg, list):
= ' '.join(custom_bg)
custom_bg else:
raise TypeError('custom_bg need to be a list')
# Max. set size
if max_set_size < 0:
= 0
max_set_size
# HTTP request
if mode == 'enrich':
= {
query_params 'organism': organism,
'query': query_url,
'output': output_type,
'analytical': '1',
'sort_by_structure': '1',
'ordered_query': '1' if ordered_query else '0',
'significant': '1' if significant else '0',
'no_iea': '1' if exclude_iea else '0',
'as_ranges': '1' if region_query else '0',
'omit_metadata': '0' if include_graph else '1',
'user_thr': str(max_p_value),
'max_set_size': str(max_set_size),
'threshold_algo': correction_method,
'hierfiltering': hier_filtering,
'domain_size_type': domain_size,
'custbg_file': '',
'custbg': custom_bg,
'prefix': numeric_ns,
'no_isects': '1' if no_isects else '0'
}elif mode == 'lookup':
= {
query_params 'query': query_url,
'target': 'GO'
}if src_filter:
for i in src_filter:
'sf_' + i] = '1'
query_params[= requests.post(my_url, data=query_params, headers=HEADERS)
raw_query
# Here PNG request parsing would go, but not implementing that
if wantpng:
pass
# Requested text
= raw_query.text.split('\n')
split_query
# Here interaction parsing would go, but not implementing that
if include_graph:
pass
# Parse main result body
= [
split_query '\t')
s.split(for s in split_query
if s and not s.startswith('#')
]
= pd.DataFrame(split_query)
enrichment if mode == 'enrich':
= [
colnames "query.number", "significant", "p.value", "term.size",
"query.size", "overlap.size", "recall", "precision",
"term.id", "domain", "subgraph.number", "term.name",
"relative.depth", "intersection"
]= [
numeric_colnames "query.number", "p.value", "term.size",
"query.size", "overlap.size", "recall",
"precision", "subgraph.number", "relative.depth"
]elif mode == 'lookup':
return enrichment
#colnames = ["alias.number", "alias", "target.number", "target", "name", "description", "namespace"]
#numeric_colnames = ["alias.number", "target.number"]
if enrichment.shape[1] > 0:
= colnames
enrichment.columns = enrichment['term.id']
enrichment.index = numeric_colnames
numeric_columns for column in numeric_columns:
= pd.to_numeric(enrichment[column])
enrichment[column] if mode == 'enrich':
'significant'] = enrichment['significant'] == '!'
enrichment[else:
= None
enrichment # Only report most significant results
if (enrichment is not None) and (enrichment.shape[0] > 0):
= np.array(np.argsort(enrichment['p.value']))
x = enrichment.iloc[x[:topk], :]
enrichment return enrichment
def viz_gprofiler(selected_genes, topk=20):
= gprofiler(selected_genes, topk=topk)
go_results = { 'MF': '#CB3C19', 'BP': '#FA9D18', 'CC': '#198520', 'keg': '#D9869B', 'rea': '#335ACC', 'tf': '#4F6E9B', 'mir': '#53B8AD', 'hpa': '#542DB1', 'cor': '#6AAB19', 'hp': '#8C0784'}
database_colors = {'MF': 'Molecular function', 'BP': 'Biological process', 'CC': 'Cellular component', 'keg': 'KEGG', 'rea': 'REAC', 'tf': 'TF', 'mir': 'MIRNA', 'hpa': 'HPA', 'cor': 'CORUM', 'hp': 'HP'}
database_IDs = -np.log10(go_results['p.value'].astype(float))
top_go_logpvals = go_results['domain']
top_go_dbIDs = go_results['term.name']
top_go_termnames = go_results['term.id']
top_go_termIDs = go_results['intersection']
top_go_queryhits = np.array([database_colors[x] for x in top_go_dbIDs])
bar_colors = []
panel_data = np.arange(len(top_go_dbIDs))[::-1]
ordi for c in np.unique(bar_colors):
= np.where(bar_colors == c)[0]
trace_locs = trace_locs[::-1] # Reverse because it's better.
trace_locs
panel_data.append({'name': database_IDs[top_go_dbIDs[trace_locs[0]]],
'x': top_go_logpvals[trace_locs],
# 'y': ordi[trace_locs],
'y': top_go_termIDs[trace_locs],
'hovertext': [
"-log10(p): {}<br>{}<br>{}".format(
str(round(top_go_logpvals[t], 2)),
top_go_termnames[t],
top_go_termIDs[t]for t in trace_locs],
) 'text': top_go_termnames[trace_locs]
})return panel_data
Looking up gene sets in databases
Looking up the significance of gene sets
The Gene Ontology (GO) is a controlled vocabulary for describing gene function. It is organized as a directed acyclic graph with three main branches: biological process, molecular function, and cellular component. The GO is a useful tool for summarizing the results of an analysis over protein-coding genes.
Querying the GO is often performed alongside queries to other databases, such as those for protein-protein interaction. Here is code for querying several major databases all at once, using the g:Profiler web service (Raudvere et al. 2019). Along with the excellent gget
package (Luebbert and Pachter 2023), this covers many common gene-set lookup use cases. Here is a wrapper, adapting an excellent existing Python port for the gProfiler API to report the heterogeneous results of the lookup in a pandas dataframe.
= gprofiler(['NF1', 'KRAS', 'RAF1']) go_results
go_results
query.number | significant | p.value | term.size | query.size | overlap.size | recall | precision | term.id | domain | subgraph.number | term.name | relative.depth | intersection | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
term.id | ||||||||||||||
GO:0021896 | 1 | True | 0.000012 | 2 | 3 | 2 | 0.667 | 1.000 | GO:0021896 | BP | 11 | forebrain astrocyte differentiation | 2 | KRAS,NF1 |
GO:0021897 | 1 | True | 0.000012 | 2 | 3 | 2 | 0.667 | 1.000 | GO:0021897 | BP | 11 | forebrain astrocyte development | 3 | KRAS,NF1 |
REAC:R-HSA-6802949 | 1 | True | 0.000018 | 51 | 3 | 3 | 1.000 | 0.059 | REAC:R-HSA-6802949 | rea | 8 | Signaling by RAS mutants | 3 | RAF1,KRAS,NF1 |
KEGG:01521 | 1 | True | 0.000055 | 79 | 3 | 3 | 1.000 | 0.038 | KEGG:01521 | keg | 50 | EGFR tyrosine kinase inhibitor resistance | 1 | RAF1,KRAS,NF1 |
REAC:R-HSA-6802953 | 1 | True | 0.000056 | 4 | 3 | 2 | 0.667 | 0.500 | REAC:R-HSA-6802953 | rea | 8 | RAS signaling downstream of NF1 loss-of-f... | 3 | KRAS,NF1 |
REAC:R-HSA-6802957 | 1 | True | 0.000057 | 74 | 3 | 3 | 1.000 | 0.041 | REAC:R-HSA-6802957 | rea | 8 | Oncogenic MAPK signaling | 2 | RAF1,KRAS,NF1 |
HP:0002967 | 1 | True | 0.000283 | 32 | 3 | 3 | 1.000 | 0.094 | HP:0002967 | hp | 23 | Cubitus valgus | 1 | RAF1,KRAS,NF1 |
GO:0046578 | 1 | True | 0.001030 | 219 | 3 | 3 | 1.000 | 0.014 | GO:0046578 | BP | 62 | regulation of Ras protein signal transduction | 2 | RAF1,KRAS,NF1 |
HP:0030063 | 1 | True | 0.001190 | 51 | 3 | 3 | 1.000 | 0.059 | HP:0030063 | hp | 43 | Neuroepithelial neoplasm | 3 | RAF1,KRAS,NF1 |
GO:0035020 | 1 | True | 0.001220 | 15 | 3 | 2 | 0.667 | 0.133 | GO:0035020 | BP | 62 | regulation of Rac protein signal transduc... | 3 | KRAS,NF1 |
HP:0030061 | 1 | True | 0.001410 | 54 | 3 | 3 | 1.000 | 0.056 | HP:0030061 | hp | 43 | Neuroectodermal neoplasm | 2 | RAF1,KRAS,NF1 |
HP:0100836 | 1 | True | 0.001410 | 54 | 3 | 3 | 1.000 | 0.056 | HP:0100836 | hp | 41 | Malignant neoplasm of the central nervous... | 3 | RAF1,KRAS,NF1 |
HP:0030060 | 1 | True | 0.001410 | 54 | 3 | 3 | 1.000 | 0.056 | HP:0030060 | hp | 43 | Nervous tissue neoplasm | 1 | RAF1,KRAS,NF1 |
KEGG:04014 | 1 | True | 0.001450 | 233 | 3 | 3 | 1.000 | 0.013 | KEGG:04014 | keg | 15 | Ras signaling pathway | 1 | RAF1,KRAS,NF1 |
REAC:R-HSA-5621575 | 1 | True | 0.001960 | 21 | 3 | 2 | 0.667 | 0.095 | REAC:R-HSA-5621575 | rea | 14 | CD209 (DC-SIGN) signaling | 1 | RAF1,KRAS |
REAC:R-HSA-5673001 | 1 | True | 0.001980 | 239 | 3 | 3 | 1.000 | 0.013 | REAC:R-HSA-5673001 | rea | 58 | RAF/MAP kinase cascade | 3 | RAF1,KRAS,NF1 |
REAC:R-HSA-5684996 | 1 | True | 0.002140 | 245 | 3 | 3 | 1.000 | 0.012 | REAC:R-HSA-5684996 | rea | 58 | MAPK1/MAPK3 signaling | 2 | RAF1,KRAS,NF1 |
HP:0010991 | 1 | True | 0.002160 | 62 | 3 | 3 | 1.000 | 0.048 | HP:0010991 | hp | 52 | Abnormality of the abdominal musculature | 1 | RAF1,KRAS,NF1 |
REAC:R-HSA-5673000 | 1 | True | 0.002800 | 25 | 3 | 2 | 0.667 | 0.080 | REAC:R-HSA-5673000 | rea | 58 | RAF activation | 4 | RAF1,KRAS |
KEGG:04010 | 1 | True | 0.002980 | 296 | 3 | 3 | 1.000 | 0.010 | KEGG:04010 | keg | 22 | MAPK signaling pathway | 1 | RAF1,KRAS,NF1 |