#
# This file is part of bioservices software
#
# Copyright (c) 2013-2014 - EBI-EMBL
#
# File author(s):
# Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# http://www.gnu.org/licenses/gpl-3.0.html
#
# website: https://github.com/cokelaer/bioservices
# documentation: http://packages.python.org/bioservices
#
##############################################################################
# $Id$
"""Interface to some part of the Panther web service
.. topic:: What is Panther ?
:URL: http://www.panther.org
:Citation:
.. highlights::
The PANTHER (Protein ANalysis THrough Evolutionary Relationships)
Classification System was designed to classify proteins (and
their genes) in order to facilitate high-throughput analysis.
Proteins have been classified according to:
* Family and subfamily: families are groups of evolutionarily related
proteins; subfamilies are related proteins that also have the same function
* Molecular function: the function of the protein by itself or with directly
interacting proteins at a biochemical level, e.g. a protein kinase
* Biological process: the function of the protein in the context of a larger
network of proteins that interact to accomplish a process at the level of the
cell or organism, e.g. mitosis.
* Pathway: similar to biological process, but a pathway also explicitly
specifies the relationships between the interacting molecules.
-- From PantherDB (about) , Feb 2020
"""
from bioservices import logger
from bioservices.services import REST
# Rename the logger imported from bioservices after this module so that
# messages emitted below are labelled with the module name.
logger.name = __name__
# Names exported by ``from <this module> import *``.
__all__ = ["Panther"]
class Panther:
    """Interface to `Panther <http://www.pantherdb.org/services/oai/pantherdb>`_ pages

    ::

        >>> from bioservices import Panther
        >>> p = Panther()
        >>> p.get_supported_genomes()
        >>> p.get_ortholog("zap70", 9606)

        >>> from bioservices import Panther
        >>> p = Panther()
        >>> taxon = [x['taxon_id'] for x in p.get_supported_genomes() if "coli" in x['name'].lower()]
        >>> # you may also use our method called get_taxon_id
        >>> taxon = p.get_taxon_id(pattern="coli")
        >>> res = p.get_mapping("abrB,ackA,acuI", taxon)

    The get_mapping returns for each gene ID the GO terms corresponding to each
    ID. Those go terms may belong to different categories (see
    :meth:`get_annotation_datasets`):

    - MF for molecular function
    - BP for biological process
    - PC for Protein class
    - CC Cellular location
    - Pathway

    Note that results from the website application http://pantherdb.org/
    do not agree with the output of the get_mapping service... Try out the dgt
    gene from ecoli for example
    """

    _url = "http://www.pantherdb.org/services/oai/pantherdb"

    def __init__(self, verbose=True, cache=False):
        """**Constructor**

        :param bool verbose: set to False to prevent informative messages
        :param bool cache: set to True to enable HTTP caching
        """
        # The REST helper is held by composition (``self.services``) rather
        # than inherited; every request below goes through it.
        self.services = REST(
            name="Panther",
            url=Panther._url,
            verbose=verbose,
            cache=cache,
            url_defined_later=True,
        )
        self._allPathwaysURL = "http://www.pantherdb.org/pathway/pathwayList.jsp"

    @staticmethod
    def _as_list(item):
        """Return *item* unchanged if it is a list, otherwise wrap it in a list.

        Several lookups below receive a bare dictionary instead of a
        one-element list when there is a single hit; this keeps the return
        types uniform.
        """
        return item if isinstance(item, list) else [item]

    def get_pathways(self):
        """Returns all pathways from pantherdb"""
        return self.services.http_get("supportedpantherpathways")

    def get_supported_genomes(self, type=None):
        """Returns list of supported organisms.

        :param type: can be chrLoc to restrict the search
        :return: the content of the ``genome`` entry of the answer, as a list
        """
        # NOTE: ``type`` shadows the builtin but is part of the public
        # signature, so it is kept for backward compatibility.
        params = {} if type is None else {"type": type}
        res = self.services.http_get("supportedgenomes", params=params)
        return list(res["search"]["output"]["genomes"]["genome"])

    def get_taxon_id(self, pattern=None):
        """Return all taxon IDs supported by the service.

        If pattern is provided, we filter the organism names to keep those
        that contain the pattern (case-insensitive). If only one is found, we
        return the taxon ID itself, otherwise a list of candidate taxon IDs.

        :param str pattern: optional sub-string to look for in organism names
        :return: a single taxon ID or a list of taxon IDs
        """
        genomes = self.get_supported_genomes()
        if not pattern:
            return [x["taxon_id"] for x in genomes]

        needle = pattern.lower()
        taxon = [x["taxon_id"] for x in genomes if needle in x["name"].lower()]
        if len(taxon) == 1:
            return taxon[0]
        return taxon

    def get_mapping(self, gene_list, taxon):
        """Map identifiers

        :param str gene_list: comma-delimited gene identifiers (max 1000). Can be
            any of: Ensembl gene/protein/transcript ID, Entrez gene id, gene symbol,
            NCBI GI, HGNC Id, International protein index id, NCBI UniGene id,
            UniProt accession or UniProt id.
        :param taxon: one taxon ID. See
            :meth:`~bioservices.panther.Panther.get_supported_genomes`
        :return: a dictionary with two keys, ``mapped`` and ``unmapped``, each
            of them being a list. When nothing is mapped, ``mapped`` is ``[{}]``.

        If an identifier is not found, information can be found in the
        unmapped key while found identifiers are in the mapped key.

        .. warning:: found and not found identifiers are dispatched into
            unmapped and mapped genes. If there are not found identifiers,
            the input gene list and the mapped genes list do not have the same
            length. The input names are not stored in the output.
            Developers should be aware of that feature.
        """
        params = {"geneInputList": gene_list, "organism": taxon}
        res = self.services.http_post("geneinfo", params=params, frmt="json")
        search = res["search"]

        if "mapped_genes" in search:
            # if only one identifier, the service returns a dictionary.
            # if several identifiers, it returns a list of dictionaries.
            # We will be consistent and return a list
            mapped_genes = self._as_list(search["mapped_genes"]["gene"])
        else:
            mapped_genes = [{}]

        if "unmapped_list" in search:
            unmapped_genes = self._as_list(search["unmapped_list"]["unmapped"])
        else:
            unmapped_genes = []

        # Only warn when something was actually left unmapped.
        if unmapped_genes:
            logger.warning("Some identifiers were not found")
        return {"unmapped": unmapped_genes, "mapped": mapped_genes}

    def get_enrichment(
        self,
        gene_list,
        organism,
        annotation,
        enrichment_test="Fisher",
        correction="FDR",
        ref_gene_list=None,
    ):
        """Returns over represented genes

        Compares a test gene list to a reference gene list,
        and determines whether a particular class (e.g. molecular function,
        biological process, cellular component, PANTHER protein class, the
        PANTHER pathway or Reactome pathway) of genes is overrepresented
        or underrepresented.

        :param str gene_list: comma-delimited gene identifiers to test for enrichment
        :param int organism: a valid taxon ID
        :param enrichment_test: either **Fisher** or **Binomial** test
            (case-insensitive)
        :param correction: correction for multiple testing. Either **FDR**,
            **Bonferroni** (the spelling **Bonferonni** is accepted as well),
            or **None** (case-insensitive).
        :param annotation: one of the supported PANTHER annotation data types.
            See :meth:`~bioservices.panther.Panther.get_annotation_datasets` to retrieve a list of
            supported annotation data types
        :param ref_gene_list: if not specified, the system will use all the genes
            for the specified organism. Otherwise, a list delimited by
            comma. Maximum of 100000 Identifiers can be any of the
            following: Ensembl gene identifier, Ensembl protein
            identifier, Ensembl transcript identifier, Entrez gene id,
            gene symbol, NCBI GI, HGNC Id, International protein index id,
            NCBI UniGene id, UniProt accession and UniProt id.
        :return: the ``results`` entry of the answer when there is one,
            otherwise the answer as returned by the service. The results are
            a dictionary with the following keys. 'reference' contains the
            organism, 'input_list' is the input gene list with unmapped genes.
            'result' contains the list of candidates.
        :raises ValueError: if *enrichment_test*, *correction* or *annotation*
            is not a supported value

        ::

            >>> from bioservices import Panther
            >>> p = Panther()
            >>> res = p.get_enrichment('zap70,mek1,erk', 9606, "GO:0008150")
            >>> # For molecular function, use:
            >>> res = p.get_enrichment('zap70,mek1,erk', 9606,
            ...     "ANNOT_TYPE_ID_PANTHER_GO_SLIM_MF")
        """
        if enrichment_test.lower() not in ("fisher", "binomial"):
            raise ValueError("enrichment_test must be 'fisher' or 'binomial'")

        # The request below is sent with the spelling "bonferonni", so both
        # the correct spelling and that one map to the same request value.
        request_corrections = {
            "fdr": "FDR",
            "none": "NONE",
            "bonferroni": "BONFERONNI",
            "bonferonni": "BONFERONNI",
        }
        correction_key = "none" if correction is None else correction.lower()
        if correction_key not in request_corrections:
            raise ValueError("correction must be 'fdr', 'bonferroni', or 'none'")

        valid_annotations = [x["id"] for x in self.get_annotation_datasets()]
        if annotation not in valid_annotations:
            raise ValueError(f"annotation must be one of {valid_annotations}")

        params = {"enrichmentTestType": enrichment_test.upper()}
        params["organism"] = organism
        if gene_list:
            params["geneInputList"] = gene_list
        if ref_gene_list:
            params["refInputList"] = ref_gene_list
        params["annotDataSet"] = annotation
        params["correction"] = request_corrections[correction_key]

        # Errors raised by the request itself propagate unchanged; only the
        # extraction of the 'results' entry is best-effort.
        res = self.services.http_post("enrich/overrep", params=params, frmt="json")
        try:
            return res["results"]
        except (KeyError, TypeError):
            return res

    def get_annotation_datasets(self):
        """Retrieve the list of supported annotation data sets"""
        res = self.services.http_get("supportedannotdatasets")
        return res["search"]["annotation_data_sets"]["annotation_data_type"]

    def get_ortholog(self, gene_list, organism, target_organism=None, ortholog_type="all"):
        """search for matching orthologs in target organisms.

        Searches for matching orthologs in the gene family that contains
        the search gene associated with the search terms. Returns
        ortholog genes in target organisms given a search organism,
        the search terms and a list of target organisms.

        :param str gene_list: comma-delimited gene identifiers
        :param int organism: a valid taxon ID
        :param target_organism: zero or more taxon IDs separated by ','. See
            :meth:`~bioservices.panther.Panther.get_supported_genomes`
        :param ortholog_type: optional parameter to specify ortholog type of
            target organism (``"LDO"`` or ``"all"``)
        :return: a dictionary with "mapped" and "unmapped" keys. For each
            unmapped gene, a dictionary with id and organism is returned.
            For the mapped gene, a list of ortholog is returned. A missing
            entry in the answer is reported as an empty list.
        :raises ValueError: if *ortholog_type* is not ``"LDO"`` or ``"all"``
        """
        if ortholog_type not in ("LDO", "all"):
            raise ValueError("ortholog_type must be 'LDO' or 'all'")

        params = {
            "geneInputList": gene_list,
            "organism": organism,
            "orthologType": ortholog_type,
        }
        # targetOrganism is only sent when the caller provided one
        if target_organism is not None:
            params["targetOrganism"] = target_organism

        res = self.services.http_get("ortholog/matchortho", frmt="json", params=params)
        mapping = res["search"]["mapping"]

        mapped = mapping.get("mapped", [])
        try:
            # make sure we always have a list
            unmapped = self._as_list(mapping["unmapped_ids"]["unmapped"])
        except (KeyError, TypeError):
            unmapped = []
        return {"unmapped": unmapped, "mapped": mapped}

    def get_homolog_position(self, gene, organism, position, ortholog_type="all"):
        """Return the homolog at a given position in the family tree.

        :param str gene: a gene identifier — can be any of: Ensembl gene/protein/transcript ID,
            Entrez gene id, gene symbol, NCBI GI, HGNC Id, International protein index id,
            NCBI UniGene id, UniProt accession or UniProt id
        :param int organism: a valid taxon ID
        :param int position: 1-based position in the gene family tree
        :param str ortholog_type: ortholog type of target organism (``"LDO"`` or ``"all"``)
        :return: the ``mapped`` entry of the answer if present, else the
            ``unmapped_ids`` entry if present, else None
        :raises ValueError: if *ortholog_type* is invalid or *position* < 1
        """
        if "," in gene:
            logger.warning("did not expect a comma. Please provide only one gene name")
        if ortholog_type not in ("LDO", "all"):
            raise ValueError("ortholog_type must be 'LDO' or 'all'")
        if position < 1:
            raise ValueError("position must be >= 1")

        params = {
            "gene": gene,
            "organism": organism,
            "pos": position,
            "orthologType": ortholog_type,
        }
        res = self.services.http_get("ortholog/homologpos", params=params, frmt="json")
        mapping = res["search"]["mapping"]

        if "mapped" in mapping:
            return mapping["mapped"]
        if "unmapped_ids" in mapping:
            logger.warning("did not find any match for {}".format(gene))
            return mapping["unmapped_ids"]
        return None

    def get_supported_families(self, N=1000, progress=True):
        """Returns the list of supported PANTHER family IDs

        This services returns only 1000 items per request. This is defined by
        the index. For instance index set to 1 returns the first 1000 families.
        Index set to 2 returns families between index 1000 and 2000 and so on.
        As of 20 Feb 2020, there were about 15,000 families.

        This function simplifies your life by calling the service as many times
        as required. Therefore it returns all families in one go.

        :param int N: number of families expected per request
        :param bool progress: set to False to disable the progress bar
        :return: list of all families
        :raises ValueError: if the first request does not return the expected
            number of families (N, or all of them if there are fewer than N)
        """
        first = self.services.http_get("supportedpantherfamilies", params={"startIndex": 1})
        search = first["search"]
        results = self._as_list(search["panther_family_subfam_list"]["family"])
        number_of_families = int(search["number_of_families"])

        # A full page is expected unless the service holds fewer than N families.
        if len(results) != min(N, number_of_families):
            msg = "looks like the services changed. Call this function with N={}"
            raise ValueError(msg.format(len(results)))

        # Ceiling division: a partially filled last page still counts as a
        # page, while an exact multiple of N must not trigger an extra request.
        page_count = -(-number_of_families // N)

        pb = None
        if progress:
            # imported lazily so that easydev is only needed when a progress
            # bar is requested
            from easydev import Progress

            pb = Progress(page_count)
            pb.animate(1)

        for i in range(1, page_count):
            params = {"startIndex": i * N + 1}
            res = self.services.http_get("supportedpantherfamilies", params=params)
            data = res["search"]["panther_family_subfam_list"]["family"]
            # guard against a last page made of a single bare dictionary
            results.extend(self._as_list(data))
            if pb is not None:
                pb.animate(i + 1)
        return results

    def get_family_ortholog(self, family, taxon_list=None):
        """Search for matching orthologs in target organisms

        Also return the corresponding position in the target
        organism sequence. The system searches for matching
        orthologs in the gene family that contains the search
        gene associated with the search term.

        :param family: Family ID
        :param taxon_list: Zero or more taxon IDs separated by ','.
        :return: the ``ortholog`` entry of the answer
        """
        params = {"family": family}
        if taxon_list:
            params["taxonFltr"] = taxon_list
        res = self.services.http_get("familyortholog", params=params, frmt="json")
        return res["search"]["ortholog_list"]["ortholog"]

    def get_family_msa(self, family, taxon_list=None):
        """Returns MSA information for the specified family.

        :param family: family ID
        :param taxon_list: Zero or more taxon IDs separated by ','.
        :return: the ``sequence_info`` entry of the answer
        """
        params = {"family": family}
        if taxon_list:
            params["taxonFltr"] = taxon_list
        res = self.services.http_get("familymsa", params=params, frmt="json")
        return res["search"]["MSA_list"]["sequence_info"]

    def get_tree_info(self, family, taxon_list=None):
        """Returns tree topology information and node attributes for the specified family.

        :param family: Family ID
        :param taxon_list: Zero or more taxon IDs separated by ','.
        :return: the ``search`` entry of the answer
        """
        params = {"family": family}
        if taxon_list:
            params["taxonFltr"] = taxon_list
        res = self.services.http_get("treeinfo", params=params, frmt="json")
        return res["search"]  # ['tree_topology']['annotation_node']