Source code for bioservices.panther

#
#  This file is part of bioservices software
#
#  Copyright (c) 2013-2014 - EBI-EMBL
#
#  File author(s):
#      Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: https://github.com/cokelaer/bioservices
#  documentation: http://packages.python.org/bioservices
#
##############################################################################
# $Id$
"""Interface to some part of the Panther web service

.. topic:: What is Panther ?

    :URL: http://www.panther.org
    :Citation:

    .. highlights::


        The PANTHER (Protein ANalysis THrough Evolutionary Relationships)
        Classification System was designed to classify proteins (and
        their genes) in order to facilitate high-throughput analysis.
        Proteins have been classified according to:

            * Family and subfamily: families are groups of evolutionarily related
              proteins; subfamilies are related proteins that also have the same function
            * Molecular function: the function of the protein by itself or with directly
              interacting proteins at a biochemical level, e.g. a protein kinase
            * Biological process: the function of the protein in the context of a larger
              network of proteins that interact to accomplish a process at the level of the
              cell or organism, e.g. mitosis.
            * Pathway: similar to biological process, but a pathway also explicitly
              specifies the relationships between the interacting molecules.

        -- From PantherDB (about) , Feb 2020

"""
from bioservices.services import REST
from bioservices import logger

logger.name = __name__


__all__ = ["Panther"]


[docs]class Panther:
    """Interface to `Panther <http://www.pantherdb.org/services/oai/pantherdb>`_ pages


    ::

        >>> from bioservics import Panther
        >>> p = Panther()
        >>> p.get_supported_genomes()
        >>> p.get_ortholog("zap70", 9606)


        >>> from bioservics import Panther
        >>> p = Panther()
        >>> taxon = [x[0]['taxon_id'] for x in p.get_supported_genomes() if "coli" in x['name'].lower()]
        >>> # you may also use our method called search_organism
        >>> taxon = p.get_taxon_id(pattern="coli")
        >>> res = p.get_mapping("abrB,ackA,acuI", taxon)

    The get_mapping returns for each gene ID the GO terms corresponding to each
    ID. Those go terms may belong to different categories (see
    meth:`get_annotation_datasets`):

    - MF for molecular function
    - BP for biological process
    - PC for Protein class
    - CC Cellular location
    - Pathway

    Note that results from the website application http://pantherdb.org/
    do not agree with the oupput of the get_mapping service... Try out the dgt
    gene from ecoli for example




    """

    _url = "http://www.pantherdb.org/services/oai/pantherdb"

    def __init__(self, verbose=True, cache=False):
        """**Constructor**

        :param verbose: set to False to prevent informative messages
        """
        # super(Panther, self).__init__(name="Panther", url=Panther._url,
        #       verbose=verbose, cache=cache)
        self.services = REST(
            name="Panther",
            url=Panther._url,
            verbose=verbose,
            cache=cache,
            url_defined_later=True,
        )

        self._allPathwaysURL = "http://www.pantherdb.org/pathway/pathwayList.jsp"

[docs]    def get_pathways(self):
        """Returns all pathways from pantherdb"""
        return self.services.http_get("supportedpantherpathways")

[docs]    def get_supported_genomes(self, type=None):
        """Returns list of supported organisms.

        :param type: can be chrLoc to restrict the search


        """
        if type is not None:
            params = {"type": type}
        else:
            params = {}
        res = self.services.http_get("supportedgenomes", params=params)
        res = [x for x in res["search"]["output"]["genomes"]["genome"]]
        return res

[docs]    def get_taxon_id(self, pattern=None):
        """return all taxons supported by the service

        If pattern is provided, we filter the name to keep those that contain
        the filter. If only one is found, we return the name itself, otherwise a
        list of candidates

        """
        res = self.get_supported_genomes()
        if pattern:
            taxon = [x["taxon_id"] for x in res if pattern.lower() in x["name"].lower()]
            if len(taxon) == 1:
                return taxon[0]
            else:
                return taxon
        else:
            taxon = [x["taxon_id"] for x in res]
            return taxon

[docs]    def get_mapping(self, gene_list, taxon):
        """Map identifiers

        Each identifier to be delimited by comma i.e. ',. Maximum of 1000 Identifiers
        can be any of the following: Ensemble gene identifier, Ensemble protein
        identifier, Ensemble transcript identifier, Entrez gene id, gene symbol, NCBI
        GI, HGNC Id, International protein index id, NCBI UniGene id, UniProt accession
        and UniProt id

        :param gene_list: see above
        :param taxon: one taxon ID. See supported
            :meth:`~bioservices.panther.Panther.get_supported_genomes`

        If an identifier is not found, information can be found in the
        unmapped_genes key while found identifiers are in the mapped_genes key.

        .. warning:: found and not found identifiers are dispatched into
            unmapped and mapped genes. If there are not found identifiers,
            the input gene list and the mapped genes list do not have the same
            length. The input names are not stored in the output.
            Developpers should be aware of that feature.

        """
        params = {"geneInputList": gene_list, "organism": taxon}
        res = self.services.http_post("geneinfo", params=params, frmt="json")

        if "mapped_genes" in res["search"]:
            mapped_genes = res["search"]["mapped_genes"]["gene"]
            # if only one identifier, retuns a dictionary.
            # if several identifiers, returns a list of dictionary.
            # We will be consistent and return a list
            if "accession" in mapped_genes:
                mapped_genes = [mapped_genes]
        else:
            mapped_genes = [{}]

        if "unmapped_list" in res["search"]:
            unmapped_genes = res["search"]["unmapped_list"]["unmapped"]
            if isinstance(unmapped_genes, list):
                pass
            else:
                unmapped_genes = [unmapped_genes]
        else:
            unmapped_genes = []

        logger.warning("Some identifiers were not found")
        return {"unmapped": unmapped_genes, "mapped": mapped_genes}

[docs]    def get_enrichment(
        self,
        gene_list,
        organism,
        annotation,
        enrichment_test="Fisher",
        correction="FDR",
        ref_gene_list=None,
    ):
        """Returns over represented genes

        Compares a test gene list to a reference gene list,
        and determines whether a particular class (e.g. molecular function,
        biological process, cellular component, PANTHER protein class, the
        PANTHER pathway or Reactome pathway) of genes is overrepresented
        or underrepresented.

        :param organism: a valid taxon ID
        :param enrichment_test: either **Fisher** or **Binomial** test
        :param correction: correction for multiple testing. Either **FDR**,
            **Bonferonni**, or **None**.
        :param annotation: one of the supported PANTHER annotation data types.
            See :meth:`~bioservices.panther.Panther.get_annotation_datasets` to retrieve a list of
            supported annotation data types
        :param ref_gene_list: if not specified, the system will use all the genes
            for the specified organism. Otherwise, a list delimited by
            comma. Maximum of 100000 Identifiers can be any of the
            following: Ensemble gene identifier, Ensemble protein
            identifier, Ensemble transcript identifier, Entrez gene id,
            gene symbol, NCBI GI, HGNC Id, International protein index id,
            NCBI UniGene id, UniProt accession andUniProt id.

        :return: a dictionary with the following keys. 'reference' contains the
            orgnaism, 'input_list' is the input gene list with unmapped genes.
            'result' contains the list of candidates.

        ::

            >>> from bioservices import Panther
            >>> p = Panther()
            >>> res = p.get_enrichment('zap70,mek1,erk', 9606, "GO:0008150")
            >>> For molecular function, use :
            >>> res = p.get_enrichment('zap70,mek1,erk', 9606,
                    "ANNOT_TYPE_ID_PANTHER_GO_SLIM_MF")

        """
        assert enrichment_test.lower() in ["fisher", "binomial"]
        if correction is None:
            correction = "none"

        assert correction.lower() in ["fdr", "bonferroni", "none"]

        # This is a bug in panther DB where they used bonferonni . should be
        # bonferroni...
        if correction.lower() == "bonferroni":
            correction = "bonferonni"
        assert annotation in [x["id"] for x in self.get_annotation_datasets()]

        params = {"enrichmentTestType": enrichment_test.upper()}
        params["organism"] = organism
        if gene_list:
            params["geneInputList"] = gene_list
        if ref_gene_list:
            params["refInputList"] = ref_gene_list
        params["annotDataSet"] = annotation
        params["correction"] = correction.upper()
        try:
            res = self.services.http_post("enrich/overrep", params=params, frmt="json")
            try:
                return res["results"]
            except:
                return res
        except:
            return res

[docs]    def get_annotation_datasets(self):
        """Retrieve the list of supported annotation data sets"""
        res = self.services.http_get("supportedannotdatasets")
        res = res["search"]["annotation_data_sets"]["annotation_data_type"]
        return res

[docs]    def get_ortholog(self, gene_list, organism, target_organism=None, ortholog_type="all"):
        """search for matching orthologs in target organisms.

        Searches for matching orthologs in the gene family that contains
        the search gene associated with the search terms. Returns
        ortholog genes in target organisms given a search organism,
        the search terms and a list of target organisms.

        :param gene_list:
        :param organism: a valid taxon ID
        :param target_organism: zero or more taxon IDs separated by ','. See
            :meth:`~bioservices.panther.Panther.get_supported_genomes`
        :param ortholog_type: optional parameter to specify ortholog type of target organism
        :return: a dictionary with "mapped" and "unmapped" keys, each of them
            being a list. For each unmapped gene, a dictionary with id and
            organism is is returned. For the mapped gene, a list of ortholog is
            returned.

        """
        assert ortholog_type in ["LDO", "all"]
        params = {
            "geneInputList": gene_list,
            "organism": organism,
            "targetOrganism": target_organism,
            "orthologType": ortholog_type,
        }
        if params["targetOrganism"] is None:
            del params["targetOrganism"]
        res = self.services.http_get("ortholog/matchortho", frmt="json", params=params)
        res = res["search"]["mapping"]
        mapped = res["mapped"]

        try:
            unmapped = res["unmapped_ids"]["unmapped"]
            # make sure we always have a list
            if isinstance(unmapped, dict):
                unmapped = [unmapped]
        except:
            unmapped = []
        res = {"unmapped": unmapped, "mapped": mapped}

        return res

[docs]    def get_homolog_position(self, gene, organism, position, ortholog_type="all"):
        """

        :param gene: Can be any of the following: Ensemble gene identifier,
            Ensemble protein identifier, Ensemble transcript identifier, Entrez gene id,
            gene symbol, NCBI GI, HGNC Id, International protein index id, NCBI UniGene id,
            UniProt accession andUniProt id
        :param organism: a valid taxon ID
        :param ortholog_type: optional parameter to specify ortholog type of target organism
        """
        if "," in gene:
            logger.warning("did not expect a comma. Please provide only one gene name")
        assert ortholog_type in ["LDO", "all"]
        assert position >= 1
        params = {
            "gene": gene,
            "organism": organism,
            "pos": position,
            "orthologType": ortholog_type,
        }
        res = self.services.http_get("ortholog/homologpos", params=params, frmt="json")
        res = res["search"]["mapping"]
        if "mapped" in res.keys():
            res = res["mapped"]
            return res
        elif "unmapped_ids" in res.keys():
            logger.warning("did not find any match for {}".format(gene))
            return res["unmapped_ids"]

[docs]    def get_supported_families(self, N=1000, progress=True):
        """Returns the list of supported PANTHER family IDs

        This services returns only 1000 items per request. This is defined by
        the index. For instance index set to 1 returns the first 1000 families.
        Index set to 2 returns families between index 1000 and 2000 and so on.
        As of 20 Feb 2020, there was about 15,000 families.

        This function simplifies your life by calling the service as many times
        as required. Therefore it returns all families in one go.

        """
        from easydev import Progress

        params = {"startIndex": 1}
        res = self.services.http_get("supportedpantherfamilies", params=params)
        results = res["search"]["panther_family_subfam_list"]["family"]
        if len(results) != N:
            msg = "looks like the services changed. Call this function with N={}"
            msg = msg.format(len(results))
            raise ValueError(msg)

        number_of_families = res["search"]["number_of_families"]
        pb = Progress(int(number_of_families / N))
        pb.animate(1)
        for i in range(1, int(number_of_families / N) + 1):
            params = {"startIndex": i * N + 1}
            res = self.services.http_get("supportedpantherfamilies", params=params)
            data = res["search"]["panther_family_subfam_list"]["family"]
            results.extend(data)
            if progress:
                pb.animate(i)
        return results

[docs]    def get_family_ortholog(self, family, taxon_list=None):
        """Search for matching orthologs in target organisms

        Also return the corresponding position in the target
        organism sequence. The system searches for matching
        orthologs in the gene family that contains the search
        gene associated with the search term.

        :param family: Family ID
        :param taxon_list: Zero or more taxon IDs separated by ','.
        """

        params = {"family": family}
        if taxon_list:
            params["taxonFltr"] = taxon_list
        res = self.services.http_get("familyortholog", params=params, frmt="json")
        return res["search"]["ortholog_list"]["ortholog"]

[docs]    def get_family_msa(self, family, taxon_list=None):
        """Returns MSA information for the specified family.

        :param family: family ID
        :param taxon_list: Zero or more taxon IDs separated by ','.

        """
        params = {"family": family}
        if taxon_list:
            params["taxonFltr"] = taxon_list
        res = self.services.http_get("familymsa", params=params, frmt="json")
        return res["search"]["MSA_list"]["sequence_info"]

[docs]    def get_tree_info(self, family, taxon_list=None):
        """Returns tree topology information and node attributes for the specified family.

        :param family: Family ID
        :param taxon_list: Zero or more taxon IDs separated by ','.
        """
        params = {"family": family}
        if taxon_list:
            params["taxonFltr"] = taxon_list
        res = self.services.http_get("treeinfo", params=params, frmt="json")
        return res["search"]  # ['tree_topology']['annotation_node']