# Source code for bioservices.pathwaycommons

#
#  This file is part of bioservices software
#
#  Copyright (c) 2013-2014 - EBI-EMBL
#
#  File author(s):
#      Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: https://github.com/cokelaer/bioservices
#  documentation: http://packages.python.org/bioservices
#
##############################################################################
"""This module provides a class :class:`~PathwayCommons`

.. topic:: What is PathwayCommons ?

    :URL: http://www.pathwaycommons.org/about
    :REST:

    .. highlights::

        Pathway Commons is a convenient point of access to biological pathway
        information collected from public pathway databases, which you can
        search, visualize and download. All data is freely available, under the
        license terms of each contributing database.

        -- PathwayCommons home page, Nov 2013


Data is freely available, under the license terms of each contributing database.

"""
from bioservices.services import REST

__all__ = ["PathwayCommons"]


class PathwayCommons:
    """Interface to the `PathwayCommons <http://www.pathwaycommons.org/about>`_ service


    >>> from bioservices import *
    >>> pc2 = PathwayCommons(verbose=False)
    >>> res = pc2.get("http://identifiers.org/uniprot/Q06609")

    All requests are sent through the :attr:`services` attribute (a ``REST``
    instance created in the constructor). Query arguments are always handed
    over as a ``params`` dictionary rather than being pasted into the URL, so
    that the HTTP layer takes care of escaping them.
    """

    #: valid formats
    _valid_format = ["GSEA", "SBGN", "BIOPAX", "SIF", "TXT", "JSONLD"]
    #: valid graph traversal directions
    _valid_directions = ["BOTHSTREAM", "UPSTREAM", "DOWNSTREAM", "UNDIRECTED"]
    #: valid SIF edge types (binary relationships) usable as ``pattern``
    _valid_patterns = [
        "CONTROLS_STATE_CHANGE_OF",
        "CONTROLS_PHOSPHORYLATION_OF",
        "CONTROLS_TRANSPORT_OF",
        "CONTROLS_EXPRESSION_OF",
        "IN_COMPLEX_WITH",
        "INTERACTS_WITH",
        "CATALYSIS_PRECEDES",
        "NEIGHBOR_OF",
        "CONSUMPTION_CONTROLLED_BY",
        "CONTROLS_TRANSPORT_OF_CHEMICAL",
        "CONTROLS_PRODUCTION_OF",
        "CHEMICAL_AFFECTS",
        "REACTS_WITH",
        "USED_TO_PRODUCE",
    ]
    _url = "https://www.pathwaycommons.org"

    def __init__(self, verbose=True, cache=False):
        """.. rubric:: Constructor

        :param bool verbose: prints informative messages
        :param bool cache: forwarded as is to the underlying ``REST`` service

        """
        self._default_extension = "json"

        self.services = REST(name="PathwayCommons", url=PathwayCommons._url, verbose=verbose, cache=cache)

    # just a get/set to the default extension
    def _set_default_ext(self, ext):
        self.services.devtools.check_param_in_list(ext, ["json", "xml"])
        self._default_extension = ext

    def _get_default_ext(self):
        return self._default_extension

    default_extension = property(
        _get_default_ext,
        _set_default_ext,
        doc="set extension of the requests (default is json). Can be 'json' or 'xml'",
    )

    @staticmethod
    def _as_list(value, name):
        """Return *value* as a non-empty list.

        :param value: a single string or a list/tuple of items
        :param str name: name of the argument, only used in error messages
        :raises TypeError: if *value* is neither a string nor a list/tuple
        :raises ValueError: if *value* is an empty list/tuple
        """
        if isinstance(value, str):
            return [value]
        if not isinstance(value, (list, tuple)):
            raise TypeError("%s must be a list" % name)
        if not value:
            raise ValueError("%s must not be empty" % name)
        return list(value)

    def _sifgraph_request(self, endpoint, source, limit, pattern, target=None, **extra):
        """Validate the arguments shared by the ``get_sifgraph_*`` methods and
        send the request.

        :param str endpoint: path of the sifgraph resource to query
        :param source: one identifier or a list of identifiers; sent as a
            comma-separated string
        :param int limit: traversal depth, must be >= 1
        :param str pattern: optional entry of :attr:`_valid_patterns`
        :param target: optional identifier(s), handled like *source*
        :param extra: additional query parameters sent unchanged
        :return: the ``content`` attribute of the reply, or None if the
            reply has no such attribute
        :raises ValueError: if *limit* is lower than 1 or an identifier list
            is empty
        :raises TypeError: if *source* or *target* is not a string or a list
        """
        if pattern:
            self.services.devtools.check_param_in_list(pattern, self._valid_patterns)
        if limit < 1:
            raise ValueError("limit must be >= 1")

        params = {"source": ",".join(self._as_list(source, "source"))}
        if target is not None:
            params["target"] = ",".join(self._as_list(target, "target"))
        params["limit"] = limit
        params.update(extra)
        if pattern:
            params["pattern"] = pattern

        res = self.services.http_get(
            endpoint,
            params=params,
            headers=self.services.get_headers(content="text"),
        )
        # if no match, the service returns code 406 and "": the reply is then
        # not a response object, hence the tolerant attribute access
        return getattr(res, "content", None)

    def search(self, q, page=0, datasource=None, organism=None, type=None):
        """Text search in PathwayCommons using Lucene query syntax

        Some of the parameters are BioPAX properties, others are composite
        relationships.

        All index fields are (case-sensitive): comment, ecnumber,
        keyword, name, pathway, term, xrefdb, xrefid, dataSource, and organism.

        The pathway field maps to all participants of pathways that contain
        the keyword(s) in any of its text fields.

        Finally, keyword is a transitive aggregate field that includes all
        searchable keywords of that element and its child elements.

        All searches can also be filtered by data source and organism.

        It is also possible to restrict the domain class using the
        'type' parameter.

        This query can be used standalone or to retrieve starting points
        for graph searches.


        :param str q: requires a keyword, name, external identifier, or a
            Lucene query string.
        :param int page: (N>=0, default is 0), search result page number.
            A negative value is not sent; a warning is logged instead.
        :param str datasource: filter by data source (use names or URIs of
            pathway data sources or of any existing Provenance object). If
            multiple data source values are specified, a union of hits from
            specified sources is returned. datasource=[reactome,pid] returns
            hits associated with Reactome or PID.
        :param str organism: The organism can be specified either by
            official name, e.g. "homo sapiens" or by NCBI taxonomy id,
            e.g. "9606". Similar to data sources, if multiple organisms
            are declared a union of all hits from specified organisms
            is returned. For example organism=[9606, 10016] returns results
            for both human and mice.
        :param str type: BioPAX class filter. (e.g., 'pathway', 'proteinreference')
        :return: the reply of the service; when :attr:`default_extension` is
            'xml' it is wrapped into a ``bs4.BeautifulSoup`` object.


        .. doctest::

            >>> from bioservices import PathwayCommons
            >>> pc2 = PathwayCommons(verbose=False)
            >>> pc2.search("Q06609")
            >>> pc2.search("brca2", type="proteinreference",
                    organism="homo sapiens",  datasource="pid")
            >>> pc2.search("name:'col5a1'", type="proteinreference", organism=9606)
            >>> pc2.search("a*", page=3)

        Find the FGFR2 keyword::

            pc2.search("FGFR2")

        Find pathways by FGFR2 keyword in any index field::

            pc2.search("FGFR2", type="pathway")

        Finds control interactions that contain the word binding but not
        transcription in their indexed fields::

            pc2.search("binding NOT transcription", type="control")

        Find all interactions that directly or indirectly participate
        in a pathway that has a keyword match for "immune" (Note the star after
        immune)::

            pc2.search("pathway:immune*", type="conversion")


        Find all Reactome pathways::

            pc2.search("*", type="pathway", datasource="reactome")

        """
        extension = self.default_extension
        url = "pc2/search.%s" % extension

        # the query goes into params (not into the URL string) so that
        # characters such as '&', '#' or spaces are escaped by the HTTP layer
        params = {"q": q}
        if page >= 0:
            params["page"] = page
        else:
            self.services.logging.warning("page should be >=0")

        for key, value in (("datasource", datasource), ("type", type), ("organism", organism)):
            if value:
                params[key] = value

        res = self.services.http_get(url, frmt=extension, params=params)

        if extension == "xml":
            import bs4

            res = bs4.BeautifulSoup(res, "xml")

        return res

    def get(self, uri, frmt="BIOPAX"):
        """Retrieves full pathway information for a set of elements

        elements can be for example pathway, interaction or physical
        entity given the RDF IDs. Get commands only
        retrieve the BioPAX elements that are directly mapped to the ID.
        Use the :meth:`traverse` query to traverse BioPAX graph and
        obtain child/owner elements.

        :param str uri: valid/existing BioPAX element's URI (RDF ID; for
            utility classes that were "normalized", such as entity references
            and controlled vocabularies, it is usually a Identifiers.org URL.
            Multiple IDs can be provided using a list
            uri=['http://identifiers.org/uniprot/Q06609',
            'http://identifiers.org/uniprot/Q549Z0']
            See also about MIRIAM and Identifiers.org.
        :param str frmt: output format, one of :attr:`_valid_format`
            (default is BIOPAX)

        :return: a complete BioPAX representation for the record
            pointed to by the given URI is returned. Other output
            formats are produced by converting the BioPAX record on
            demand and can be specified by the optional frmt
            parameter. Please be advised that with some output formats
            it might return "no result found" error if the conversion is
            not applicable for the BioPAX result. For example,
            SIF output usually works if there are some
            interactions, complexes, or pathways in the retrieved set
            and not only physical entities.
        :raises TypeError: if *uri* is neither a string nor a list
        :raises ValueError: if *uri* is an empty list


        .. doctest::

            >>> from bioservices import PathwayCommons
            >>> pc2 = PathwayCommons(verbose=False)
            >>> res = pc2.get("col5a1")
            >>> res = pc2.get("http://identifiers.org/uniprot/Q06609")


        """
        self.services.devtools.check_param_in_list(frmt, self._valid_format)

        # a list value is passed as is so that each URI gets its own
        # "uri" query argument
        params = {"uri": self._as_list(uri, "uri")}

        # BIOPAX is what the request below asks for when no format is given
        if frmt != "BIOPAX":
            params["format"] = frmt

        # BIOPAX and SBGN replies are handled as XML, everything else as text
        if frmt.lower() in ["biopax", "sbgn"]:
            reply_format = "xml"
        else:
            reply_format = "txt"

        return self.services.http_get("pc2/get", frmt=reply_format, params=params)

    def top_pathways(self, query="*", datasource=None, organism=None):
        """This command returns all *top* pathways

        Pathways can be top or pathways that are neither
        'controlled' nor 'pathwayComponent' of another process.

        :param query: a keyword, name, external identifier or lucene query
            string like in 'search'. Default is "*"
        :param str datasource: filter by data source (same as search)
        :param str organism: organism filter. 9606 for human.

        :return: dictionary with information about top pathways. Check the
            "searchHit" key for information about "dataSource" for instance.
            When :attr:`default_extension` is 'xml', a ``bs4.BeautifulSoup``
            object is returned instead.


        .. doctest::

            >>> from bioservices import PathwayCommons
            >>> pc2 = PathwayCommons(verbose=False)
            >>> res = pc2.top_pathways()

        Equivalent request in a browser:
        https://www.pathwaycommons.org/pc2/top_pathways?q=TP53

        """
        extension = self.default_extension
        url = "pc2/top_pathways.json" if extension == "json" else "pc2/top_pathways"

        params = {}
        if datasource:
            params["datasource"] = datasource
        if organism:
            params["organism"] = organism
        params["q"] = query

        res = self.services.http_get(url, frmt=extension, params=params)

        if extension == "xml":
            import bs4

            res = bs4.BeautifulSoup(res, "xml")
        return res

    def graph(
        self,
        kind,
        source,
        target=None,
        direction=None,
        limit=1,
        frmt=None,
        datasource=None,
        organism=None,
    ):
        """Finds connections and neighborhoods of elements

        Connections can be for example the shortest path between two proteins
        or the neighborhood for a particular protein state or all states.

        Graph searches take detailed BioPAX semantics such as generics or
        nested complexes into account and traverse the graph accordingly.
        The starting points can be either physical entities or entity references.

        In the case of the latter the graph search starts from ALL
        the physical entities that belong to that particular entity references,
        i.e.  all of its states. Note that we integrate BioPAX data from
        multiple databases  based on our proteins and small molecules data
        warehouse and consistently normalize UnificationXref, EntityReference,
        Provenance, BioSource, and ControlledVocabulary objects when we are
        absolutely sure that two objects of the same type are equivalent. We,
        however, do not merge physical entities and reactions from different
        sources as matching and aligning pathways at that level is still an
        open research problem. As a result, graph searches can return
        several similar but disconnected sub-networks that correspond to
        the pathway data from different providers (though some physical
        entities often refer to the same small molecule or protein reference
        or controlled vocabulary).


        :param str kind: graph query
        :param str source:  source object's URI/ID. Multiple source URIs/IDs
            must be encoded as list of valid URI
            **source=['http://identifiers.org/uniprot/Q06609',
            'http://identifiers.org/uniprot/Q549Z0']**.
        :param str target: required for PATHSFROMTO graph query.  target
            URI/ID. Multiple target URIs must be encoded as list (see source
            parameter).
        :param str direction: graph search direction; when provided it must
            be one of the :attr:`_valid_directions` attribute.
        :param int limit: graph query search distance limit (default = 1).
        :param str frmt: output format. see :attr:`_valid_format`. The value
            is sent as is (it is not validated here).
        :param str datasource: datasource filter (same as for 'search').
        :param str organism: organism filter (same as for 'search').


        :return:  By default, graph queries return a complete BioPAX
            representation of the subnetwork matched by the algorithm.
            Other output formats are available as specified by the optional
            frmt parameter. Please be advised that some output format
            choices might cause "no result found" error if the conversion
            is not applicable for the BioPAX result (e.g., SIF output
            fails if there are no interactions, complexes, nor pathways
            in the retrieved set).

        .. doctest::

            >>> from bioservices import PathwayCommons
            >>> pc2 = PathwayCommons(verbose=False)
            >>> res = pc2.graph(source="http://identifiers.org/uniprot/P20908",
                    kind="neighborhood", frmt="SIF")



        """
        # mandatory arguments; they must survive until the request is sent
        params = {"source": source, "kind": kind, "limit": limit}

        if direction:
            self.services.devtools.check_param_in_list(direction, self._valid_directions)
            params["direction"] = direction

        optional = (
            ("target", target),
            ("format", frmt),
            ("datasource", datasource),
            ("organism", organism),
        )
        for key, value in optional:
            if value:
                params[key] = value

        return self.services.http_get("pc2/graph", frmt="txt", params=params)

    def traverse(self, uri, path):
        """Provides XPath-like access to the PC.


        The format of the path query is in the form::

            [InitialClass]/[property1]:[classRestriction(optional)]/[property2]...

        A "*" sign after the property instructs path accessor to transitively
        traverse that property. For example, the following path accessor will
        traverse through all physical entity components within a complex::

            "Complex/component*/entityReference/xref:UnificationXref"

        The following will list display names of all participants of
        interactions, which are components (pathwayComponent) of a pathway
        (note: pathwayOrder property, where same or other interactions can be
        reached, is not considered here)::

            "Pathway/pathwayComponent:Interaction/participant*/displayName"

        The optional parameter classRestriction allows to restrict/filter the
        returned property values to a certain subclass of the range of that
        property. In the first example above, this is used to get only the
        Unification Xrefs. Path accessors can use all the official BioPAX
        properties as well as additional derived classes and parameters in
        paxtools such as inverse parameters and interfaces that represent
        anonymous union classes in OWL. (See Paxtools documentation for more
        details).

        :param str uri: a biopax element URI - specified similar to the 'GET'
            command. multiple IDs are allowed as a list of strings.
        :param str path: a BioPAX property path in the form of
                property1[:type1]/property2[:type2]; see above, inverse
                properties, Paxtools,
                org.biopax.paxtools.controller.PathAccessor.

        .. seealso:: `properties
            <http://www.pathwaycommons.org/pc2/#biopax_properties>`_

        :return:  XML result that follows the Search Response XML Schema
            (TraverseResponse type; pagination is disabled: returns all values at
            once)
        :raises TypeError: if *uri* is neither a string nor a list
        :raises ValueError: if *uri* is an empty list

        ::


            from bioservices import PathwayCommons
            pc2 = PathwayCommons(verbose=False)
            res = pc2.traverse(uri=['http://identifiers.org/uniprot/P38398','http://identifiers.org/uniprot/Q06609'], path="ProteinReference/organism")
            res = pc2.traverse(uri="http://identifiers.org/uniprot/Q06609",
                path="ProteinReference/entityReferenceOf:Protein/name")
            res = pc2.traverse("http://identifiers.org/uniprot/P38398",
                path="ProteinReference/entityReferenceOf:Protein")
            res = pc2.traverse(uri=["http://identifiers.org/uniprot/P38398",
                "http://identifiers.org/taxonomy/9606"], path="Named/name")


        """
        # one "uri" query argument per element, followed by the path; the
        # query string is assembled by the HTTP layer so that it starts with
        # a single "?" and every value is escaped
        params = {"uri": self._as_list(uri, "uri"), "path": path}

        return self.services.http_get("pc2/traverse", frmt="json", params=params)

    def get_sifgraph_neighborhood(self, source, limit=1, direction="BOTHSTREAM", pattern=None):
        """finds the neighborhood sub-network in the Pathway Commons Simple Interaction
        Format (extended SIF) graph (see http://www.pathwaycommons.org/pc2/formats#sif)


        :param source: set of gene identifiers (HGNC symbol). Can be a list of
            identifiers or just one string (if only one identifier)
        :param int limit: Graph traversal depth. Limit > 1 value can result
            in very large data or error.
        :param str direction: Graph traversal direction, one of
            "BOTHSTREAM", "UPSTREAM", "DOWNSTREAM", "UNDIRECTED". Use
            UNDIRECTED if you want to see interacts-with relationships too.
        :param str pattern: Filter by binary relationship (SIF edge) type.
            One of the :attr:`_valid_patterns` attribute.

        :return: the graph in SIF format. The output must be stripped and
            returns one line per relation. In each line, items are separated by
            a tabulation. You can save the text with .sif extensions and it
            should be ready to use e.g. in cytoscape viewer. None is returned
            if the reply carries no content.

        ::

            res = pc.get_sifgraph_neighborhood('BRD4')

        """
        self.services.devtools.check_param_in_list(direction, self._valid_directions)
        return self._sifgraph_request("sifgraph/v1/neighborhood", source, limit, pattern, direction=direction)

    def get_sifgraph_common_stream(self, source, limit=1, direction="DOWNSTREAM", pattern=None):
        """finds the common stream for them; extracts a sub-network from the loaded
        Pathway Commons SIF model.

        :param source: set of gene identifiers (HGNC symbol). Can be a list of
            identifiers or just one string (if only one identifier)
        :param int limit: Graph traversal depth. Limit > 1 value can result
            in very large data or error.
        :param str direction: Graph traversal direction, one of
            "BOTHSTREAM", "UPSTREAM", "DOWNSTREAM", "UNDIRECTED". Use
            UNDIRECTED if you want to see interacts-with relationships too.
        :param str pattern: Filter by binary relationship (SIF edge) type.
            One of the :attr:`_valid_patterns` attribute.

        :return: the graph in SIF format. The output must be stripped and
            returns one line per relation. In each line, items are separated by
            a tabulation. You can save the text with .sif extensions and it
            should be ready to use e.g. in cytoscape viewer. None is returned
            if the reply carries no content (e.g. no match).

        ::

            res = pc.get_sifgraph_common_stream(['BRD4', 'MYC'])
        """
        self.services.devtools.check_param_in_list(direction, self._valid_directions)
        return self._sifgraph_request("sifgraph/v1/commonstream", source, limit, pattern, direction=direction)

    def get_sifgraph_pathsbetween(self, source, limit=1, directed=False, pattern=None):
        """finds the paths between them; extracts a sub-network from the Pathway Commons SIF graph.

        :param source: set of gene identifiers (HGNC symbol). Can be a list of
            identifiers or just one string (if only one identifier)
        :param int limit: Graph traversal depth. Limit > 1 value can result
            in very large data or error.
        :param bool directed: Directionality: 'true' is for DOWNSTREAM/UPSTREAM, 'false' - UNDIRECTED
        :param str pattern: Filter by binary relationship (SIF edge) type.
            One of the :attr:`_valid_patterns` attribute.

        :return: the graph in SIF format. The output must be stripped and
            returns one line per relation. In each line, items are separated by
            a tabulation. You can save the text with .sif extensions and it
            should be ready to use e.g. in cytoscape viewer. None is returned
            if the reply carries no content.
        """
        return self._sifgraph_request("sifgraph/v1/pathsbetween", source, limit, pattern, directed=directed)

    def get_sifgraph_pathsfromto(self, source, target, limit=1, pattern=None):
        """finds the paths from the sources to the targets; extracts a sub-network
        from the Pathway Commons SIF graph.

        :param source: set of gene identifiers (HGNC symbol). Can be a list of
            identifiers or just one string (if only one identifier)
        :param target: A target set of gene identifiers (same conventions as
            *source*).
        :param int limit: Graph traversal depth. Limit > 1 value can result
            in very large data or error.
        :param str pattern: Filter by binary relationship (SIF edge) type.
            One of the :attr:`_valid_patterns` attribute.

        :return: the graph in SIF format. The output must be stripped and
            returns one line per relation. In each line, items are separated by
            a tabulation. You can save the text with .sif extensions and it
            should be ready to use e.g. in cytoscape viewer. None is returned
            if the reply carries no content.
        """
        return self._sifgraph_request("sifgraph/v1/pathsfromto", source, limit, pattern, target=target)