# Source code for bioservices.pathwaycommons

#
#  This file is part of bioservices software
#
#  Copyright (c) 2013-2014 - EBI-EMBL
#
#  File author(s):
#      Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: https://github.com/cokelaer/bioservices
#  documentation: http://packages.python.org/bioservices
#
##############################################################################
"""This module provides a class :class:`~PathwayCommons`

.. topic:: What is PathwayCommons ?

    :URL: http://www.pathwaycommons.org/about
    :REST:

    .. highlights::

        Pathway Commons is a convenient point of access to biological pathway
        information collected from public pathway databases, which you can
        search, visualize and download. All data is freely available, under the
        license terms of each contributing database.

       -- PathwayCommons home page, Nov 2013


Data is freely available, under the license terms of each contributing database.

"""
from bioservices.services import REST, BioServicesError


__all__ = ["PathwayCommons"]


class PathwayCommons:
    """Interface to the `PathwayCommons <http://www.pathwaycommons.org/about>`_ service

    >>> from bioservices import *
    >>> pc2 = PathwayCommons(verbose=False)
    >>> res = pc2.get("http://identifiers.org/uniprot/Q06609")

    All requests are delegated to a :class:`REST` instance stored in the
    :attr:`services` attribute.
    """

    #: valid output formats (checked by :meth:`get`)
    _valid_format = ["GSEA", "SBGN", "BIOPAX", "SIF", "TXT", "JSONLD"]
    #: valid graph traversal directions
    _valid_directions = ["BOTHSTREAM", "UPSTREAM", "DOWNSTREAM", "UNDIRECTED"]
    #: valid binary relationship (SIF edge) types
    _valid_patterns = [
        "CONTROLS_STATE_CHANGE_OF",
        "CONTROLS_PHOSPHORYLATION_OF",
        "CONTROLS_TRANSPORT_OF",
        "CONTROLS_EXPRESSION_OF",
        "IN_COMPLEX_WITH",
        "INTERACTS_WITH",
        "CATALYSIS_PRECEDES",
        "NEIGHBOR_OF",
        "CONSUMPTION_CONTROLLED_BY",
        "CONTROLS_TRANSPORT_OF_CHEMICAL",
        "CONTROLS_PRODUCTION_OF",
        "CHEMICAL_AFFECTS",
        "REACTS_WITH",
        "USED_TO_PRODUCE",
    ]
    _url = "https://www.pathwaycommons.org"

    def __init__(self, verbose=True, cache=False):
        """.. rubric:: Constructor

        :param bool verbose: prints informative messages
        :param bool cache: forwarded as is to the underlying :class:`REST`
            service
        """
        self.easyXMLConversion = False
        self._default_extension = "json"
        self.services = REST(name="PathwayCommons", url=PathwayCommons._url, verbose=verbose, cache=cache)

    # just a get/set to the default extension
    def _set_default_ext(self, ext):
        self.services.devtools.check_param_in_list(ext, ["json", "xml"])
        self._default_extension = ext

    def _get_default_ext(self):
        return self._default_extension

    default_extension = property(
        _get_default_ext,
        _set_default_ext,
        doc="set extension of the requests (default is json). Can be 'json' or 'xml'",
    )

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _uri_query(endpoint, uri):
        """Return ``endpoint?uri=A&uri=B...`` for one URI or a list of URIs."""
        if isinstance(uri, str):
            uris = [uri]
        elif isinstance(uri, (list, tuple)):
            uris = list(uri)
        else:
            raise TypeError("uri must be a string or a list of strings")
        if not uris:
            raise ValueError("at least one uri must be provided")
        return endpoint + "?" + "&".join("uri=" + item for item in uris)

    @staticmethod
    def _join_identifiers(identifiers):
        """Return the identifiers (a string or a list of strings) comma-joined."""
        if isinstance(identifiers, str):
            identifiers = [identifiers]
        # kept as an assertion so that the exception type seen by existing
        # callers (AssertionError) does not change
        assert isinstance(identifiers, (list, tuple)), "expected a string or a list of strings"
        return ",".join(identifiers)

    def _sifgraph_params(self, source, limit, pattern, **extra):
        """Validate the arguments shared by the sifgraph queries and build
        the dictionary of request parameters."""
        if pattern:
            self.services.devtools.check_param_in_list(pattern, self._valid_patterns)
        assert limit >= 1, "limit must be >= 1"
        params = {"source": self._join_identifiers(source), "limit": limit}
        params.update(extra)
        if pattern:
            params["pattern"] = pattern
        return params

    def _sifgraph_get(self, endpoint, params):
        """Send a sifgraph request asking for a text answer."""
        return self.services.http_get(
            endpoint,
            params=params,
            headers=self.services.get_headers(content="text"),
        )

    # ------------------------------------------------------------------
    # PC2 queries
    # ------------------------------------------------------------------
    def search(self, q, page=0, datasource=None, organism=None, type=None):
        """Text search in PathwayCommons using Lucene query syntax

        Some of the parameters are BioPAX properties, others are composite
        relationships.

        All index fields are (case-sensitive): comment, ecnumber, keyword,
        name, pathway, term, xrefdb, xrefid, dataSource, and organism.

        The pathway field maps to all participants of pathways that contain
        the keyword(s) in any of its text fields.

        Finally, keyword is a transitive aggregate field that includes all
        searchable keywords of that element and its child elements.

        All searches can also be filtered by data source and organism.

        It is also possible to restrict the domain class using the
        'type' parameter.

        This query can be used standalone or to retrieve starting points
        for graph searches.

        :param str q: requires a keyword, name, external identifier, or a
            Lucene query string.
        :param int page: (N>=0, default is 0), search result page number.
            A negative value is not sent; a warning is logged instead.
        :param str datasource: filter by data source (use names or URIs
            of pathway data sources or of any existing Provenance object).
            If multiple data source values are specified, a union of hits
            from specified sources is returned. datasource=[reactome,pid]
            returns hits associated with Reactome or PID.
        :param str organism: The organism can be specified either by
            official name, e.g. "homo sapiens" or by NCBI taxonomy id,
            e.g. "9606". Similar to data sources, if multiple organisms
            are declared a union of all hits from specified organisms
            is returned. For example organism=[9606, 10016] returns results
            for both human and mice.
        :param str type: BioPAX class filter. (e.g., 'pathway',
            'proteinreference')
        :return: the answer of the service in the format set by
            :attr:`default_extension`; XML answers are passed through the
            ``easyXML`` method of the underlying service.

        .. doctest::

            >>> from bioservices import PathwayCommons
            >>> pc2 = PathwayCommons(verbose=False)
            >>> pc2.search("Q06609")
            >>> pc2.search("brca2", type="proteinreference",
                    organism="homo sapiens", datasource="pid")
            >>> pc2.search("name:'col5a1'", type="proteinreference", organism=9606)
            >>> pc2.search("a*", page=3)

        Find the FGFR2 keyword::

            pc2.search("FGFR2")

        Find pathways by FGFR2 keyword in any index field.::

            pc2.search("FGFR2", type="pathway")

        Finds control interactions that contain the word binding but not
        transcription in their indexed fields::

            pc2.search("binding NOT transcription", type="control")

        Find all interactions that directly or indirectly participate
        in a pathway that has a keyword match for "immune" (Note the star
        after immune)::

            pc2.search("pathway:immune*", type="conversion")

        Find all Reactome pathways::

            pc2.search("*", type="pathway", datasource="reactome")

        """
        extension = self.default_extension
        url = "pc2/search.%s" % extension

        # the query goes through params (like in top_pathways) so that
        # spaces, '&', '#', ... are URL-encoded instead of corrupting the URL
        params = {"q": q}
        if page >= 0:
            params["page"] = page
        else:
            self.services.logging.warning("page should be >=0")

        if datasource:
            params["datasource"] = datasource
        if type:
            params["type"] = type
        if organism:
            params["organism"] = organism

        res = self.services.http_get(url, frmt=extension, params=params)

        if extension == "xml":
            # easyXML is provided by the REST service, not by this class
            res = self.services.easyXML(res)

        return res

    def get(self, uri, frmt="BIOPAX"):
        """Retrieves full pathway information for a set of elements

        elements can be for example pathway, interaction or physical
        entity given the RDF IDs. Get commands only
        retrieve the BioPAX elements that are directly mapped to the ID.
        Use the :meth:`traverse` query to traverse BioPAX graph and
        obtain child/owner elements.

        :param str uri: valid/existing BioPAX element's URI (RDF ID; for
            utility classes that were "normalized", such as entity
            references and controlled vocabularies, it is usually an
            Identifiers.org URL. Multiple IDs can be provided using a list
            uri=['http://identifiers.org/uniprot/Q06609',
            'http://identifiers.org/uniprot/Q549Z0']
            See also about MIRIAM and Identifiers.org.
        :param str frmt: output format, one of :attr:`_valid_format`
            (default is BIOPAX)

        :return: a complete BioPAX representation for the record
            pointed to by the given URI is returned. Other output
            formats are produced by converting the BioPAX record on
            demand and can be specified by the optional frmt
            parameter. Please be advised that with some output formats
            it might return "no result found" error if the conversion is
            not applicable for the BioPAX result. For example,
            a SIF output usually works if there are some
            interactions, complexes, or pathways in the retrieved set
            and not only physical entities.
        :raises TypeError: if uri is neither a string nor a list
        :raises ValueError: if uri is an empty list

        .. doctest::

            >>> from bioservices import PathwayCommons
            >>> pc2 = PathwayCommons(verbose=False)
            >>> res = pc2.get("col5a1")
            >>> res = pc2.get("http://identifiers.org/uniprot/Q06609")

        """
        self.services.devtools.check_param_in_list(frmt, self._valid_format)

        # e.g. pc2/get?uri=http://identifiers.org/uniprot/Q06609
        url = self._uri_query("pc2/get", uri)

        if frmt != "BIOPAX":
            url += "&format=%s" % frmt

        # BioPAX and SBGN answers are XML documents; the others plain text
        if frmt.lower() in ["biopax", "sbgn"]:
            frmt = "xml"
        else:
            frmt = "txt"
        res = self.services.http_get(url, frmt=frmt)

        return res

    def top_pathways(self, query="*", datasource=None, organism=None):
        """This command returns all *top* pathways

        Pathways can be top or pathways that are neither
        'controlled' nor 'pathwayComponent' of another process.

        :param query: a keyword, name, external identifier or lucene query
            string like in 'search'. Default is "*"
        :param str datasource: filter by data source (same as search)
        :param str organism: organism filter. 9606 for human.

        :return: dictionary with information about top pathways. Check the
            "searchHit" key for information about "dataSource" for instance

        .. doctest::

            >>> from bioservices import PathwayCommons
            >>> pc2 = PathwayCommons(verbose=False)
            >>> res = pc2.top_pathways()

        Example of URL: https://www.pathwaycommons.org/pc2/top_pathways?q=TP53

        """
        if self.default_extension == "json":
            url = "pc2/top_pathways.json"
        else:
            url = "pc2/top_pathways"

        params = {}
        if datasource:
            params["datasource"] = datasource
        if organism:
            params["organism"] = organism
        params["q"] = query

        res = self.services.http_get(url, frmt=self.default_extension, params=params)

        if self.default_extension == "xml":
            # easyXML is provided by the REST service, not by this class
            res = self.services.easyXML(res)
        return res

    def graph(
        self,
        kind,
        source,
        target=None,
        direction=None,
        limit=1,
        frmt=None,
        datasource=None,
        organism=None,
    ):
        """Finds connections and neighborhoods of elements

        Connections can be for example the shortest path between two
        proteins or the neighborhood for a particular protein state or all
        states.

        Graph searches take detailed BioPAX semantics such as generics or
        nested complexes into account and traverse the graph accordingly.
        The starting points can be either physical entities or entity
        references.

        In the case of the latter the graph search starts from ALL
        the physical entities that belong to that particular entity
        references, i.e. all of its states. Note that we integrate
        BioPAX data from multiple databases based on our proteins and
        small molecules data warehouse and consistently normalize
        UnificationXref, EntityReference, Provenance, BioSource, and
        ControlledVocabulary objects when we are absolutely sure that
        two objects of the same type are equivalent. We, however, do
        not merge physical entities and reactions from different
        sources as matching and aligning pathways at that level is
        still an open research problem. As a result, graph searches
        can return several similar but disconnected sub-networks that
        correspond to the pathway data from different providers
        (though some physical entities often refer to the same small
        molecule or protein reference or controlled vocabulary).

        :param str kind: graph query
        :param str source: source object's URI/ID. Multiple source URIs/IDs
            must be encoded as list of valid URI
            **source=['http://identifiers.org/uniprot/Q06609',
            'http://identifiers.org/uniprot/Q549Z0']**.
        :param str target: required for PATHSFROMTO graph query.
            target URI/ID. Multiple target URIs must be encoded as list
            (see source parameter).
        :param str direction: graph search direction in [BOTHSTREAM,
            DOWNSTREAM, UPSTREAM]; see :attr:`_valid_directions` attribute.
            Only sent (and checked) when provided.
        :param int limit: graph query search distance limit (default = 1).
        :param str frmt: output format. see :attr:`_valid_format`
        :param str datasource: datasource filter (same as for 'search').
        :param str organism: organism filter (same as for 'search').

        :return: By default, graph queries return a complete BioPAX
            representation of the subnetwork matched by the algorithm.
            Other output formats are available as specified by the optional
            frmt parameter. Please be advised that some output format
            choices might cause "no result found" error if the conversion
            is not applicable for the BioPAX result (e.g., a SIF output
            fails if there are no interactions, complexes, nor pathways in
            the retrieved set).

        .. doctest::

            >>> from bioservices import PathwayCommons
            >>> pc2 = PathwayCommons(verbose=False)
            >>> res = pc2.graph(source="http://identifiers.org/uniprot/P20908",
                    kind="neighborhood", frmt="SIF")

        """
        if direction:
            self.services.devtools.check_param_in_list(direction, self._valid_directions)

        # mandatory parameters of the query
        params = {"source": source, "kind": kind, "limit": limit}

        # optional parameters are sent only when provided
        if target:
            params["target"] = target
        if direction:
            params["direction"] = direction
        if frmt:
            params["format"] = frmt
        if datasource:
            params["datasource"] = datasource
        if organism:
            params["organism"] = organism

        res = self.services.http_get("pc2/graph", frmt="txt", params=params)

        return res

    def traverse(self, uri, path):
        """Provides XPath-like access to the PC.

        The format of the path query is in the form::

            [InitialClass]/[property1]:[classRestriction(optional)]/[property2]...

        A "*" sign after the property instructs path accessor to transitively
        traverse that property. For example, the following path accessor will
        traverse through all physical entity components within a complex::

            "Complex/component*/entityReference/xref:UnificationXref"

        The following will list display names of all participants of
        interactions, which are components (pathwayComponent) of a pathway
        (note: pathwayOrder property, where same or other interactions can be
        reached, is not considered here)::

            "Pathway/pathwayComponent:Interaction/participant*/displayName"

        The optional parameter classRestriction allows to restrict/filter the
        returned property values to a certain subclass of the range of that
        property. In the first example above, this is used to get only the
        Unification Xrefs. Path accessors can use all the official BioPAX
        properties as well as additional derived classes and parameters in
        paxtools such as inverse parameters and interfaces that represent
        anonymous union classes in OWL. (See Paxtools documentation for more
        details).

        :param str uri: a biopax element URI - specified similar to the 'GET'
            command. multiple IDs are allowed as a list of strings.
        :param str path: a BioPAX property path in the form of
            property1[:type1]/property2[:type2]; see above, inverse
            properties, Paxtools,
            org.biopax.paxtools.controller.PathAccessor.
        :raises TypeError: if uri is neither a string nor a list
        :raises ValueError: if uri is an empty list

        .. seealso:: `properties
            <http://www.pathwaycommons.org/pc2/#biopax_properties>`_

        :return: result that follows the Search Response Schema
            (TraverseResponse type; pagination is disabled: returns all
            values at once); it is requested in JSON format.

        ::

            from bioservices import PathwayCommons
            pc2 = PathwayCommons(verbose=False)
            res = pc2.traverse(uri=['http://identifiers.org/uniprot/P38398',
                'http://identifiers.org/uniprot/Q06609'],
                path="ProteinReference/organism")
            res = pc2.traverse(uri="http://identifiers.org/uniprot/Q06609",
                path="ProteinReference/entityReferenceOf:Protein/name")
            res = pc2.traverse("http://identifiers.org/uniprot/P38398",
                path="ProteinReference/entityReferenceOf:Protein")
            res = pc2.traverse(uri=["http://identifiers.org/uniprot/P38398",
                "http://identifiers.org/taxonomy/9606"], path="Named/name")

        """
        # a single '?' must introduce the query string
        url = self._uri_query("pc2/traverse", uri)
        url += "&path=" + path

        res = self.services.http_get(url, frmt="json")
        return res

    # ------------------------------------------------------------------
    # SIF graph queries
    # ------------------------------------------------------------------
    def get_sifgraph_neighborhood(self, source, limit=1, direction="BOTHSTREAM", pattern=None):
        """finds the neighborhood sub-network in the Pathway Commons Simple
        Interaction Format (extended SIF) graph (see
        http://www.pathwaycommons.org/pc2/formats#sif)

        :param source: set of gene identifiers (HGNC symbol). Can be a list
            of identifiers or just one string (if only one identifier)
        :param int limit: Graph traversal depth. Limit > 1 value can result
            in very large data or error.
        :param str direction: Graph traversal direction, one of
            "BOTHSTREAM", "UPSTREAM", "DOWNSTREAM", "UNDIRECTED". Use
            UNDIRECTED if you want to see interacts-with relationships too.
        :param str pattern: Filter by binary relationship (SIF edge) type;
            see :attr:`_valid_patterns`.

        :return: the graph in SIF format. The output must be stripped and
            returns one line per relation. In each line, items are
            separated by a tabulation. You can save the text with .sif
            extensions and it should be ready to use e.g. in cytoscape
            viewer.

        ::

            res = pc.get_sifgraph_neighborhood('BRD4')

        """
        self.services.devtools.check_param_in_list(direction, self._valid_directions)
        params = self._sifgraph_params(source, limit, pattern, direction=direction)
        res = self._sifgraph_get("sifgraph/v1/neighborhood", params)
        return res.content

    def get_sifgraph_common_stream(self, source, limit=1, direction="DOWNSTREAM", pattern=None):
        """finds the common stream for them; extracts a sub-network from the
        loaded Pathway Commons SIF model.

        :param source: set of gene identifiers (HGNC symbol). Can be a list
            of identifiers or just one string (if only one identifier)
        :param int limit: Graph traversal depth. Limit > 1 value can result
            in very large data or error.
        :param str direction: Graph traversal direction, one of
            "BOTHSTREAM", "UPSTREAM", "DOWNSTREAM", "UNDIRECTED". Use
            UNDIRECTED if you want to see interacts-with relationships too.
        :param str pattern: Filter by binary relationship (SIF edge) type;
            see :attr:`_valid_patterns`.

        :return: the graph in SIF format, or None if the answer of the
            service has no content (e.g. no match). The output must be
            stripped and returns one line per relation. In each line, items
            are separated by a tabulation. You can save the text with .sif
            extensions and it should be ready to use e.g. in cytoscape
            viewer.

        ::

            res = pc.get_sifgraph_common_stream(['BRD4', 'MYC'])

        """
        self.services.devtools.check_param_in_list(direction, self._valid_directions)
        params = self._sifgraph_params(source, limit, pattern, direction=direction)
        res = self._sifgraph_get("sifgraph/v1/commonstream", params)
        try:
            return res.content
        except AttributeError:
            # if no match, returns code 406 and "": the answer is then not
            # a response object and has no 'content' attribute
            return None

    def get_sifgraph_pathsbetween(self, source, limit=1, directed=False, pattern=None):
        """finds the paths between them; extracts a sub-network from the
        Pathway Commons SIF graph.

        :param source: set of gene identifiers (HGNC symbol). Can be a list
            of identifiers or just one string (if only one identifier)
        :param int limit: Graph traversal depth. Limit > 1 value can result
            in very large data or error.
        :param bool directed: Directionality: 'true' is for
            DOWNSTREAM/UPSTREAM, 'false' - UNDIRECTED
        :param str pattern: Filter by binary relationship (SIF edge) type;
            see :attr:`_valid_patterns`.

        :return: the graph in SIF format. The output must be stripped and
            returns one line per relation. In each line, items are
            separated by a tabulation. You can save the text with .sif
            extensions and it should be ready to use e.g. in cytoscape
            viewer.

        """
        params = self._sifgraph_params(source, limit, pattern, directed=directed)
        res = self._sifgraph_get("sifgraph/v1/pathsbetween", params)
        return res.content

    def get_sifgraph_pathsfromto(self, source, target, limit=1, pattern=None):
        """finds the paths from the sources to the targets; extracts a
        sub-network from the Pathway Commons SIF graph.

        :param source: set of gene identifiers (HGNC symbol). Can be a list
            of identifiers or just one string (if only one identifier)
        :param target: A target set of gene identifiers (same types as
            source).
        :param int limit: Graph traversal depth. Limit > 1 value can result
            in very large data or error.
        :param str pattern: Filter by binary relationship (SIF edge) type;
            see :attr:`_valid_patterns`.

        :return: the graph in SIF format. The output must be stripped and
            returns one line per relation. In each line, items are
            separated by a tabulation. You can save the text with .sif
            extensions and it should be ready to use e.g. in cytoscape
            viewer.

        """
        params = self._sifgraph_params(source, limit, pattern, target=self._join_identifiers(target))
        res = self._sifgraph_get("sifgraph/v1/pathsfromto", params)
        return res.content