Source code for bioservices.kegg

#
#  This file is part of bioservices software
#
#  Copyright (c) 2013-2014 - EBI-EMBL
#
#  File author(s):
#      Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: https://github.com/cokelaer/bioservices
#  documentation: http://packages.python.org/bioservices
#
##############################################################################
"""This module provides a class :class:`~KEGG` to access to the
REST KEGG interface. There are additional methods and functionalities added by
**BioServices**.

.. note:: a previous imterface to the KEGG WSDL service was designed but the
    WSDL closed in  Dec 2012.

.. topic:: What is KEGG ?

    :URL: http://www.kegg.jp/
    :REST: http://www.kegg.jp/kegg/rest/keggapi.html
    :weblink: http://www.genome.jp/kegg/rest/weblink.html
    :dbentries: http://www.genome.jp/kegg/rest/dbentry.html

    .. highlights::

        "KEGG is a database resource for understanding high-level functions and
        utilities of the biological system, such as the cell, the organism and the
        ecosystem, from molecular-level information, especially large-scale molecular
        datasets generated by genome sequencing and other high-throughput experimental
        technologies (See Release notes for new and updated features). "

        -- KEGG home page, Jan 2013

.. _terminology:

Some terminology
--------------------

The following list is a simplified list of terminology taken from KEGG API
pages.


* organisms (**org**) are made of a three-letter (or four-letter) code (e.g.,
  **hsa** stands for Human Sapiens) used in KEGG (see  :attr:`~bioservices.kegg.KEGG.organismIds`).
* **db** is a database name. See :attr:`~bioservices.kegg.KEGG.databases`
  attribute and :ref:`kegg_database` section.
* **entry_id** is a unique identifier. It is a combination of the database name
  and the identifier of an entry joined by a colon sign (e.g. 'embl:J00231').

  Here are some examples of entry Ids:

    * **genes_id**: A KEGG organism and a gene name (e.g. 'eco:b0001').
    * **enzyme_id**: 'ec' and an enzyme code. (e.g. 'ec:1.1.1.1'). 
      See :attr:`~bioservices.kegg.KEGG.enzymeIds`.
    * **compound_id**: 'cpd' and a compound number (e.g. 'cpd:C00158').
      Some compounds also have 'glycan_id' and
      both IDs are accepted and converted internally.
      See :attr:`~bioservices.kegg.KEGG.compoundIds`.
    * **drug_id**: 'dr' and a drug number (e.g. 'dr:D00201'). See
      :attr:`~bioservices.kegg.KEGG.drugIds`.
    * **glycan_id**: 'gl' and a glycan number (e.g.
    * 'gl:G00050'). Some glycans also have 'compound_id' and both
      IDs are accepted and converted internally. see
      :attr:`~bioservices.kegg.KEGG.glycanIds` attribute.
    * **reaction_id**:  'rn' and a reaction number (e.g.
    * 'rn:R00959' is a reaction which catalyze cpd:C00103 into cpd:C00668).
      See :attr:`~bioservices.kegg.KEGG.reactionIds` attribute.
    * **pathway_id**: 'path' and a pathway number. Pathway numbers prefixed
      by 'map' specify the reference pathway and pathways prefixed by
      a KEGG organism specify pathways specific to the organism (e.g.
      'path:map00020', 'path:eco00020'). 
      See :attr:`~bioservices.kegg.KEGG.pathwayIds` attribute.
    * **motif_id**: a motif database names ('ps' for prosite, 'bl' for blocks,
      'pr' for prints, 'pd' for prodom, and 'pf' for pfam) and a motif entry
      name. (e.g. 'pf:DnaJ' means a Pfam  database entry 'DnaJ').
    * **ko_id**: identifier made of 'ko' and a ko number (e.g. 'ko:K02598').
      See :attr:`~bioservices.kegg.KEGG.koIds` attribute.


.. _kegg_database:

KEGG Databases Names and Abbreviations
-------------------------------------------

Here is a list of databases used in KEGG API with their name and abbreviation:

=============== =========== ==========
Database Name   Abbrev      kid
=============== =========== ==========
pathway         path        map number
brite           br          br number
module          md          M number
disease         ds          H number
drug            dr          D number
environ         ev          E number
orthology       ko          K number
genome          genome      T number
genomes         gn          T number
genes           -           -
ligand          ligand      -
compound        cpd         C number
glycan          gl          G number
reaction        rn          R number
rpair           rp          RP number
rclass          rc          RC number
enzyme          ec          -
=============== =========== ==========


.. _db_entries:

Database Entries
-------------------

Database entries can be written in on of the following ways::

    <dbentries> = <dbentry>1[+<dbentry>2...]
    <dbentry> = <db:entry> | <kid> | <org:gene>

Each database entry is identified by::

    db:entry

where "db" is the database name or its abbreviation shown above and
"entry" is the entry name or the accession number that is uniquely
assigned within the database. In reality "db" may be omitted, for
the entry name called the KEGG object identifier (kid) is unique
across KEGG.::

    kid = database-dependent prefix + five-digit number

In the KEGG GENES database the db:entry combination must be specified. This is
more specifically written as::

    org:gene

where "org" is the three- or four-letter KEGG organism code or
the T number genome identifier and "gene" is the gene identifier,
usually locus_tag or ncbi GeneID, or the primary gene name.


"""
import re

try:
    from functools import reduce  # python3 compat
except:
    pass
from bioservices.services import REST, BioServicesError
import webbrowser
import copy
from bioservices import logger

logger.name = __name__


from easydev.logging_tools import Logging

__all__ = ["KEGG", "KEGGParser"]


[docs]class KEGG(REST): """Interface to the `KEGG <http://www.genome.jp/kegg/pathway.html>`_ service This class provides an interface to the KEGG REST API. The weblink tools are partially accesible. All dbentries can be parsed into dictionaries using the :class:`KEGGParser` Here are some examples. In order to retrieve the entry of the gene identifier 7535 of the **hsa** organism, type:: from bioservices import KEGG s = KEGG() print(s.get("hsa:7535")) The output is the raw ouput sent by KEGG API. See :class:`KEGGParser` to parse this output. .. seealso:: The :ref:`db_entries` to know more about the db entries format. Another example here below shows how to print the list of pathways of the human organism:: print(s.list("pathway", organism="hsa")) Further post processing would allow you to retrieve the pathway Ids. However, we provide additional functions to the KEGG API so the previous code and post processing to extract the pathway Ids can be written as:: s.organism = "hsa" s.pathwayIds and similarly you can get all :meth:`databases` output and database Ids easily. For example, for the reaction database:: s.reaction # equivalent to s.list("reaction") s.reactionIds Other methods of interest are :meth:`conv`, :meth:`find`, :meth:`get`. .. seealso:: :ref:`kegg_database`, :ref:`db_entries`, :ref:`terminology`. """ #: valid databases _valid_DB_base = [ "module", "disease", "drug", "environ", "ko", "genome", "compound", "glycan", "reaction", "rpair", "rclass", "enzyme", ] _valid_DB = _valid_DB_base + [ "pathway", "brite", "genes", "ligand", "organism", "genomes", "orthology", ] _valid_databases_info = _valid_DB_base + [ "pathway", "brite", "genes", "ligand", "genomes", "kegg", ] _valid_databases_list = _valid_DB_base + ["pathway", "brite", "organism"] _valid_databases_find = _valid_DB_base + ["pathway", "genes", "ligand"] _valid_databases_link = _valid_DB_base + ["pathway", "brite"] _docIds = "\n\n.. seealso:: :meth:`list`\n" def __init__(self, verbose=False, cache=False): """.. rubric:: Constructor :param bool verbose: prints informative messages """ super(KEGG, self).__init__(name="KEGG", url="http://rest.kegg.jp", verbose=verbose, cache=cache) self.easyXMLConversion = False self._organism = None self._organisms = None self._organisms_tnumbers = None self._pathway = None self._glycan = None self._compound = None self._ko = None self._enzyme = None self._reaction = None self._drug = None self._brite = None self.keggParser = KEGGParser(verbose=verbose) # we could use this to retrieve all databases Ids but def __getattr__(self, req): """for x in s._valid_databases_list: print len(getattr(s, x))""" if req.endswith("Ids"): db = req[0:-3] res = self.list(db) if db in ["", ""]: Ids = [x.split()[1] for x in res.split("\n") if len(x)] else: Ids = [x.split()[0] for x in res.split("\n") if len(x)] return Ids elif req in self.databases: res = self.list(req) return res
[docs] def code2Tnumber(self, code): """Converts organism code to its T number .. doctest:: >>> from bioservices import KEGG >>> s = KEGG() >>> s.code2Tnumber("hsa") 'T01001' """ index = self.organismIds.index(code) return self.organismTnumbers[index]
[docs] def Tnumber2code(self, Tnumber): """Converts organism T number to its code .. doctest:: >>> from bioservices import KEGG >>> s = KEGG() >>> s.Tnumber2code("T01001") 'hsa' """ index = self.organismTnumbers.index(Tnumber) return self.organismIds[index]
[docs] def isOrganism(self, org): """Checks if org is a KEGG organism :param str org: :return: True if org is in the KEGG organism list (code or Tnumber) :: >>> from bioservices import KEGG >>> s = KEGG() >>> s.isOrganism("hsa") True """ if org in self.organismIds: return True if org in self.organismTnumbers: return True else: return False
def _checkDB(self, database=None, mode=None): self.logging.info("checking database %s (mode=%s)" % (database, mode)) isOrg = self.isOrganism(database) if mode == "info": if database not in KEGG._valid_databases_info and isOrg is False: self.logging.error("database or organism provided is not correct (mode=info)") raise elif mode == "list": if database not in KEGG._valid_databases_list and isOrg is False: self.logging.error("database provided is not correct (mode=list)") raise elif mode == "find": if database not in KEGG._valid_databases_find and isOrg is False: self.logging.error("database provided is not correct (mode=find)") raise elif mode == "link": if database not in KEGG._valid_databases_link and isOrg is False: self.logging.error("database provided is not correct (mode=link)") raise else: raise ValueError("mode can be only info, list, ")
[docs] def dbinfo(self, database="kegg"): """Displays the current statistics of a given database :param str database: can be one of: kegg (default), brite, module, disease, drug, environ, ko, genome, compound, glycan, reaction, rpair, rclass, enzyme, genomes, genes, ligand or any :attr:`organismIds`. :: from bioservices import KEGG s = KEGG() s.dbinfo("hsa") # human organism s.dbinfo("T01001") # same as above s.dbinfo("pathway") .. versionchanged:: 1.4.1 renamed info method into :meth:`dbinfo`, which clashes with Logging framework info() method. """ self._checkDB(database, mode="info") res = self.http_get("info/" + database, frmt="txt") return res
[docs] def list(self, query, organism=None): """Returns a list of entry identifiers and associated definition for a given database or a given set of database entries :param str query: can be one of pathway, brite, module, disease, drug, environ, ko, genome, compound, glycan, reaction, rpair, rclass, enzyme, organism **or** an organism from the :attr:`organismIds` attribute **or** a valid dbentry (see below). If a dbentry query is provided, organism should not be used! :param str organism: a valid organism identifier that can be provided. If so, database can be only "pathway" or "module". If not provided, the default value is chosen (:attr:`organism`) :return: A string with a structure that depends on the query Here is an example that shows how to extract the pathways IDs related to the hsa organism:: >>> s = KEGG() >>> res = s.list("pathway", organism="hsa") >>> pathways = [x.split()[0] for x in res.strip().split("\\n")] >>> len(pathways) # as of Dec 2012 261 Note, however, that there are convenient aliases to some of the databases. For instance, the pathway Ids can also be retrieved as a list from the :attr:`pathwayIds` attribute (after defining the :attr:`organism` attribute). .. note:: If you set the query to a valid organism, then the second argument rganism is irrelevant and ignored. .. note:: If the query is not a database or an organism, it is supposed to be a valid dbentries string and the maximum number of entries is 100. Other examples:: s.list("pathway") # returns the list of reference pathways s.list("pathway", "hsa") # returns the list of human pathways s.list("organism") # returns the list of KEGG organisms with taxonomic classification s.list("hsa") # returns the entire list of human genes s.list("T01001") # same as above s.list("hsa:10458+ece:Z5100") # returns the list of a human gene and an E.coli O157 gene s.list("cpd:C01290+gl:G00092")# returns the list of a compound entry and a glycan entry s.list("C01290+G00092") # same as above """ url = "list" if query: # can be something else than a database so we can not use checkDB # self._checkDB(database, mode="list") url += "/" + query if organism: if organism not in self.organismIds: self.logging.error("""Invalid organism provided (%s). See the organismIds attribute""" % organism) raise BioServicesError("Not a valid organism") if query not in ["pathway", "module"]: self.logging.error( """ If organism is set, then the first argument (database) must be either 'pathway' or 'module'. You provided %s""" % query ) raise url += "/" + organism res = self.http_get(url, "txt") return res
[docs] def find(self, database, query, option=None): """finds entries with matching query keywords or other query data in a given database :param str database: can be one of pathway, module, disease, drug, environ, ko, genome, compound, glycan, reaction, rpair, rclass, enzyme, genes, ligand or an organism code (see :attr:`organismIds` attributes) or T number (see :attr:`organismTnumbers` attribute). :param str query: See examples :param str option: If option provided, database can be only 'compound' or 'drug'. Option can be 'formula', 'exact_mass' or 'mol_weight' .. note:: Keyword search against brite is not supported. Use /list/brite to retrieve a short list. :: # search for pathways that contain Viral in the definition s.find("pathway", "Viral") # for keywords "shiga" and "toxin" s.find("genes", "shiga+toxin") # for keywords "shiga toxin" s.find("genes", ""shiga toxin") # for chemical formula "C7H10O5" s.find("compound", "C7H10O5", "formula") # for chemical formula containing "O5" and "C7" s.find("compound", "O5C7","formula") # for 174.045 =< exact mass < 174.055 s.find("compound", "174.05","exact_mass") # for 300 =< molecular weight =< 310 s.find("compound", "300-310","mol_weight") """ _valid_options = ["formula", "exact_mass", "mol_weight"] _valid_db_options = ["compound", "drug"] self._checkDB(database, mode="find") url = "find/" + database + "/" + query if option: if database not in _valid_db_options: raise ValueError( "invalid database. Since option was provided, database must be in %s" % _valid_db_options ) if option not in _valid_options: raise ValueError("invalid option. Must be in %s " % _valid_options) url += "/" + option res = self.http_get(url, frmt="txt") return res
[docs] def show_entry(self, entry): """Opens URL corresponding to a valid entry :: s.www_bget("path:hsa05416") """ url = "http://www.kegg.jp/dbget-bin/www_bget?" + entry self.logging.info(url) webbrowser.open(url)
[docs] def get(self, dbentries, option=None, parse=False): """Retrieves given database entries :param str dbentries: KEGG database entries involving the following database: pathway, brite, module, disease, drug, environ, ko, genome compound, glycan, reaction, rpair, rclass, enzyme **or** any organism using the KEGG organism code (see :attr:`organismIds` attributes) or T number (see :attr:`organismTnumbers` attribute). :param str option: one of: aaseq, ntseq, mol, kcf, image, kgml .. note:: you can add the option at the end of dbentries in which case the parameter option must not be used (see example) :: from bioservices import KEGG s = KEGG() # retrieves a compound entry and a glycan entry s.get("cpd:C01290+gl:G00092") # same as above s.get("C01290+G00092") # retrieves a human gene entry and an E.coli O157 gene entry s.get("hsa:10458+ece:Z5100") # retrieves amino acid sequences of a human gene and an E.coli O157 gene s.get("hsa:10458+ece:Z5100/aaseq") # retrieves the image file of a pathway map s.get("hsa05130/image") # same as above s.get("hsa05130", "image") Another example here below shows how to save the image of a given pathway:: res = s.get("hsa05130/image") # same as : res = s.get("hsa05130","image") f = open("test.png", "w") f.write(res) f.close() .. note:: The input is limited up to 10 entries (KEGG restriction). """ _valid_options = ["aaseq", "ntseq", "mol", "kcf", "image", "kgml"] _valid_db_options = ["compound", "drug"] # self._checkDB(database, mode="find") url = "get/" + dbentries if option: if option not in _valid_options: raise ValueError("invalid option. Must be in %s " % _valid_options) url += "/" + option res = self.http_get(url, frmt="txt") if parse is True: res = self.parse(res) return res
[docs] def conv(self, target, source): """convert KEGG identifiers to/from outside identifiers :param str target: the target database (e.g., a KEGG organism). :param str source: the source database (e.g., uniprot) or a valid dbentries; see below for details. :return: a dictionary with keys being the source and values being the target. Here are the rules to set the target and source parameters. If the second argument is not a **dbentries**, source and target parameters can be of two types: #. gene identifiers. If the target is a KEGG Id, then the source must be one of *ncbi-gi*, *ncbi-geneid* or *uniprot*. .. note:: source and target can be swapped. #. chemical substance identifiers. If the target is one of the following kegg database: drug, compound, glycan then the source must be one of *pubchem* or *chebi*. .. note:: again, source and target can be swapped If the second argument is a **dbentries**, it can be again of two types: #. gene identifiers. The database used can be one ncbi-gi, ncbi-geneid, uniprot or any KEGG organism #. chemical substance identifiers. The database used can be one of drug, compound, glycan, pubchem or chebi only. .. note:: if the second argument is a dbentries, target and dbentries cannot be swapped. :: # conversion from NCBI GeneID to KEGG ID for E. coli genes conv("eco","ncbi-geneid") # inverse of the above example conv("eco","ncbi-geneid") #conversion from KEGG ID to NCBI GI conv("ncbi-gi","hsa:10458+ece:Z5100") To make it clear by taking another example, you can either convert an entire database to another (e.g., from uniprot to KEGG Id all human gene IDs):: uniprot_ids, kegg_ids = s.conv("hsa", "uniprot") or a subset by providing a valid **dbentries**:: s.conv("hsa","up:Q9BV86+") .. warning:: dbentries are not check and are supposed to be correct. See :meth:`check_idbentries` to help you checking a dbentries. .. warning:: call to this function may be long. conv("hsa", "uniprot") takes a minute suprinsingly, conv("uniprot", "hsa") takes just a few seconds. .. versionchanged:: 1.1 the output is now a dictionary, not a list of tuples """ # The second argument may be a source_db or a dbentries so checking # second argument is kind of tricky because dbentries take lots of # different form. # for now, we only check the first argument. # gene identifiers isOrg = self.isOrganism(target) if isOrg is False and target not in [ "ncbi-gi", "ncbi-geneid", "uniprot", "pubchem", "chebi", "drug", "compound", "glycan", ]: raise ValueError( """ Invalid syntax. target must be a KEGG ID or one of the allowed database. See documentation og :meth:`conv` for details""" ) """if target in self.organismIds: if source not in ['ncbi-gi', 'ncbi-geneid', 'uniprot']: raise ValueError("Invalid source (must be ncbi-gi, ncbi-geneid or uniprot)") elif target in ['ncbi-gi', 'ncbi-geneid', 'uniprot']: if source not in self.organismIds: raise ValueError("Invalid source (must be a valid KEGG organism)") # for chenical substance identifiers elif target in ['drug', 'compound', 'glycan']: if source not in ['chebi', 'pubchem']: raise ValueError("Invalid source. Must be chebi or pubchem") elif target in ['chebi', 'pubchem']: if source not in ['drug', 'compound', 'glycan']: raise ValueError("Invalid source. Must be drug, compound or glycan") else: self.logging.info("arguments not checked") """ url = "conv/" + target + "/" + source res = self.http_get(url, frmt="txt") try: t = [x.split("\t")[0] for x in res.strip().split("\n")] s = [x.split("\t")[1] for x in res.strip().split("\n")] return dict([(x, y) for x, y in zip(t, s)]) except: return res
[docs] def entry(self, dbentries): """Retrieve entry There is a weblink service (see http://www.genome.jp/kegg/rest/weblink.html) Since it is equivalent to :meth:`get`, we do not implement it for now """ raise NotImplementedError("Use :meth:`get` instead")
[docs] def show_pathway(self, pathId, scale=None, dcolor="pink", keggid={}, show=True): """Show a given pathway inside a web browser :param str pathId: a valid pathway Id. See :meth:`pathwayIds` :param int scale: you can scale the image with a value between 0 and 100 :param str dcolor: set the default background color of nodes :param dict keggid: set color of entries contained in the pathway as key/value pairs; can also be a list, in which case all nodes have the same default color (red) .. note:: if scale is provided, dcolor and keggid are ignored. :: # show a pathway in the browser s.show_pathway("path:hsa05416", scale=50) # Same as above but also highlights some KEGG Ids (red for all) s.show_pathway("path:hsa05416", dcolor="white", keggid=['1525', '1604', '2534']) # You can refine the colors using a dictionary: s.show_pathway("path:hsa05416", dcolor="white", keggid={'1525':'yellow,red', '1604':'blue,green', '2534':"blue"}) .. anotherway to higlight: http://www.kegg.jp/pathway/map00010+K00873+C00022 """ if pathId.startswith("path:"): pathId = pathId.split(":")[1] if scale: scale = int(scale / 100.0 * 100) / 100.0 # just need 2 digits and a value in [0,1] url = "http://www.kegg.jp/kegg-bin/show_pathway?scale=" + str(scale) url += "&query=&map=" + pathId else: url = "http://www.kegg.jp/kegg-bin/show_pathway?" + pathId if dcolor: url += "/default%%3d%s/" % dcolor if isinstance(keggid, dict): if len(keggid.keys()) > 0: for k, v in keggid.items(): if "," in v: url += "/%s%%09%s/" % (k, v) else: url += "/%s%%09,%s/" % (k, v) elif isinstance(keggid, list): for k in keggid: url += "/%s%%09,%s/" % (k, "red") self.logging.info(url) if show: webbrowser.open(url) return url
[docs] def save_pathway(self, pathId, filename, scale=None, keggid={}, params={}): """Save KEGG pathway in PNG format :param pathId: a valid pathway identifier :param str filename: output PNG file :param params: valid kegg params expected """ import requests url = self.show_pathway(pathId, scale, keggid=keggid, show=False) html_page = requests.post(url, data=params) html_page = html_page.content.decode() links_to_png = [x for x in html_page.split() if "png" in x and x.startswith("src")] link_to_png = links_to_png[0].replace("src=", "").replace('"', "") r = requests.get("https://www.kegg.jp/{}".format(link_to_png)) if filename is None: filename = "{}.png".format(pathway_ID) with open(filename, "wb") as fout: fout.write(r.content)
[docs] def show_module(self, modId): """Show a given module inside a web browser :param str modId: a valid module Id. See :meth:`moduleIds` Validity of modId is not checked but if wrong the URL will not open a proper web page. """ if modId.startswith("md:"): modId = modId.split(":")[1] url = "http://www.kegg.jp/module/" + modId self.logging.info(url) res = webbrowser.open(url) return res
# wrapper of all databases to ease access to them (buffered) def _get_db(self): return KEGG._valid_DB databases = property(_get_db, doc="Returns list of valid KEGG databases.") def _get_database(self, dbname, mode=0): res = self.list(dbname) assert mode in [0, 1] return [x.split()[mode] for x in res.split("\n") if len(x)] def _get_organisms(self): if self._organisms is None: self._organisms = self._get_database("organism", 1) return self._organisms organismIds = property(_get_organisms, doc="Returns list of organism Ids") def _get_reactions(self): if self._reaction is None: self._reaction = self._get_database("reaction", 0) return self._reaction reactionIds = property(_get_reactions, doc="returns list of reaction Ids") def _get_enzyme(self): if self._enzyme is None: self._enzyme = self._get_database("enzyme", 0) return self._enzyme enzymeIds = property(_get_enzyme, doc="returns list of enzyme Ids" + _docIds) def _get_organisms_tnumbers(self): if self._organisms_tnumbers is None: self._organisms_tnumbers = self._get_database("organism", 0) return self._organisms_tnumbers organismTnumbers = property(_get_organisms_tnumbers, doc="returns list of organisms (T numbers)" + _docIds) def _get_glycans(self): if self._glycan is None: self._glycan = self._get_database("glycan", 0) return self._glycan glycanIds = property(_get_glycans, doc="Returns list of glycan Ids" + _docIds) def _get_brite(self): if self._brite is None: self._brite = self._get_database("brite", 0) return self._brite briteIds = property(_get_brite, doc="returns list of brite Ids." + _docIds) def _get_kos(self): if self._ko is None: self._ko = self._get_database("ko", 0) return self._ko koIds = property(_get_kos, doc="returns list of ko Ids" + _docIds) def _get_compound(self): if self._compound is None: self._compound = self._get_database("compound", 0) return self._compound compoundIds = property(_get_compound, doc="returns list of compound Ids" + _docIds) def _get_drug(self): if self._drug is None: self._drug = self._get_database("drug", 0) return self._drug drugIds = property(_get_drug, doc="returns list of drug Ids" + _docIds) # set the default organism used by pathways retrieval def _get_organism(self): return self._organism def _set_organism(self, organism): if organism in self.organismIds: self._organism = organism self._pathway = None self._module = None self._ko = None self._glycan = None self._compound = None self._enzyme = None self._drug = None self._reaction = None self._brite = None else: self.logging.error("Invalid organism. Check the list in :attr:`organismIds` attribute") raise organism = property(_get_organism, _set_organism, doc="returns the current default organism ") def _get_pathways(self): if self.organism is None: self.logging.warning("You must set the organism first (e.g., self.organism = 'hsa')") return if self._pathway is None: res = self.http_get("list/pathway/%s" % self.organism, frmt="txt") orgs = [x.split()[0] for x in res.split("\n") if len(x)] self._pathway = orgs[:] return self._pathway pathwayIds = property( _get_pathways, doc="""returns list of pathway Ids for the default organism. :attr:`organism` must be set. :: s = KEGG() s.organism = "hsa" s.pathwayIds """, ) def _get_modules(self): if self.organism is None: self.logging.warning("You must set the organism first (e.g., self.organism = 'hsa')") return if self._module is None: res = self.http_get("list/module/%s" % self.organism) orgs = [x.split()[0] for x in res.split("\n") if len(x)] self._module = orgs[:] return self._module moduleIds = property( _get_modules, doc="""returns list of module Ids for the default organism. :attr:`organism` must be set. :: s = KEGG() s.organism = "hsa" s.moduleIds """, )
[docs] def lookfor_organism(self, query): """Look for a specific organism :param str query: your search term. upper and lower cases are ignored :return: a list of definition that matches the query """ matches = [] definitions = [" ".join(x.split()) for x in self.list("organism").split("\n")] for i, item in enumerate(definitions): if query.lower() in item.lower(): matches.append(i) return [definitions[i] for i in matches]
[docs] def lookfor_pathway(self, query): """Look for a specific pathway :param str query: your search term. upper and lower cases are ignored :return: a list of definition that matches the query """ matches = [] definitions = [" ".join(x.split()) for x in self.list("pathway").split("\n")] for i, item in enumerate(definitions): if query.lower() in item.lower(): matches.append(i) return [definitions[i] for i in matches]
[docs] def get_pathway_by_gene(self, gene, organism): """Search for pathways that contain a specific gene :param str gene: a valid gene Id :param str organism: a valid organism (e.g., hsa) :return: list of pathway Ids that contain the gene :: >>> s.get_pathway_by_gene("7535", "hsa") ['path:hsa04064', 'path:hsa04650', 'path:hsa04660', 'path:hsa05340'] """ res = self.get(":".join([organism, gene])) dic = self.parse(res) if not isinstance(dic, int) and "PATHWAY" in dic.keys(): return dic["PATHWAY"] else: self.logging.info("No pathway found ?")
[docs] def parse_kgml_pathway(self, pathwayId, res=None): """Parse the pathway in KGML format and returns a dictionary (relations and entries) :param str pathwayId: a valid pathwayId e.g. hsa04660 :param str res: if you already have the output of the query get(pathwayId), you can provide it, otherwise it is queried. :return: a dictionary with relations and entries as keys. Values of relations is a list of relations, each relation being dictionary with entry1, entry2, link, value, name. The list os entries is a list of dictionary as well. Entry contains contains more details about the entry found in the relation. See example below for details. :: >>> res = s.parse_kgml_pathway("hsa04660") >>> set([x['name'] for x in res['relations']]) >>> res['relations'][-1] {'entry1': u'15', 'entry2': u'13', 'link': u'PPrel', 'name': u'phosphorylation', 'value': u'+p'} >>> set([x['link'] for x in res['relations']]) set([u'PPrel', u'PCrel']) >>> # get information about an entry : >>> res['entries'][4] .. seealso:: `KEGG API <http://www.kegg.jp/kegg/xml/docs/>`_ """ output = {"relations": [], "entries": []} # Fixing bug #24 assembla if res is None: res = self.easyXML(self.get(pathwayId, "kgml")) else: res = self.easyXML(res) # read and parse the entries entries = [x for x in res.findAll("entry")] for entry in entries: output["entries"].append( { "id": entry.get("id"), "name": entry.get("name"), "type": entry.get("type"), "link": entry.get("link"), "gene_names": entry.find("graphics").get("name"), } ) relations = [(x.get("entry1"), x.get("entry2"), x.get("type")) for x in res.findAll("relation")] subtypes = [x.findAll("subtype") for x in res.findAll("relation")] assert len(subtypes) == len(relations) for relation, subtype in zip(relations, subtypes): if len(subtype) == 0: # nothing to do with the species ??? TODO pass else: for this in subtype: value = this.get("value") name = this.get("name") output["relations"].append( { "entry1": relation[0], "entry2": relation[1], "link": relation[2], "value": value, "name": name, } ) # we need to map back to KEgg IDs... return output
[docs] def pathway2sif(self, pathwayId, uniprot=True): """Extract protein-protein interaction from KEGG pathway to a SIF format .. warning:: experimental Not tested on all pathway. should be move to another package such as cellnopt :param str pathwayId: a valid pathway Id :param bool uniprot: convert to uniprot Id or not (default is True) :return: a list of relations (A 1 B) for activation and (A -1 B) for inhibitions This is longish due to the conversion from KEGGIds to UniProt. This method can be useful to provide prior knowledge network to software such as CellNOpt (see http://www.cellnopt.org) """ res = self.parse_kgml_pathway(pathwayId) sif = [] for rel in res["relations"]: # types can be PPrel (protein-protein interaction only if rel["link"] != "PPrel": continue if rel["name"] == "activation": Id1 = rel["entry1"] Id2 = rel["entry2"] name1 = res["entries"][[x["id"] for x in res["entries"]].index(Id1)]["name"] name2 = res["entries"][[x["id"] for x in res["entries"]].index(Id2)]["name"] type1 = res["entries"][[x["id"] for x in res["entries"]].index(Id1)]["type"] type2 = res["entries"][[x["id"] for x in res["entries"]].index(Id2)]["type"] # print("names:", rel, name1, name2) # print(type1, type2) if type1 != "gene" or type2 != "gene": continue if uniprot: try: # FIXME sometimes, there are more than one name name1 = name1.split()[0] name2 = name2.split()[0] name1 = self.conv("uniprot", name1)[name1] name2 = self.conv("uniprot", name2)[name2] except Exception: self.logging.info(name1) self.logging.info(name2) raise Exception # print(name1, 1, name2) sif.append([name1, 1, name2]) elif rel["name"] == "inhibition": Id1 = rel["entry1"] Id2 = rel["entry2"] name1 = res["entries"][[x["id"] for x in res["entries"]].index(Id1)]["name"] name2 = res["entries"][[x["id"] for x in res["entries"]].index(Id2)]["name"] type1 = res["entries"][[x["id"] for x in res["entries"]].index(Id1)]["type"] type2 = res["entries"][[x["id"] for x in res["entries"]].index(Id2)]["type"] # print("names:", rel, name1, name2) # print(type1, type2) if type1 != "gene" or type2 != "gene": continue if uniprot: name1 = name1.split()[0] name2 = name2.split()[0] name1 = self.conv("uniprot", name1)[name1] name2 = self.conv("uniprot", name2)[name2] # print(name1, -1, name2) sif.append([name1, -1, name2]) else: pass # print("#", rel['entry1'], rel['name'], rel['entry2']) return sif
[docs] def parse(self, entry): """See :class:`KEGGParser` for details Parse entry returned by :meth:`get` :: k = KEGG() res = k.get("hsa04150") d = k.parse(res) """ parse = dict() try: parse = self.keggParser.parse(entry) except: self.logging.warning("Could not parse the entry correctly.") return parse
[docs]class KEGGParser(object): """This is an extension of the :class:`KEGG` class to ease parsing of dbentries This class provides a generic method :meth:`parse` that will read the output of a dbentry returned by :meth:`KEGG.get` and converts it into a dictionary ready to use. The :meth:`parse` method parses any entry. It can be a pathway, a gene, a compound... :: from bioservices import * s = KEGG() # Retrieve a KEGG entry res = s.get("hsa04150") # parse it d = s.parse(res) As a pedagogical example, you can then further process this dictionary. Here below, we convert the gene Ids found in the pathway into UniProt Ids:: # Get the KEGG Ids in the pathway kegg_geneIds = [x for x in d['GENE']] # Convert them db_up, db_kegg = s.conv("hsa", "uniprot") # Get the corresponding uniprot Ids indices = [db_kegg.index("hsa:%s" % x ) for x in kegg_geneIds] uniprot_geneIds = [db_up[x] for x in indices] However, you could also have done it simply as follows:: kegg_geneIds = [x for x in d['gene']] uprot_geneIds = [s.parse(s.get("hsa:"+str(e)))['DBLINKS']["UniProt:"] for e in d['GENE']] .. note:: The 2 outputs are slightly different. .. seealso:: http://www.kegg.jp/kegg/rest/dbentry.html """ def __init__(self, verbose=False): super(KEGGParser, self).__init__() self.logging = Logging("bioservices:keggparser", verbose)
[docs] def parse(self, res): """Parse to any outputs returned by :meth:`KEGG.get` :param str res: output of a :meth:`KEGG.get`. :return: a dictionary. Keys are those found in the KEGG entry (e.g., REACTION, ENTRY, EQUATION, ...). The format of each value is various. It could be a string, a list (of strings generally), a dictionary, a float depending on the key. Depdending on the type of the entry (e.g., module, pathway), the type of the value may also differ (e.g., REACTION can be either a list of reactions or a dictionary depending on the content) :: >>> # Parses a drug entry >>> res = s.get("dr:D00001") >>> # Parses a pathway entry >>> res = s.get("path:hsa10584") >>> # Parses a module entry >>> res = s.get("md:hsa_M00554") >>> # Parses a disease entry >>> res = s.get("ds:H00001") >>> # Parses a environ entry >>> res = s.get("ev:E00001") >>> # Parses Orthology entry >>> res = s.get("ko:K00001") >>> # Parses a Genome entry >>> res = s.get('genome:T00001') >>> # Parses a gene entry >>> res = s.get("hsa:1525") >>> # Parses a compound entry >>> s.get("cpd:C00001") >>> # Parses a glycan entry >>> res = s.get("gl:G00001") >>> # Parses a reaction entry >>> res = s.get("rn:R00001") >>> # Parses a rpair entry >>> res = s.get("rp:RP00001") >>> # Parses a rclass entry >>> res = s.get("rc:RC00001") >>> # Parses an enzyme entry >>> res = s.get('ec:1.1.1.1') >>> d = s.parse(res) """ parser = dict() dbentry = "?" # get() should return a large amount of text data if not res or not isinstance(res, str): raise ValueError("Unexpected input, unable to parse data of type %s" % type(res)) if res[:5] == "ENTRY": dbentry = res.split("\n")[0].split(None, 2)[2] else: raise ValueError("Unable to parse data, it does not comform to the expected KEGG format") try: parser = self._parse(res) except Exception as err: self.logging.warning("Could not parse the entry %s correctly" % dbentry) self.logging.warning(err) return parser
def _parse(self, res): keys = [x.split(" ")[0] for x in res.split("\n") if len(x) and x[0] != " " and x != "///"] # let us go line by to not forget anything and know which entries are # found in the RHS. We may have duplicated once as can be found in th # keys variable as well. entries = [] entry = "" start = True for line in res.split("\n"): if line == "///": entries.append(entry) elif len(line) == 0: pass elif line[0] != " ": if start == True: start = False else: entries.append(entry) entry = line[:] else: entry += "\n" + line[:] # we can now look at each entry and create a dictionary. # The dictionary will contain as key the name found in the LHS # e.g., REACTION and the value will be either the entry content # as a string or a list of strings if the key is not unique # e.g., for references. This could be a bit annoying since # for example References could appear only once if some cases. # This can be tested though by checking the type output = {} for entry in entries: name = entry.split("\n")[0].split()[0] if keys.count(name) == 1: output[name] = entry[:] else: if name in output.keys(): output[name].append(entry[:]) else: output[name] = [entry[:]] # remove name that are now the keys of the dictionary anyway # if the values is not a list for k, v in output.items(): if k in ["CHROMOSOME", "TAXONOMY"]: continue try: output[k] = output[k].strip().replace(k, "", 1).strip() except: # skip the lists pass # Now, let us do the real stuff. # This is tricky since format is not consistent with the names e,g # REACTIONS could be sometimes a list of names and sometimes list # of reactions with their description. self.raw_parsing = copy.deepcopy(output) for key, value in output.items(): # convert to a dict if key == "STATISTICS": data = [x.split(":", 1) for x in output[key].split("\n")] data = dict([(x[0].strip(), float(x[1].strip())) for x in data]) output[key] = data # strip only: expecting a single line (string) elif key in [ "ANNOTATION", "CATEGORY", "CLASS", "COMPOSITION", "CREATED", "DATA_SOURCE", "DEFINITION", "DESCRIPTION", "ENTRY", "EFFICACY", "EQUATION", "FORMULA", "KEYWORDS", "HISTORY", "MASS", "ORGANISM", "ORG_CODE", "POSITION", "RCLASS", "KO_PATHWAY", "SYMBOL", "STRUCTURE", "TYPE", "SYSNAME", "REL_PATHWAY", ]: # get rid of \n if "\n" in value: # happens in description path:hsa04915 value = value.replace("\n", " ") # nothing to do here except strip output[key] = value.strip() # print(key) # list : set of lines. Could be split by ; character but we use the # \n instead to be sure # COMMENT is sometimes on several lines elif key in [ "NAME", "REMARK", "ACTIVITY", "COMMENT", "ORIGINAL_DB", "SUBSTRATE", "PRODUCT", "ENV_FACTOR", "PATHOGEN", ]: output[key] = [x.strip() for x in value.split("\n")] # list: long string splitted into items and therefore converted to a list elif key in ["ENZYME", "REACTION", "RPAIR", "RELATEDPAIR", "ALL_REAC"]: # RPAIR/rn:R00005 should be a dict if "_" found # REACTION/md:hsa_M00554 should be a dict if '->' found if "->" in value or "_" in value: kp = {} for line in value.split("\n"): try: k, v = line.strip().split(None, 1) except: # self.logging.warning("empty line in %s %s" % (key, line)) k = line.strip() v = "" kp[k] = v output[key] = kp.copy() else: output[key] = [x.strip() for x in value.split()] # transform to dictionary by splitting line and then split on ";" elif key in "GENE": kp = {} for i, line in enumerate(value.split("\n")): try: k, v = line.strip().split(None, 1) except: # self.logging.warning("empty line in %s %s" % (key, line)) k = line.strip() v = "" kp[k] = v output[key] = kp.copy() elif key in [ "DRUG", "ORTHOLOGY", "COMPOUND", "RMODULE", "DISEASE", "PATHWAY_MAP", "STR_MAP", "OTHER_MAP", "PATHWAY", "MODULE", "GENES", ]: kp = {} for line in value.split("\n"): try: # empty orthology in rc:RC00004 k, v = line.strip().split(None, 1) except: # self.logging.warning("empty line in %s %s" % (key, line)) k = line.strip() v = "" if k.endswith(":"): k = k.strip().rstrip(":") kp[k] = v output[key] = kp.copy() elif key in ["NETWORK"]: kp = {"ELEMENT": {}} ELEMENT = False for line in value.split("\n"): try: # empty orthology in rc:RC00004 k, v = line.strip().split(None, 1) except: # self.logging.warning("empty line in %s %s" % (key, line)) k = line.strip() v = "" if k.endswith(":"): k = k.strip().rstrip(":") if k == "ELEMENT": ELEMENT = True k, v = v.split(None, 1) if ELEMENT is False: kp[k] = v else: kp["ELEMENT"][k] = v output[key] = kp.copy() # list of dictionaries elif key == "REFERENCE": # transform to a list since you may have several entries newvalue = [self._interpret_references(this) for this in self._tolist(value)] output[key] = newvalue # list of dictionaries elif key == "PLASMID": newvalue = [self._interpret_plasmid(this) for this in self._tolist(value)] output[key] = newvalue # list of dictionaries elif key == "CHROMOSOME": newvalue = [self._interpret_chromosome(this) for this in self._tolist(value)] output[key] = newvalue # list of dictionaries elif key == "TAXONOMY": newvalue = [self._interpret_taxonomy(this) for this in self._tolist(value)] output[key] = newvalue elif key == "SEQUENCE": newvalue = [self._interpret_sequence(this) for this in self._tolist(value)] output[key] = newvalue # dictionary, interpreted as follows # on each line, there is an identifier followed by : character # looks like there is just one line... elif key in ["DRUG_TARGET", "MOTIF", "PDB"]: # STRUCTURE PDB can be long and span over several lines. e.g., # hsa:1525 new = {} # somehow the import at the top is lost import re value = re.sub("\n {6,20}", " ", value) try: for line in value.split("\n"): thiskey, content = line.split(":", 1) new[thiskey] = content output[key] = new except ValueError: output[key] = value elif key in ["DBLINKS", "INTERACTION", "METABOLISM"]: # D01441 for metabolism # DBLINKS for C00624 should work out of the box new = {} import re value = re.sub("\n {12,12+1}", "\n", value) # Sometimes (e.g. INTERACTION in D00136) values are empty # see issue #85 if len(value): for line in value.split("\n"): thiskey, content = line.strip().split(":", 1) new[thiskey] = content.strip() output[key] = new else: output[key] = value # floats elif key in ["EXACT_MASS", "MOL_WEIGHT"]: output[key] = float(value) # get rid of the length elif key in ["AASEQ", "NTSEQ"]: output[key] = value.split("\n", 1)[1].replace("\n", "") output[key] = output[key].replace(" ", "") elif key.startswith("ENTRY"): newvalue = self._interpret_entry(value) output[key] = newvalue.strip() # extract list of strings from structure. These strings are not # interpreted elif key in ["ATOM", "BOND", "NODE", "EDGE", "ALIGN", "RDM"]: # starts with a number that defines number of entries. Let us # get rid of that number and then send a list output[key] = self._interpret_enumeration(output[key]) # not interpreted elif key in [ "BRACKET", "COMPONENT", "SOURCE", "BRITE", "TARGET", "CARCINOGEN", "MARKER", ]: # do not interpret to keep structure pass else: self.logging.warning( """Found keyword %s, which has not special parsing for now. please report this issue with the KEGG identifier (%s) into github.com/bioservices. Thanks T.C.""" % (key, output["ENTRY"]) ) return output def _interpret_enumeration(self, data): N = data.strip().split("\n")[0] # must be a number N = int(N) lines = data.strip().split("\n")[1:] lines = [line.strip() for line in lines] if len(lines) != N: self.logging.warning("number of lines not as expected in %s" % data) if N == 0: return [] else: return lines def _tolist(self, value): # transform to a list since you may have several entries if isinstance(value, list) is False: value = [value] return value def _interpret_entry(self, data): res = {} for this in data.split("\n"): if this.strip().startswith("ENTRY"): pass elif this.strip().startswith("COMPOUND"): res["COMPOUND"] = this.strip().split(None, 1)[1] elif this.strip().startswith("ATOM"): res["AUTHORS"] = this.strip().split(None, 1)[1] elif this.strip().startswith("TITLE"): res["TITLE"] = this.strip().split(None, 1)[1] return res def _interpret_sequence(self, data): fields = ["GENE", "ORGANISM", "TYPE"] current_field = "SEQUENCE" res = dict({current_field: ""}) for this in data.split("\n"): this = this.strip() for f in fields: if this.startswith(f): fields.remove(f) current_field = f this = this.split(None, 1)[1] res[current_field] = "" break res[current_field] += this + " " if res["SEQUENCE"] == "": res.pop("SEQUENCE") for k in res.keys(): res[k] = res[k].strip() return res # changed v1.4.18 issue #79 """for this in data.split("\n"): if this.strip().startswith("GENE"): res['GENE'] = this.strip().split(None,1)[1] gene = True elif this.strip().startswith('ORGANISM'): res['ORGANISM'] = this.strip().split(None,1)[1] gene = False elif this.strip().startswith('TYPE'): res['TYPE'] = this.strip().split(None, 1)[1] gene = False elif gene is True: res['GENE'] += this.strip() else: assert gene is False res['SEQUENCE'] = this.strip() return res """ """ [u' 0 Mtk 1 Mtd 2 Mad 3 Man 4 Mte 5 Mtk 6 Mtd 7 Man', u' GENE 0-2 mycAI [UP:Q83WF0]; 3 mycAII [UP:Q83WE9]; 4-5 mycAIII', u' [UP:Q83WE8]; 6 mycAIV [UP:Q83WE7]; 7 mycAV [UP:Q83WE6]', u' ORGANISM Micromonospora griseorubida', u' TYPE PK'] """ def _interpret_taxonomy(self, data): res = {} for this in data.split("\n"): if this.strip().startswith("TAXONOMY"): res["TAXONOMY"] = this.strip() elif this.strip().startswith("LINEAGE"): res["LINEAGE"] = this.strip().split(None, 1)[1] return res def _interpret_references(self, data): res = {} for this in data.split("\n"): fields = this.strip().split(None, 1) if len(fields) <= 1: continue if fields[0] in ("REFERENCE", "AUTHORS", "TITLE", "JOURNAL"): res[fields[0]] = fields[1] # if this.strip().startswith("REFERENCE"): # res['REFERENCE'] = this.strip().split(None,1)[1] # elif this.strip().startswith("JOURNAL"): # res['JOURNAL'] = this.strip().split(None,1)[1] # elif this.strip().startswith("AUTHORS"): # res['AUTHORS'] = this.strip().split(None,1)[1] # elif this.strip().startswith("TITLE"): # res['TITLE'] = this.strip().split(None,1)[1] return res def _interpret_plasmid(self, data): res = {} for this in data.split("\n"): if this.strip().startswith("PLASMID"): res["PLASMID"] = this.strip().split(None, 1)[1] elif this.strip().startswith("LENGTH"): res["LENGTH"] = this.strip().split(None, 1)[1] elif this.strip().startswith("SEQUENCE"): res["SEQUENCE"] = this.strip().split(None, 1)[1] return res def _interpret_chromosome(self, data): res = {} for this in data.split("\n"): if this.strip().startswith("CHROMOSOME"): try: res["CHROMOSOME"] = this.strip().split(None, 1)[1] except: # genome:T00012 has no name res["CHROMOSOME"] = this.strip() elif this.strip().startswith("LENGTH"): res["LENGTH"] = this.strip().split(None, 1)[1] elif this.strip().startswith("SEQUENCE"): res["SEQUENCE"] = this.strip().split(None, 1)[1] return res
class KEGGTools(KEGG): """Load all genes from the database. :: k = kegg.KEGGTools() k.load_genes("hsa") genes = k.scan_genes() """ def __init__(self, verbose=False, organism="hsa"): self.kegg = KEGG() self.parser = KEGGParser() self.kegg.logging.info("Initialisation. Please wait") self.load_genes(organism) def load_genes(self, organism): res = self.kegg.list(organism) self.genes = [x.split("\t")[0] for x in res.strip().split("\n")] return self.genes def scan_genes(self): from easydev import progress_bar pb = progress_bar(len(self.genes)) import time genes = {} for i, gene in enumerate(self.genes): genes[gene] = self.parser.parse(self.kegg.get(self.genes[i])) pb.animate(i, time.time() - pb.start) return genes def load_reactions(self, organism): reactions = self.kegg.list("reaction") self.reactions = [x.split()[0] for x in reactions.split("\n") if len(x)] return self.reactions def scan_reactions(self): from easydev import progress_bar pb = progress_bar(len(self.reactions)) import time reactions = {} for i, this in enumerate(self.reactions): reactions[this] = self.parser.parse(self.kegg.get(self.reactions[i])) pb.animate(i, time.time() - pb.start) return reactions