#
# This file is part of bioservices software
#
# Copyright (c) 2013-2014 - EBI-EMBL
#
# File author(s):
# Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# http://www.gnu.org/licenses/gpl-3.0.html
#
# website: https://github.com/cokelaer/bioservices
# documentation: http://packages.python.org/bioservices
#
##############################################################################
"""This module provides a class :class:`~KEGG` to access to the
REST KEGG interface. There are additional methods and functionalities added by
**BioServices**.
.. note:: a previous imterface to the KEGG WSDL service was designed but the
WSDL closed in Dec 2012.
.. topic:: What is KEGG ?
:URL: http://www.kegg.jp/
:REST: http://www.kegg.jp/kegg/rest/keggapi.html
:weblink: http://www.genome.jp/kegg/rest/weblink.html
:dbentries: http://www.genome.jp/kegg/rest/dbentry.html
.. highlights::
"KEGG is a database resource for understanding high-level functions and
utilities of the biological system, such as the cell, the organism and the
ecosystem, from molecular-level information, especially large-scale molecular
datasets generated by genome sequencing and other high-throughput experimental
technologies (See Release notes for new and updated features). "
-- KEGG home page, Jan 2013
.. _terminology:
Some terminology
--------------------
The following list is a simplified list of terminology taken from KEGG API
pages.
* organisms (**org**) are made of a three-letter (or four-letter) code (e.g.,
**hsa** stands for Human Sapiens) used in KEGG (see :attr:`~bioservices.kegg.KEGG.organismIds`).
* **db** is a database name. See :attr:`~bioservices.kegg.KEGG.databases`
attribute and :ref:`kegg_database` section.
* **entry_id** is a unique identifier. It is a combination of the database name
and the identifier of an entry joined by a colon sign (e.g. 'embl:J00231').
Here are some examples of entry Ids:
* **genes_id**: A KEGG organism and a gene name (e.g. 'eco:b0001').
* **enzyme_id**: 'ec' and an enzyme code. (e.g. 'ec:1.1.1.1').
See :attr:`~bioservices.kegg.KEGG.enzymeIds`.
* **compound_id**: 'cpd' and a compound number (e.g. 'cpd:C00158').
Some compounds also have 'glycan_id' and
both IDs are accepted and converted internally.
See :attr:`~bioservices.kegg.KEGG.compoundIds`.
* **drug_id**: 'dr' and a drug number (e.g. 'dr:D00201'). See
:attr:`~bioservices.kegg.KEGG.drugIds`.
* **glycan_id**: 'gl' and a glycan number (e.g.
* 'gl:G00050'). Some glycans also have 'compound_id' and both
IDs are accepted and converted internally. see
:attr:`~bioservices.kegg.KEGG.glycanIds` attribute.
* **reaction_id**: 'rn' and a reaction number (e.g.
* 'rn:R00959' is a reaction which catalyze cpd:C00103 into cpd:C00668).
See :attr:`~bioservices.kegg.KEGG.reactionIds` attribute.
* **pathway_id**: 'path' and a pathway number. Pathway numbers prefixed
by 'map' specify the reference pathway and pathways prefixed by
a KEGG organism specify pathways specific to the organism (e.g.
'path:map00020', 'path:eco00020').
See :attr:`~bioservices.kegg.KEGG.pathwayIds` attribute.
* **motif_id**: a motif database names ('ps' for prosite, 'bl' for blocks,
'pr' for prints, 'pd' for prodom, and 'pf' for pfam) and a motif entry
name. (e.g. 'pf:DnaJ' means a Pfam database entry 'DnaJ').
* **ko_id**: identifier made of 'ko' and a ko number (e.g. 'ko:K02598').
See :attr:`~bioservices.kegg.KEGG.koIds` attribute.
.. _kegg_database:
KEGG Databases Names and Abbreviations
-------------------------------------------
Here is a list of databases used in KEGG API with their name and abbreviation:
=============== =========== ==========
Database Name Abbrev kid
=============== =========== ==========
pathway path map number
brite br br number
module md M number
disease ds H number
drug dr D number
environ ev E number
orthology ko K number
genome genome T number
genomes gn T number
genes - -
ligand ligand -
compound cpd C number
glycan gl G number
reaction rn R number
rpair rp RP number
rclass rc RC number
enzyme ec -
=============== =========== ==========
.. _db_entries:
Database Entries
-------------------
Database entries can be written in on of the following ways::
<dbentries> = <dbentry>1[+<dbentry>2...]
<dbentry> = <db:entry> | <kid> | <org:gene>
Each database entry is identified by::
db:entry
where "db" is the database name or its abbreviation shown above and
"entry" is the entry name or the accession number that is uniquely
assigned within the database. In reality "db" may be omitted, for
the entry name called the KEGG object identifier (kid) is unique
across KEGG.::
kid = database-dependent prefix + five-digit number
In the KEGG GENES database the db:entry combination must be specified. This is
more specifically written as::
org:gene
where "org" is the three- or four-letter KEGG organism code or
the T number genome identifier and "gene" is the gene identifier,
usually locus_tag or ncbi GeneID, or the primary gene name.
"""
import re
try:
from functools import reduce # python3 compat
except:
pass
from bioservices.services import REST, BioServicesError
import webbrowser
import copy
from bioservices import logger
logger.name = __name__
from easydev.logging_tools import Logging
__all__ = ["KEGG", "KEGGParser"]
[docs]class KEGG(REST):
"""Interface to the `KEGG <http://www.genome.jp/kegg/pathway.html>`_ service
This class provides an interface to the KEGG REST API. The weblink tools
are partially accesible. All dbentries can be parsed into dictionaries using
the :class:`KEGGParser`
Here are some examples. In order to retrieve the entry of the
gene identifier 7535 of the **hsa** organism, type::
from bioservices import KEGG
s = KEGG()
print(s.get("hsa:7535"))
The output is the raw ouput sent by KEGG API. See :class:`KEGGParser` to
parse this output.
.. seealso:: The :ref:`db_entries` to know more about the db entries format.
Another example here below shows how to print the list of pathways of
the human organism::
print(s.list("pathway", organism="hsa"))
Further post processing would allow you to retrieve the pathway Ids. However,
we provide additional functions to the KEGG API so the previous code and post
processing to extract the pathway Ids can be written as::
s.organism = "hsa"
s.pathwayIds
and similarly you can get all :meth:`databases` output and database Ids easily.
For example, for the reaction database::
s.reaction # equivalent to s.list("reaction")
s.reactionIds
Other methods of interest are :meth:`conv`, :meth:`find`, :meth:`get`.
.. seealso:: :ref:`kegg_database`, :ref:`db_entries`, :ref:`terminology`.
"""
#: valid databases
_valid_DB_base = [
"module",
"disease",
"drug",
"environ",
"ko",
"genome",
"compound",
"glycan",
"reaction",
"rpair",
"rclass",
"enzyme",
]
_valid_DB = _valid_DB_base + [
"pathway",
"brite",
"genes",
"ligand",
"organism",
"genomes",
"orthology",
]
_valid_databases_info = _valid_DB_base + [
"pathway",
"brite",
"genes",
"ligand",
"genomes",
"kegg",
]
_valid_databases_list = _valid_DB_base + ["pathway", "brite", "organism"]
_valid_databases_find = _valid_DB_base + ["pathway", "genes", "ligand"]
_valid_databases_link = _valid_DB_base + ["pathway", "brite"]
_docIds = "\n\n.. seealso:: :meth:`list`\n"
def __init__(self, verbose=False, cache=False):
""".. rubric:: Constructor
:param bool verbose: prints informative messages
"""
super(KEGG, self).__init__(name="KEGG", url="http://rest.kegg.jp", verbose=verbose, cache=cache)
self.easyXMLConversion = False
self._organism = None
self._organisms = None
self._organisms_tnumbers = None
self._pathway = None
self._glycan = None
self._compound = None
self._ko = None
self._enzyme = None
self._reaction = None
self._drug = None
self._brite = None
self.keggParser = KEGGParser(verbose=verbose)
# we could use this to retrieve all databases Ids but
def __getattr__(self, req):
"""for x in s._valid_databases_list: print len(getattr(s, x))"""
if req.endswith("Ids"):
db = req[0:-3]
res = self.list(db)
if db in ["", ""]:
Ids = [x.split()[1] for x in res.split("\n") if len(x)]
else:
Ids = [x.split()[0] for x in res.split("\n") if len(x)]
return Ids
elif req in self.databases:
res = self.list(req)
return res
[docs] def code2Tnumber(self, code):
"""Converts organism code to its T number
.. doctest::
>>> from bioservices import KEGG
>>> s = KEGG()
>>> s.code2Tnumber("hsa")
'T01001'
"""
index = self.organismIds.index(code)
return self.organismTnumbers[index]
[docs] def Tnumber2code(self, Tnumber):
"""Converts organism T number to its code
.. doctest::
>>> from bioservices import KEGG
>>> s = KEGG()
>>> s.Tnumber2code("T01001")
'hsa'
"""
index = self.organismTnumbers.index(Tnumber)
return self.organismIds[index]
[docs] def isOrganism(self, org):
"""Checks if org is a KEGG organism
:param str org:
:return: True if org is in the KEGG organism list (code or Tnumber)
::
>>> from bioservices import KEGG
>>> s = KEGG()
>>> s.isOrganism("hsa")
True
"""
if org in self.organismIds:
return True
if org in self.organismTnumbers:
return True
else:
return False
def _checkDB(self, database=None, mode=None):
self.logging.info("checking database %s (mode=%s)" % (database, mode))
isOrg = self.isOrganism(database)
if mode == "info":
if database not in KEGG._valid_databases_info and isOrg is False:
self.logging.error("database or organism provided is not correct (mode=info)")
raise
elif mode == "list":
if database not in KEGG._valid_databases_list and isOrg is False:
self.logging.error("database provided is not correct (mode=list)")
raise
elif mode == "find":
if database not in KEGG._valid_databases_find and isOrg is False:
self.logging.error("database provided is not correct (mode=find)")
raise
elif mode == "link":
if database not in KEGG._valid_databases_link and isOrg is False:
self.logging.error("database provided is not correct (mode=link)")
raise
else:
raise ValueError("mode can be only info, list, ")
[docs] def dbinfo(self, database="kegg"):
"""Displays the current statistics of a given database
:param str database: can be one of: kegg (default), brite, module,
disease, drug, environ, ko, genome, compound, glycan, reaction,
rpair, rclass, enzyme, genomes, genes, ligand or any
:attr:`organismIds`.
::
from bioservices import KEGG
s = KEGG()
s.dbinfo("hsa") # human organism
s.dbinfo("T01001") # same as above
s.dbinfo("pathway")
.. versionchanged:: 1.4.1 renamed info method into :meth:`dbinfo`,
which clashes with Logging framework info() method.
"""
self._checkDB(database, mode="info")
res = self.http_get("info/" + database, frmt="txt")
return res
[docs] def list(self, query, organism=None):
"""Returns a list of entry identifiers and associated definition for a given database or a given set of database entries
:param str query: can be one of pathway, brite, module,
disease, drug, environ, ko, genome, compound,
glycan, reaction, rpair, rclass, enzyme, organism
**or** an organism from the :attr:`organismIds` attribute **or** a valid
dbentry (see below). If a dbentry query is provided, organism
should not be used!
:param str organism: a valid organism identifier that can be
provided. If so, database can be only "pathway" or "module". If
not provided, the default value is chosen (:attr:`organism`)
:return: A string with a structure that depends on the query
Here is an example that shows how to extract the pathways IDs related to
the hsa organism::
>>> s = KEGG()
>>> res = s.list("pathway", organism="hsa")
>>> pathways = [x.split()[0] for x in res.strip().split("\\n")]
>>> len(pathways) # as of Dec 2012
261
Note, however, that there are convenient aliases to some of the databases.
For instance, the pathway Ids can also be retrieved as a list from the
:attr:`pathwayIds` attribute (after defining the :attr:`organism` attribute).
.. note:: If you set the query to a valid organism, then the second
argument rganism is irrelevant and ignored.
.. note:: If the query is not a database or an organism, it is supposed
to be a valid dbentries string and the maximum number of entries is 100.
Other examples::
s.list("pathway") # returns the list of reference pathways
s.list("pathway", "hsa") # returns the list of human pathways
s.list("organism") # returns the list of KEGG organisms with taxonomic classification
s.list("hsa") # returns the entire list of human genes
s.list("T01001") # same as above
s.list("hsa:10458+ece:Z5100") # returns the list of a human gene and an E.coli O157 gene
s.list("cpd:C01290+gl:G00092")# returns the list of a compound entry and a glycan entry
s.list("C01290+G00092") # same as above
"""
url = "list"
if query:
# can be something else than a database so we can not use checkDB
# self._checkDB(database, mode="list")
url += "/" + query
if organism:
if organism not in self.organismIds:
self.logging.error("""Invalid organism provided (%s). See the organismIds attribute""" % organism)
raise BioServicesError("Not a valid organism")
if query not in ["pathway", "module"]:
self.logging.error(
"""
If organism is set, then the first argument
(database) must be either 'pathway' or 'module'. You provided %s"""
% query
)
raise
url += "/" + organism
res = self.http_get(url, "txt")
return res
[docs] def find(self, database, query, option=None):
"""finds entries with matching query keywords or other query data in a given database
:param str database: can be one of pathway, module, disease, drug,
environ, ko, genome, compound, glycan, reaction, rpair, rclass,
enzyme, genes, ligand or an organism code (see :attr:`organismIds`
attributes) or T number (see :attr:`organismTnumbers` attribute).
:param str query: See examples
:param str option: If option provided, database can be only 'compound'
or 'drug'. Option can be 'formula', 'exact_mass' or 'mol_weight'
.. note:: Keyword search against brite is not supported. Use /list/brite to
retrieve a short list.
::
# search for pathways that contain Viral in the definition
s.find("pathway", "Viral")
# for keywords "shiga" and "toxin"
s.find("genes", "shiga+toxin")
# for keywords "shiga toxin"
s.find("genes", ""shiga toxin")
# for chemical formula "C7H10O5"
s.find("compound", "C7H10O5", "formula")
# for chemical formula containing "O5" and "C7"
s.find("compound", "O5C7","formula")
# for 174.045 =< exact mass < 174.055
s.find("compound", "174.05","exact_mass")
# for 300 =< molecular weight =< 310
s.find("compound", "300-310","mol_weight")
"""
_valid_options = ["formula", "exact_mass", "mol_weight"]
_valid_db_options = ["compound", "drug"]
self._checkDB(database, mode="find")
url = "find/" + database + "/" + query
if option:
if database not in _valid_db_options:
raise ValueError(
"invalid database. Since option was provided, database must be in %s" % _valid_db_options
)
if option not in _valid_options:
raise ValueError("invalid option. Must be in %s " % _valid_options)
url += "/" + option
res = self.http_get(url, frmt="txt")
return res
[docs] def show_entry(self, entry):
"""Opens URL corresponding to a valid entry
::
s.www_bget("path:hsa05416")
"""
url = "http://www.kegg.jp/dbget-bin/www_bget?" + entry
self.logging.info(url)
webbrowser.open(url)
[docs] def get(self, dbentries, option=None, parse=False):
"""Retrieves given database entries
:param str dbentries: KEGG database entries involving the following
database: pathway, brite, module, disease, drug, environ, ko, genome
compound, glycan, reaction, rpair, rclass, enzyme **or** any organism
using the KEGG organism code (see :attr:`organismIds`
attributes) or T number (see :attr:`organismTnumbers` attribute).
:param str option: one of: aaseq, ntseq, mol, kcf, image, kgml
.. note:: you can add the option at the end of dbentries in which case
the parameter option must not be used (see example)
::
from bioservices import KEGG
s = KEGG()
# retrieves a compound entry and a glycan entry
s.get("cpd:C01290+gl:G00092")
# same as above
s.get("C01290+G00092")
# retrieves a human gene entry and an E.coli O157 gene entry
s.get("hsa:10458+ece:Z5100")
# retrieves amino acid sequences of a human gene and an E.coli O157 gene
s.get("hsa:10458+ece:Z5100/aaseq")
# retrieves the image file of a pathway map
s.get("hsa05130/image")
# same as above
s.get("hsa05130", "image")
Another example here below shows how to save the image of a given pathway::
res = s.get("hsa05130/image")
# same as : res = s.get("hsa05130","image")
f = open("test.png", "w")
f.write(res)
f.close()
.. note:: The input is limited up to 10 entries (KEGG restriction).
"""
_valid_options = ["aaseq", "ntseq", "mol", "kcf", "image", "kgml"]
_valid_db_options = ["compound", "drug"]
# self._checkDB(database, mode="find")
url = "get/" + dbentries
if option:
if option not in _valid_options:
raise ValueError("invalid option. Must be in %s " % _valid_options)
url += "/" + option
res = self.http_get(url, frmt="txt")
if parse is True:
res = self.parse(res)
return res
[docs] def conv(self, target, source):
"""convert KEGG identifiers to/from outside identifiers
:param str target: the target database (e.g., a KEGG organism).
:param str source: the source database (e.g., uniprot) or a valid
dbentries; see below for details.
:return: a dictionary with keys being the source and values being the target.
Here are the rules to set the target and source parameters.
If the second argument is not a **dbentries**, source and target
parameters can be of two types:
#. gene identifiers. If the target is a KEGG Id, then the source
must be one of *ncbi-gi*, *ncbi-geneid* or *uniprot*.
.. note:: source and target can be swapped.
#. chemical substance identifiers. If the target is one of the
following kegg database: drug, compound, glycan then the source
must be one of *pubchem* or *chebi*.
.. note:: again, source and target can be swapped
If the second argument is a **dbentries**, it can be again of two types:
#. gene identifiers. The database used can be one ncbi-gi,
ncbi-geneid, uniprot or any KEGG organism
#. chemical substance identifiers. The database used can be one of
drug, compound, glycan, pubchem or chebi only.
.. note:: if the second argument is a dbentries, target and dbentries
cannot be swapped.
::
# conversion from NCBI GeneID to KEGG ID for E. coli genes
conv("eco","ncbi-geneid")
# inverse of the above example
conv("eco","ncbi-geneid")
#conversion from KEGG ID to NCBI GI
conv("ncbi-gi","hsa:10458+ece:Z5100")
To make it clear by taking another example, you can either convert an
entire database to another (e.g., from uniprot to KEGG Id all human gene
IDs)::
uniprot_ids, kegg_ids = s.conv("hsa", "uniprot")
or a subset by providing a valid **dbentries**::
s.conv("hsa","up:Q9BV86+")
.. warning:: dbentries are not check and are supposed to be correct.
See :meth:`check_idbentries` to help you checking a dbentries.
.. warning:: call to this function may be long. conv("hsa", "uniprot") takes a minute
suprinsingly, conv("uniprot", "hsa") takes just a few seconds.
.. versionchanged:: 1.1
the output is now a dictionary, not a list of tuples
"""
# The second argument may be a source_db or a dbentries so checking
# second argument is kind of tricky because dbentries take lots of
# different form.
# for now, we only check the first argument.
# gene identifiers
isOrg = self.isOrganism(target)
if isOrg is False and target not in [
"ncbi-gi",
"ncbi-geneid",
"uniprot",
"pubchem",
"chebi",
"drug",
"compound",
"glycan",
]:
raise ValueError(
"""
Invalid syntax. target must be a KEGG ID or
one of the allowed database. See documentation og :meth:`conv` for
details"""
)
"""if target in self.organismIds:
if source not in ['ncbi-gi', 'ncbi-geneid', 'uniprot']:
raise ValueError("Invalid source (must be ncbi-gi, ncbi-geneid or uniprot)")
elif target in ['ncbi-gi', 'ncbi-geneid', 'uniprot']:
if source not in self.organismIds:
raise ValueError("Invalid source (must be a valid KEGG organism)")
# for chenical substance identifiers
elif target in ['drug', 'compound', 'glycan']:
if source not in ['chebi', 'pubchem']:
raise ValueError("Invalid source. Must be chebi or pubchem")
elif target in ['chebi', 'pubchem']:
if source not in ['drug', 'compound', 'glycan']:
raise ValueError("Invalid source. Must be drug, compound or glycan")
else:
self.logging.info("arguments not checked")
"""
url = "conv/" + target + "/" + source
res = self.http_get(url, frmt="txt")
try:
t = [x.split("\t")[0] for x in res.strip().split("\n")]
s = [x.split("\t")[1] for x in res.strip().split("\n")]
return dict([(x, y) for x, y in zip(t, s)])
except:
return res
[docs] def link(self, target, source):
"""Find related entries by using database cross-references
:param str target: the target KEGG database or organism (see below for the list).
:param str source: the source KEGG database or organism (see below for
the list) or a valid dbentries involving one of the database; see
below for details.
The valid list of databases is pathway, brite, module, disease, drug, environ,
ko, genome, compound, glycan, reaction, rpair, rclass, enzyme
::
# KEGG pathways linked from each of the human genes
s.link("pathway", "hsa")
# human genes linked from each of the KEGG pathways
s.link("hsa", "pathway")
# KEGG pathways linked from a human gene and an E. coli O157 gene.
s.link("pathway", "hsa:10458+ece:Z5100")
"""
self._checkDB(target, mode="link")
url = "link/" + target + "/" + source
res = self.http_get(url, frmt="txt")
return res
[docs] def entry(self, dbentries):
"""Retrieve entry
There is a weblink service (see http://www.genome.jp/kegg/rest/weblink.html)
Since it is equivalent to :meth:`get`, we do not implement it for now
"""
raise NotImplementedError("Use :meth:`get` instead")
[docs] def show_pathway(self, pathId, scale=None, dcolor="pink", keggid={}, show=True):
"""Show a given pathway inside a web browser
:param str pathId: a valid pathway Id. See :meth:`pathwayIds`
:param int scale: you can scale the image with a value between 0 and 100
:param str dcolor: set the default background color of nodes
:param dict keggid: set color of entries contained in the pathway as
key/value pairs; can also be a list, in which case all nodes have
the same default color (red)
.. note:: if scale is provided, dcolor and keggid are ignored.
::
# show a pathway in the browser
s.show_pathway("path:hsa05416", scale=50)
# Same as above but also highlights some KEGG Ids (red for all)
s.show_pathway("path:hsa05416", dcolor="white",
keggid=['1525', '1604', '2534'])
# You can refine the colors using a dictionary:
s.show_pathway("path:hsa05416", dcolor="white",
keggid={'1525':'yellow,red', '1604':'blue,green', '2534':"blue"})
.. anotherway to higlight: http://www.kegg.jp/pathway/map00010+K00873+C00022
"""
if pathId.startswith("path:"):
pathId = pathId.split(":")[1]
if scale:
scale = int(scale / 100.0 * 100) / 100.0 # just need 2 digits and a value in [0,1]
url = "http://www.kegg.jp/kegg-bin/show_pathway?scale=" + str(scale)
url += "&query=&map=" + pathId
else:
url = "http://www.kegg.jp/kegg-bin/show_pathway?" + pathId
if dcolor:
url += "/default%%3d%s/" % dcolor
if isinstance(keggid, dict):
if len(keggid.keys()) > 0:
for k, v in keggid.items():
if "," in v:
url += "/%s%%09%s/" % (k, v)
else:
url += "/%s%%09,%s/" % (k, v)
elif isinstance(keggid, list):
for k in keggid:
url += "/%s%%09,%s/" % (k, "red")
self.logging.info(url)
if show:
webbrowser.open(url)
return url
[docs] def save_pathway(self, pathId, filename, scale=None, keggid={}, params={}):
"""Save KEGG pathway in PNG format
:param pathId: a valid pathway identifier
:param str filename: output PNG file
:param params: valid kegg params expected
"""
import requests
url = self.show_pathway(pathId, scale, keggid=keggid, show=False)
html_page = requests.post(url, data=params)
html_page = html_page.content.decode()
links_to_png = [x for x in html_page.split() if "png" in x and x.startswith("src")]
link_to_png = links_to_png[0].replace("src=", "").replace('"', "")
r = requests.get("https://www.kegg.jp/{}".format(link_to_png))
if filename is None:
filename = "{}.png".format(pathway_ID)
with open(filename, "wb") as fout:
fout.write(r.content)
[docs] def show_module(self, modId):
"""Show a given module inside a web browser
:param str modId: a valid module Id. See :meth:`moduleIds`
Validity of modId is not checked but if wrong the URL will not open a
proper web page.
"""
if modId.startswith("md:"):
modId = modId.split(":")[1]
url = "http://www.kegg.jp/module/" + modId
self.logging.info(url)
res = webbrowser.open(url)
return res
# wrapper of all databases to ease access to them (buffered)
def _get_db(self):
return KEGG._valid_DB
databases = property(_get_db, doc="Returns list of valid KEGG databases.")
def _get_database(self, dbname, mode=0):
res = self.list(dbname)
assert mode in [0, 1]
return [x.split()[mode] for x in res.split("\n") if len(x)]
def _get_organisms(self):
if self._organisms is None:
self._organisms = self._get_database("organism", 1)
return self._organisms
organismIds = property(_get_organisms, doc="Returns list of organism Ids")
def _get_reactions(self):
if self._reaction is None:
self._reaction = self._get_database("reaction", 0)
return self._reaction
reactionIds = property(_get_reactions, doc="returns list of reaction Ids")
def _get_enzyme(self):
if self._enzyme is None:
self._enzyme = self._get_database("enzyme", 0)
return self._enzyme
enzymeIds = property(_get_enzyme, doc="returns list of enzyme Ids" + _docIds)
def _get_organisms_tnumbers(self):
if self._organisms_tnumbers is None:
self._organisms_tnumbers = self._get_database("organism", 0)
return self._organisms_tnumbers
organismTnumbers = property(_get_organisms_tnumbers, doc="returns list of organisms (T numbers)" + _docIds)
def _get_glycans(self):
if self._glycan is None:
self._glycan = self._get_database("glycan", 0)
return self._glycan
glycanIds = property(_get_glycans, doc="Returns list of glycan Ids" + _docIds)
def _get_brite(self):
if self._brite is None:
self._brite = self._get_database("brite", 0)
return self._brite
briteIds = property(_get_brite, doc="returns list of brite Ids." + _docIds)
def _get_kos(self):
if self._ko is None:
self._ko = self._get_database("ko", 0)
return self._ko
koIds = property(_get_kos, doc="returns list of ko Ids" + _docIds)
def _get_compound(self):
if self._compound is None:
self._compound = self._get_database("compound", 0)
return self._compound
compoundIds = property(_get_compound, doc="returns list of compound Ids" + _docIds)
def _get_drug(self):
if self._drug is None:
self._drug = self._get_database("drug", 0)
return self._drug
drugIds = property(_get_drug, doc="returns list of drug Ids" + _docIds)
# set the default organism used by pathways retrieval
def _get_organism(self):
return self._organism
def _set_organism(self, organism):
if organism in self.organismIds:
self._organism = organism
self._pathway = None
self._module = None
self._ko = None
self._glycan = None
self._compound = None
self._enzyme = None
self._drug = None
self._reaction = None
self._brite = None
else:
self.logging.error("Invalid organism. Check the list in :attr:`organismIds` attribute")
raise
organism = property(_get_organism, _set_organism, doc="returns the current default organism ")
def _get_pathways(self):
if self.organism is None:
self.logging.warning("You must set the organism first (e.g., self.organism = 'hsa')")
return
if self._pathway is None:
res = self.http_get("list/pathway/%s" % self.organism, frmt="txt")
orgs = [x.split()[0] for x in res.split("\n") if len(x)]
self._pathway = orgs[:]
return self._pathway
pathwayIds = property(
_get_pathways,
doc="""returns list of pathway Ids for the default organism.
:attr:`organism` must be set.
::
s = KEGG()
s.organism = "hsa"
s.pathwayIds
""",
)
def _get_modules(self):
if self.organism is None:
self.logging.warning("You must set the organism first (e.g., self.organism = 'hsa')")
return
if self._module is None:
res = self.http_get("list/module/%s" % self.organism)
orgs = [x.split()[0] for x in res.split("\n") if len(x)]
self._module = orgs[:]
return self._module
moduleIds = property(
_get_modules,
doc="""returns list of module Ids for the default organism.
:attr:`organism` must be set.
::
s = KEGG()
s.organism = "hsa"
s.moduleIds
""",
)
[docs] def lookfor_organism(self, query):
"""Look for a specific organism
:param str query: your search term. upper and lower cases are ignored
:return: a list of definition that matches the query
"""
matches = []
definitions = [" ".join(x.split()) for x in self.list("organism").split("\n")]
for i, item in enumerate(definitions):
if query.lower() in item.lower():
matches.append(i)
return [definitions[i] for i in matches]
[docs] def lookfor_pathway(self, query):
"""Look for a specific pathway
:param str query: your search term. upper and lower cases are ignored
:return: a list of definition that matches the query
"""
matches = []
definitions = [" ".join(x.split()) for x in self.list("pathway").split("\n")]
for i, item in enumerate(definitions):
if query.lower() in item.lower():
matches.append(i)
return [definitions[i] for i in matches]
[docs] def get_pathway_by_gene(self, gene, organism):
"""Search for pathways that contain a specific gene
:param str gene: a valid gene Id
:param str organism: a valid organism (e.g., hsa)
:return: list of pathway Ids that contain the gene
::
>>> s.get_pathway_by_gene("7535", "hsa")
['path:hsa04064', 'path:hsa04650', 'path:hsa04660', 'path:hsa05340']
"""
res = self.get(":".join([organism, gene]))
dic = self.parse(res)
if not isinstance(dic, int) and "PATHWAY" in dic.keys():
return dic["PATHWAY"]
else:
self.logging.info("No pathway found ?")
[docs] def parse_kgml_pathway(self, pathwayId, res=None):
"""Parse the pathway in KGML format and returns a dictionary (relations and entries)
:param str pathwayId: a valid pathwayId e.g. hsa04660
:param str res: if you already have the output of the query
get(pathwayId), you can provide it, otherwise it is queried.
:return: a dictionary with relations and entries as keys. Values
of relations is a list of relations, each relation being
dictionary with entry1, entry2, link, value, name. The
list os entries is a list of dictionary as well.
Entry contains contains more details about the entry found in the
relation. See example below for details.
::
>>> res = s.parse_kgml_pathway("hsa04660")
>>> set([x['name'] for x in res['relations']])
>>> res['relations'][-1]
{'entry1': u'15',
'entry2': u'13',
'link': u'PPrel',
'name': u'phosphorylation',
'value': u'+p'}
>>> set([x['link'] for x in res['relations']])
set([u'PPrel', u'PCrel'])
>>> # get information about an entry :
>>> res['entries'][4]
.. seealso:: `KEGG API <http://www.kegg.jp/kegg/xml/docs/>`_
"""
output = {"relations": [], "entries": []}
# Fixing bug #24 assembla
if res is None:
res = self.easyXML(self.get(pathwayId, "kgml"))
else:
res = self.easyXML(res)
# read and parse the entries
entries = [x for x in res.findAll("entry")]
for entry in entries:
output["entries"].append(
{
"id": entry.get("id"),
"name": entry.get("name"),
"type": entry.get("type"),
"link": entry.get("link"),
"gene_names": entry.find("graphics").get("name"),
}
)
relations = [(x.get("entry1"), x.get("entry2"), x.get("type")) for x in res.findAll("relation")]
subtypes = [x.findAll("subtype") for x in res.findAll("relation")]
assert len(subtypes) == len(relations)
for relation, subtype in zip(relations, subtypes):
if len(subtype) == 0: # nothing to do with the species ??? TODO
pass
else:
for this in subtype:
value = this.get("value")
name = this.get("name")
output["relations"].append(
{
"entry1": relation[0],
"entry2": relation[1],
"link": relation[2],
"value": value,
"name": name,
}
)
# we need to map back to KEgg IDs...
return output
[docs] def pathway2sif(self, pathwayId, uniprot=True):
"""Extract protein-protein interaction from KEGG pathway to a SIF format
.. warning:: experimental Not tested on all pathway. should be move to
another package such as cellnopt
:param str pathwayId: a valid pathway Id
:param bool uniprot: convert to uniprot Id or not (default is True)
:return: a list of relations (A 1 B) for activation and (A -1 B) for
inhibitions
This is longish due to the conversion from KEGGIds to UniProt.
This method can be useful to provide prior knowledge network to software
such as CellNOpt (see http://www.cellnopt.org)
"""
res = self.parse_kgml_pathway(pathwayId)
sif = []
for rel in res["relations"]:
# types can be PPrel (protein-protein interaction only
if rel["link"] != "PPrel":
continue
if rel["name"] == "activation":
Id1 = rel["entry1"]
Id2 = rel["entry2"]
name1 = res["entries"][[x["id"] for x in res["entries"]].index(Id1)]["name"]
name2 = res["entries"][[x["id"] for x in res["entries"]].index(Id2)]["name"]
type1 = res["entries"][[x["id"] for x in res["entries"]].index(Id1)]["type"]
type2 = res["entries"][[x["id"] for x in res["entries"]].index(Id2)]["type"]
# print("names:", rel, name1, name2)
# print(type1, type2)
if type1 != "gene" or type2 != "gene":
continue
if uniprot:
try:
# FIXME sometimes, there are more than one name
name1 = name1.split()[0]
name2 = name2.split()[0]
name1 = self.conv("uniprot", name1)[name1]
name2 = self.conv("uniprot", name2)[name2]
except Exception:
self.logging.info(name1)
self.logging.info(name2)
raise Exception
# print(name1, 1, name2)
sif.append([name1, 1, name2])
elif rel["name"] == "inhibition":
Id1 = rel["entry1"]
Id2 = rel["entry2"]
name1 = res["entries"][[x["id"] for x in res["entries"]].index(Id1)]["name"]
name2 = res["entries"][[x["id"] for x in res["entries"]].index(Id2)]["name"]
type1 = res["entries"][[x["id"] for x in res["entries"]].index(Id1)]["type"]
type2 = res["entries"][[x["id"] for x in res["entries"]].index(Id2)]["type"]
# print("names:", rel, name1, name2)
# print(type1, type2)
if type1 != "gene" or type2 != "gene":
continue
if uniprot:
name1 = name1.split()[0]
name2 = name2.split()[0]
name1 = self.conv("uniprot", name1)[name1]
name2 = self.conv("uniprot", name2)[name2]
# print(name1, -1, name2)
sif.append([name1, -1, name2])
else:
pass
# print("#", rel['entry1'], rel['name'], rel['entry2'])
return sif
[docs] def parse(self, entry):
"""See :class:`KEGGParser` for details
Parse entry returned by :meth:`get`
::
k = KEGG()
res = k.get("hsa04150")
d = k.parse(res)
"""
parse = dict()
try:
parse = self.keggParser.parse(entry)
except:
self.logging.warning("Could not parse the entry correctly.")
return parse
[docs]class KEGGParser(object):
"""This is an extension of the :class:`KEGG` class to ease parsing of dbentries
This class provides a generic method :meth:`parse` that will read the output
of a dbentry returned by :meth:`KEGG.get` and converts it into a dictionary ready to use.
The :meth:`parse` method parses any entry. It can be a pathway, a gene, a compound...
::
from bioservices import *
s = KEGG()
# Retrieve a KEGG entry
res = s.get("hsa04150")
# parse it
d = s.parse(res)
As a pedagogical example, you can then further process this dictionary. Here below, we convert
the gene Ids found in the pathway into UniProt Ids::
# Get the KEGG Ids in the pathway
kegg_geneIds = [x for x in d['GENE']]
# Convert them
db_up, db_kegg = s.conv("hsa", "uniprot")
# Get the corresponding uniprot Ids
indices = [db_kegg.index("hsa:%s" % x ) for x in kegg_geneIds]
uniprot_geneIds = [db_up[x] for x in indices]
However, you could also have done it simply as follows::
kegg_geneIds = [x for x in d['gene']]
uprot_geneIds = [s.parse(s.get("hsa:"+str(e)))['DBLINKS']["UniProt:"] for e in d['GENE']]
.. note:: The 2 outputs are slightly different.
.. seealso:: http://www.kegg.jp/kegg/rest/dbentry.html
"""
def __init__(self, verbose=False):
super(KEGGParser, self).__init__()
self.logging = Logging("bioservices:keggparser", verbose)
[docs] def parse(self, res):
"""Parse to any outputs returned by :meth:`KEGG.get`
:param str res: output of a :meth:`KEGG.get`.
:return: a dictionary. Keys are those found in the KEGG entry (e.g.,
REACTION, ENTRY, EQUATION, ...). The format of each value is
various. It could be a string, a list (of strings generally),
a dictionary, a float depending on the key. Depdending on
the type of the entry (e.g., module, pathway), the
type of the value may also differ (e.g., REACTION can be either
a list of reactions or a dictionary depending on the content)
::
>>> # Parses a drug entry
>>> res = s.get("dr:D00001")
>>> # Parses a pathway entry
>>> res = s.get("path:hsa10584")
>>> # Parses a module entry
>>> res = s.get("md:hsa_M00554")
>>> # Parses a disease entry
>>> res = s.get("ds:H00001")
>>> # Parses a environ entry
>>> res = s.get("ev:E00001")
>>> # Parses Orthology entry
>>> res = s.get("ko:K00001")
>>> # Parses a Genome entry
>>> res = s.get('genome:T00001')
>>> # Parses a gene entry
>>> res = s.get("hsa:1525")
>>> # Parses a compound entry
>>> s.get("cpd:C00001")
>>> # Parses a glycan entry
>>> res = s.get("gl:G00001")
>>> # Parses a reaction entry
>>> res = s.get("rn:R00001")
>>> # Parses a rpair entry
>>> res = s.get("rp:RP00001")
>>> # Parses a rclass entry
>>> res = s.get("rc:RC00001")
>>> # Parses an enzyme entry
>>> res = s.get('ec:1.1.1.1')
>>> d = s.parse(res)
"""
parser = dict()
dbentry = "?"
# get() should return a large amount of text data
if not res or not isinstance(res, str):
raise ValueError("Unexpected input, unable to parse data of type %s" % type(res))
if res[:5] == "ENTRY":
dbentry = res.split("\n")[0].split(None, 2)[2]
else:
raise ValueError("Unable to parse data, it does not comform to the expected KEGG format")
try:
parser = self._parse(res)
except Exception as err:
self.logging.warning("Could not parse the entry %s correctly" % dbentry)
self.logging.warning(err)
return parser
def _parse(self, res):
keys = [x.split(" ")[0] for x in res.split("\n") if len(x) and x[0] != " " and x != "///"]
# let us go line by to not forget anything and know which entries are
# found in the RHS. We may have duplicated once as can be found in th
# keys variable as well.
entries = []
entry = ""
start = True
for line in res.split("\n"):
if line == "///":
entries.append(entry)
elif len(line) == 0:
pass
elif line[0] != " ":
if start == True:
start = False
else:
entries.append(entry)
entry = line[:]
else:
entry += "\n" + line[:]
# we can now look at each entry and create a dictionary.
# The dictionary will contain as key the name found in the LHS
# e.g., REACTION and the value will be either the entry content
# as a string or a list of strings if the key is not unique
# e.g., for references. This could be a bit annoying since
# for example References could appear only once if some cases.
# This can be tested though by checking the type
output = {}
for entry in entries:
name = entry.split("\n")[0].split()[0]
if keys.count(name) == 1:
output[name] = entry[:]
else:
if name in output.keys():
output[name].append(entry[:])
else:
output[name] = [entry[:]]
# remove name that are now the keys of the dictionary anyway
# if the values is not a list
for k, v in output.items():
if k in ["CHROMOSOME", "TAXONOMY"]:
continue
try:
output[k] = output[k].strip().replace(k, "", 1).strip()
except: # skip the lists
pass
# Now, let us do the real stuff.
# This is tricky since format is not consistent with the names e,g
# REACTIONS could be sometimes a list of names and sometimes list
# of reactions with their description.
self.raw_parsing = copy.deepcopy(output)
for key, value in output.items():
# convert to a dict
if key == "STATISTICS":
data = [x.split(":", 1) for x in output[key].split("\n")]
data = dict([(x[0].strip(), float(x[1].strip())) for x in data])
output[key] = data
# strip only: expecting a single line (string)
elif key in [
"ANNOTATION",
"CATEGORY",
"CLASS",
"COMPOSITION",
"CREATED",
"DATA_SOURCE",
"DEFINITION",
"DESCRIPTION",
"ENTRY",
"EFFICACY",
"EQUATION",
"FORMULA",
"KEYWORDS",
"HISTORY",
"MASS",
"ORGANISM",
"ORG_CODE",
"POSITION",
"RCLASS",
"KO_PATHWAY",
"SYMBOL",
"STRUCTURE",
"TYPE",
"SYSNAME",
"REL_PATHWAY",
]:
# get rid of \n
if "\n" in value:
# happens in description path:hsa04915
value = value.replace("\n", " ")
# nothing to do here except strip
output[key] = value.strip()
# print(key)
# list : set of lines. Could be split by ; character but we use the
# \n instead to be sure
# COMMENT is sometimes on several lines
elif key in [
"NAME",
"REMARK",
"ACTIVITY",
"COMMENT",
"ORIGINAL_DB",
"SUBSTRATE",
"PRODUCT",
"ENV_FACTOR",
"PATHOGEN",
]:
output[key] = [x.strip() for x in value.split("\n")]
# list: long string splitted into items and therefore converted to a list
elif key in ["ENZYME", "REACTION", "RPAIR", "RELATEDPAIR", "ALL_REAC"]:
# RPAIR/rn:R00005 should be a dict if "_" found
# REACTION/md:hsa_M00554 should be a dict if '->' found
if "->" in value or "_" in value:
kp = {}
for line in value.split("\n"):
try:
k, v = line.strip().split(None, 1)
except:
# self.logging.warning("empty line in %s %s" % (key, line))
k = line.strip()
v = ""
kp[k] = v
output[key] = kp.copy()
else:
output[key] = [x.strip() for x in value.split()]
# transform to dictionary by splitting line and then split on ";"
elif key in "GENE":
kp = {}
for i, line in enumerate(value.split("\n")):
try:
k, v = line.strip().split(None, 1)
except:
# self.logging.warning("empty line in %s %s" % (key, line))
k = line.strip()
v = ""
kp[k] = v
output[key] = kp.copy()
elif key in [
"DRUG",
"ORTHOLOGY",
"COMPOUND",
"RMODULE",
"DISEASE",
"PATHWAY_MAP",
"STR_MAP",
"OTHER_MAP",
"PATHWAY",
"MODULE",
"GENES",
]:
kp = {}
for line in value.split("\n"):
try: # empty orthology in rc:RC00004
k, v = line.strip().split(None, 1)
except:
# self.logging.warning("empty line in %s %s" % (key, line))
k = line.strip()
v = ""
if k.endswith(":"):
k = k.strip().rstrip(":")
kp[k] = v
output[key] = kp.copy()
elif key in ["NETWORK"]:
kp = {"ELEMENT": {}}
ELEMENT = False
for line in value.split("\n"):
try: # empty orthology in rc:RC00004
k, v = line.strip().split(None, 1)
except:
# self.logging.warning("empty line in %s %s" % (key, line))
k = line.strip()
v = ""
if k.endswith(":"):
k = k.strip().rstrip(":")
if k == "ELEMENT":
ELEMENT = True
k, v = v.split(None, 1)
if ELEMENT is False:
kp[k] = v
else:
kp["ELEMENT"][k] = v
output[key] = kp.copy()
# list of dictionaries
elif key == "REFERENCE":
# transform to a list since you may have several entries
newvalue = [self._interpret_references(this) for this in self._tolist(value)]
output[key] = newvalue
# list of dictionaries
elif key == "PLASMID":
newvalue = [self._interpret_plasmid(this) for this in self._tolist(value)]
output[key] = newvalue
# list of dictionaries
elif key == "CHROMOSOME":
newvalue = [self._interpret_chromosome(this) for this in self._tolist(value)]
output[key] = newvalue
# list of dictionaries
elif key == "TAXONOMY":
newvalue = [self._interpret_taxonomy(this) for this in self._tolist(value)]
output[key] = newvalue
elif key == "SEQUENCE":
newvalue = [self._interpret_sequence(this) for this in self._tolist(value)]
output[key] = newvalue
# dictionary, interpreted as follows
# on each line, there is an identifier followed by : character
# looks like there is just one line...
elif key in ["DRUG_TARGET", "MOTIF", "PDB"]:
# STRUCTURE PDB can be long and span over several lines. e.g.,
# hsa:1525
new = {}
# somehow the import at the top is lost
import re
value = re.sub("\n {6,20}", " ", value)
try:
for line in value.split("\n"):
thiskey, content = line.split(":", 1)
new[thiskey] = content
output[key] = new
except ValueError:
output[key] = value
elif key in ["DBLINKS", "INTERACTION", "METABOLISM"]:
# D01441 for metabolism
# DBLINKS for C00624 should work out of the box
new = {}
import re
value = re.sub("\n {12,12+1}", "\n", value)
# Sometimes (e.g. INTERACTION in D00136) values are empty
# see issue #85
if len(value):
for line in value.split("\n"):
thiskey, content = line.strip().split(":", 1)
new[thiskey] = content.strip()
output[key] = new
else:
output[key] = value
# floats
elif key in ["EXACT_MASS", "MOL_WEIGHT"]:
output[key] = float(value)
# get rid of the length
elif key in ["AASEQ", "NTSEQ"]:
output[key] = value.split("\n", 1)[1].replace("\n", "")
output[key] = output[key].replace(" ", "")
elif key.startswith("ENTRY"):
newvalue = self._interpret_entry(value)
output[key] = newvalue.strip()
# extract list of strings from structure. These strings are not
# interpreted
elif key in ["ATOM", "BOND", "NODE", "EDGE", "ALIGN", "RDM"]:
# starts with a number that defines number of entries. Let us
# get rid of that number and then send a list
output[key] = self._interpret_enumeration(output[key])
# not interpreted
elif key in [
"BRACKET",
"COMPONENT",
"SOURCE",
"BRITE",
"TARGET",
"CARCINOGEN",
"MARKER",
]: # do not interpret to keep structure
pass
else:
self.logging.warning(
"""Found keyword %s, which has not special
parsing for now. please report this issue with the KEGG
identifier (%s) into github.com/bioservices. Thanks T.C."""
% (key, output["ENTRY"])
)
return output
def _interpret_enumeration(self, data):
N = data.strip().split("\n")[0]
# must be a number
N = int(N)
lines = data.strip().split("\n")[1:]
lines = [line.strip() for line in lines]
if len(lines) != N:
self.logging.warning("number of lines not as expected in %s" % data)
if N == 0:
return []
else:
return lines
def _tolist(self, value):
# transform to a list since you may have several entries
if isinstance(value, list) is False:
value = [value]
return value
def _interpret_entry(self, data):
res = {}
for this in data.split("\n"):
if this.strip().startswith("ENTRY"):
pass
elif this.strip().startswith("COMPOUND"):
res["COMPOUND"] = this.strip().split(None, 1)[1]
elif this.strip().startswith("ATOM"):
res["AUTHORS"] = this.strip().split(None, 1)[1]
elif this.strip().startswith("TITLE"):
res["TITLE"] = this.strip().split(None, 1)[1]
return res
def _interpret_sequence(self, data):
fields = ["GENE", "ORGANISM", "TYPE"]
current_field = "SEQUENCE"
res = dict({current_field: ""})
for this in data.split("\n"):
this = this.strip()
for f in fields:
if this.startswith(f):
fields.remove(f)
current_field = f
this = this.split(None, 1)[1]
res[current_field] = ""
break
res[current_field] += this + " "
if res["SEQUENCE"] == "":
res.pop("SEQUENCE")
for k in res.keys():
res[k] = res[k].strip()
return res
# changed v1.4.18 issue #79
"""for this in data.split("\n"):
if this.strip().startswith("GENE"):
res['GENE'] = this.strip().split(None,1)[1]
gene = True
elif this.strip().startswith('ORGANISM'):
res['ORGANISM'] = this.strip().split(None,1)[1]
gene = False
elif this.strip().startswith('TYPE'):
res['TYPE'] = this.strip().split(None, 1)[1]
gene = False
elif gene is True:
res['GENE'] += this.strip()
else:
assert gene is False
res['SEQUENCE'] = this.strip()
return res
"""
"""
[u' 0 Mtk 1 Mtd 2 Mad 3 Man 4 Mte 5 Mtk 6 Mtd 7 Man',
u' GENE 0-2 mycAI [UP:Q83WF0]; 3 mycAII [UP:Q83WE9]; 4-5 mycAIII',
u' [UP:Q83WE8]; 6 mycAIV [UP:Q83WE7]; 7 mycAV [UP:Q83WE6]',
u' ORGANISM Micromonospora griseorubida',
u' TYPE PK']
"""
def _interpret_taxonomy(self, data):
res = {}
for this in data.split("\n"):
if this.strip().startswith("TAXONOMY"):
res["TAXONOMY"] = this.strip()
elif this.strip().startswith("LINEAGE"):
res["LINEAGE"] = this.strip().split(None, 1)[1]
return res
def _interpret_references(self, data):
res = {}
for this in data.split("\n"):
fields = this.strip().split(None, 1)
if len(fields) <= 1:
continue
if fields[0] in ("REFERENCE", "AUTHORS", "TITLE", "JOURNAL"):
res[fields[0]] = fields[1]
# if this.strip().startswith("REFERENCE"):
# res['REFERENCE'] = this.strip().split(None,1)[1]
# elif this.strip().startswith("JOURNAL"):
# res['JOURNAL'] = this.strip().split(None,1)[1]
# elif this.strip().startswith("AUTHORS"):
# res['AUTHORS'] = this.strip().split(None,1)[1]
# elif this.strip().startswith("TITLE"):
# res['TITLE'] = this.strip().split(None,1)[1]
return res
def _interpret_plasmid(self, data):
res = {}
for this in data.split("\n"):
if this.strip().startswith("PLASMID"):
res["PLASMID"] = this.strip().split(None, 1)[1]
elif this.strip().startswith("LENGTH"):
res["LENGTH"] = this.strip().split(None, 1)[1]
elif this.strip().startswith("SEQUENCE"):
res["SEQUENCE"] = this.strip().split(None, 1)[1]
return res
def _interpret_chromosome(self, data):
res = {}
for this in data.split("\n"):
if this.strip().startswith("CHROMOSOME"):
try:
res["CHROMOSOME"] = this.strip().split(None, 1)[1]
except:
# genome:T00012 has no name
res["CHROMOSOME"] = this.strip()
elif this.strip().startswith("LENGTH"):
res["LENGTH"] = this.strip().split(None, 1)[1]
elif this.strip().startswith("SEQUENCE"):
res["SEQUENCE"] = this.strip().split(None, 1)[1]
return res
class KEGGTools(KEGG):
"""Load all genes from the database.
::
k = kegg.KEGGTools()
k.load_genes("hsa")
genes = k.scan_genes()
"""
def __init__(self, verbose=False, organism="hsa"):
self.kegg = KEGG()
self.parser = KEGGParser()
self.kegg.logging.info("Initialisation. Please wait")
self.load_genes(organism)
def load_genes(self, organism):
res = self.kegg.list(organism)
self.genes = [x.split("\t")[0] for x in res.strip().split("\n")]
return self.genes
def scan_genes(self):
from easydev import progress_bar
pb = progress_bar(len(self.genes))
import time
genes = {}
for i, gene in enumerate(self.genes):
genes[gene] = self.parser.parse(self.kegg.get(self.genes[i]))
pb.animate(i, time.time() - pb.start)
return genes
def load_reactions(self, organism):
reactions = self.kegg.list("reaction")
self.reactions = [x.split()[0] for x in reactions.split("\n") if len(x)]
return self.reactions
def scan_reactions(self):
from easydev import progress_bar
pb = progress_bar(len(self.reactions))
import time
reactions = {}
for i, this in enumerate(self.reactions):
reactions[this] = self.parser.parse(self.kegg.get(self.reactions[i]))
pb.animate(i, time.time() - pb.start)
return reactions