# Source code for bioservices.pubchem

#
#  This file is part of bioservices software
#
#  Copyright (c) 2013-2014 - EBI-EMBL
#
#  File author(s):
#      Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: https://github.com/cokelaer/bioservices
#  documentation: http://packages.python.org/bioservices
#
##############################################################################
"""Interface to the PubChem PUG REST web service

.. topic:: What is PubChem?

    :URL: https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest

    .. highlights::

        PubChem is the world's largest collection of freely accessible chemical
        information. The PUG REST (Power User Gateway REST) API provides
        programmatic access to PubChem's compound, substance and assay data.

        -- PubChem web site


"""
import time
from urllib.parse import urlencode

from bioservices.services import REST

__all__ = ["PubChem", "COMPOUND_PROPERTIES", "XREF_TYPES"]

#: Properties available via the ``/property/`` endpoint of the PUG REST API.
#:
#: The names are sent verbatim (comma-joined) in the request URL. The complete
#: list is what :meth:`PubChem.get_properties` requests when its ``properties``
#: argument is ``None``.
COMPOUND_PROPERTIES = [
    "MolecularFormula",
    "MolecularWeight",
    "CanonicalSMILES",
    "IsomericSMILES",
    "InChI",
    "InChIKey",
    "IUPACName",
    "Title",
    "XLogP",
    "ExactMass",
    "MonoisotopicMass",
    "TPSA",
    "Complexity",
    "Charge",
    "HBondDonorCount",
    "HBondAcceptorCount",
    "RotatableBondCount",
    "HeavyAtomCount",
    "IsotopeAtomCount",
    "AtomStereoCount",
    "DefinedAtomStereoCount",
    "UndefinedAtomStereoCount",
    "BondStereoCount",
    "DefinedBondStereoCount",
    "UndefinedBondStereoCount",
    "CovalentUnitCount",
    # Entries below mostly carry a ``3D`` suffix in their name.
    "Volume3D",
    "XStericQuadrupole3D",
    "YStericQuadrupole3D",
    "ZStericQuadrupole3D",
    "FeatureCount3D",
    "FeatureAcceptorCount3D",
    "FeatureDonorCount3D",
    "FeatureAnionCount3D",
    "FeatureCationCount3D",
    "FeatureRingCount3D",
    "FeatureHydrophobeCount3D",
    # NOTE(review): could not confirm this name in the PUG REST property
    # table -- verify it is accepted by the service, since one rejected name
    # may fail the whole "all properties" default request. TODO confirm.
    "ConformerDependentDescriptorCount",
    "ConformerCount3D",
    "Fingerprint2D",
]

#: Valid cross-reference types for the ``/xrefs/`` endpoint of the PUG REST API.
#:
#: Provided as a reference for callers of :meth:`PubChem.get_xrefs`; that
#: method forwards the requested type to the service without checking it
#: against this list.
#:
#: NOTE(review): some entries may not match the cross-reference tags in the
#: current PUG REST documentation -- verify against the service docs.
XREF_TYPES = [
    "RegistryID",
    "RN",
    "PubMedID",
    "MMDBID",
    "DBURL",
    "SBURL",
    "AmericanChemicalSocietyID",
    "WikipediaURL",
    "PatentID",
    "GeneID",
    "ProteinGI",
    "TaxonomyID",
    "MIMID",
    "BioSystemID",
    "ReactomeID",
    "BioCycID",
]


class PubChem:
    """Interface to the `PubChem <https://pubchem.ncbi.nlm.nih.gov>`_ PUG REST service.

    The PubChem PUG REST API provides access to compound, substance and assay
    data stored in PubChem. URL structure follows the pattern::

        https://pubchem.ncbi.nlm.nih.gov/rest/pug/{domain}/{namespace}/{identifier}/{operation}/{format}

    Example usage::

        from bioservices import PubChem

        p = PubChem()

        # Get CIDs for aspirin by name
        cids = p.get_cids_by_name("aspirin")

        # Get compound record by CID
        record = p.get_compound_by_cid(2244)

        # Get specific properties for aspirin (CID 2244)
        props = p.get_properties(2244, properties=["MolecularFormula", "MolecularWeight"])

        # Get synonyms for aspirin
        synonyms = p.get_synonyms(2244)

    .. seealso:: https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest
    """

    _url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"

    # Namespaces whose identifier travels in the POST body instead of the URL
    # path, because SMILES / InChI strings contain URL-unsafe characters.
    _POST_NAMESPACES = ("smiles", "inchi")

    def __init__(self, verbose=False, cache=False):
        """**Constructor**

        :param bool verbose: set to False to prevent informative messages
        :param bool cache: set to True to cache requests
        """
        self.services = REST(name="PubChem", url=PubChem._url, verbose=verbose, cache=cache)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _operation_of(path, n_input_segments):
        """Return the operation part of a request *path*.

        :param str path: request path ending with the format segment
        :param int n_input_segments: number of leading segments describing the
            input (3 for ``domain/namespace/identifier``, 2 when the
            identifier is sent in a POST body)
        :return: operation segments joined by ``/``; empty string when the
            path has no operation (full-record retrieval)
        """
        return "/".join(path.rstrip("/").split("/")[n_input_segments:-1])

    def _wait_for_result(self, res, path, frmt, max_attempts=10, interval=3, operation=None):
        """Poll for asynchronous results when PubChem returns a Waiting response.

        Some PubChem requests (e.g. formula searches or large cross-domain
        look-ups) may return a ``Waiting`` response containing a ``ListKey``.
        This method retries the request using the ``listkey`` endpoint until
        the real results are available.

        :param res: initial response (may be a Waiting dict)
        :param str path: original request path
        :param str frmt: response format
        :param int max_attempts: maximum polling attempts before giving up
        :param int interval: seconds to wait between polling attempts
        :param str operation: operation to request from the ``listkey``
            endpoint (e.g. ``"cids"`` or ``"property/MolecularWeight"``; an
            empty string means "full record"). When ``None`` the
            second-to-last segment of *path* is used, which is only correct
            for single-segment operations.
        :return: final result dict (or the last Waiting response if timed out)
        """
        # Anything that is not a Waiting dict is already the final answer.
        if not (isinstance(res, dict) and "Waiting" in res):
            return res

        parts = path.rstrip("/").split("/")
        if len(parts) < 2:
            return res
        if operation is None:
            operation = parts[-2]

        # Everything but the list key is loop-invariant. An empty operation
        # must not leave a double slash in the poll path.
        prefix = f"{parts[0]}/listkey"
        suffix = "/".join(segment for segment in (operation, frmt.upper()) if segment)

        for _ in range(max_attempts):
            waiting = res["Waiting"]
            if not isinstance(waiting, dict) or "ListKey" not in waiting:
                break
            list_key = waiting["ListKey"]
            time.sleep(interval)
            res = self.services.http_get(f"{prefix}/{list_key}/{suffix}", frmt=frmt)
            if not (isinstance(res, dict) and "Waiting" in res):
                break
        return res

    def _get(self, path, frmt="json"):
        """Perform a GET request to the PUG REST API.

        :param str path: URL path appended to the base URL, laid out as
            ``domain/namespace/identifier[/operation...]/FORMAT``
        :param str frmt: response format (json, xml, txt, ...)
        :return: parsed response
        """
        res = self.services.http_get(path, frmt=frmt)
        return self._wait_for_result(res, path, frmt, operation=self._operation_of(path, 3))

    def _post(self, path, data, frmt="json"):
        """Perform a POST request to the PUG REST API.

        POST is used when the identifier may contain characters that cannot be
        safely embedded in a URL (e.g. SMILES or InChI strings).

        :param str path: URL path appended to the base URL, laid out as
            ``domain/namespace[/operation...]/FORMAT``
        :param data: form fields as a mapping (e.g. ``{"smiles": "CC(=O)O"}``),
            which is URL-encoded here, or an already URL-encoded string,
            which is sent unchanged
        :param str frmt: response format (json, xml, ...)
        :return: parsed response
        """
        if not isinstance(data, (str, bytes)):
            # The body is declared as form-urlencoded below, so it has to be
            # percent-encoded: a raw "+" (charges in SMILES/InChI) would be
            # decoded as a space by the server.
            data = urlencode(data)
        res = self.services.http_post(
            path,
            frmt=frmt,
            data=data,
            headers={
                "User-Agent": self.services.getUserAgent(),
                "Accept": self.services.content_types[frmt],
                "Content-Type": "application/x-www-form-urlencoded",
            },
        )
        return self._wait_for_result(res, path, frmt, operation=self._operation_of(path, 2))

    def _compound_request(self, namespace, identifier, operation, frmt):
        """Run a compound *operation*, choosing GET or POST from *namespace*.

        :param str namespace: identifier type (``"cid"``, ``"name"``, ...)
        :param identifier: compound identifier
        :param str operation: operation path (e.g. ``"synonyms"``)
        :param str frmt: response format
        :return: parsed response
        """
        if namespace in self._POST_NAMESPACES:
            return self._post(
                f"compound/{namespace}/{operation}/{frmt.upper()}",
                data={namespace: identifier},
                frmt=frmt,
            )
        return self._get(f"compound/{namespace}/{identifier}/{operation}/{frmt.upper()}", frmt=frmt)

    # ------------------------------------------------------------------
    # Compound lookup – return CIDs
    # ------------------------------------------------------------------

    def get_cids_by_name(self, name, frmt="json"):
        """Return CIDs for a compound name.

        :param str name: compound name (e.g. ``"aspirin"``)
        :param str frmt: response format (default ``"json"``)
        :return: dict with ``IdentifierList`` key containing ``CID`` list

        Example::

            p.get_cids_by_name("aspirin")
        """
        return self._get(f"compound/name/{name}/cids/{frmt.upper()}", frmt=frmt)

    def get_cids_by_smiles(self, smiles, frmt="json"):
        """Return CIDs for a SMILES string.

        Uses a POST request with a URL-encoded body so that special characters
        in the SMILES (``+``, ``#``, ``=``, ...) are transmitted correctly.

        :param str smiles: SMILES string (e.g. ``"CC(=O)Oc1ccccc1C(=O)O"``)
        :param str frmt: response format (default ``"json"``)
        :return: dict with ``IdentifierList`` key containing ``CID`` list

        Example::

            p.get_cids_by_smiles("CC(=O)Oc1ccccc1C(=O)O")
        """
        return self._compound_request("smiles", smiles, "cids", frmt)

    def get_cids_by_inchi(self, inchi, frmt="json"):
        """Return CIDs for an InChI string.

        Uses a POST request with a URL-encoded body to safely transmit InChI
        strings that contain special characters.

        :param str inchi: InChI string
        :param str frmt: response format (default ``"json"``)
        :return: dict with ``IdentifierList`` key containing ``CID`` list
        """
        return self._compound_request("inchi", inchi, "cids", frmt)

    def get_cids_by_inchikey(self, inchikey, frmt="json"):
        """Return CIDs for an InChIKey.

        :param str inchikey: InChIKey (e.g. ``"BSYNRYMUTXBXSQ-UHFFFAOYSA-N"``)
        :param str frmt: response format (default ``"json"``)
        :return: dict with ``IdentifierList`` key containing ``CID`` list

        Example::

            p.get_cids_by_inchikey("BSYNRYMUTXBXSQ-UHFFFAOYSA-N")
        """
        return self._get(f"compound/inchikey/{inchikey}/cids/{frmt.upper()}", frmt=frmt)

    def get_cids_by_formula(self, formula, frmt="json"):
        """Return CIDs for a molecular formula.

        :param str formula: molecular formula (e.g. ``"C9H8O4"``)
        :param str frmt: response format (default ``"json"``)
        :return: dict with ``IdentifierList`` key containing ``CID`` list

        Example::

            p.get_cids_by_formula("C9H8O4")
        """
        return self._get(f"compound/formula/{formula}/cids/{frmt.upper()}", frmt=frmt)

    # ------------------------------------------------------------------
    # Compound records
    # ------------------------------------------------------------------

    def get_compound_by_cid(self, cid, frmt="json"):
        """Return the full compound record for a CID.

        :param cid: PubChem compound identifier (integer or string)
        :param str frmt: response format (default ``"json"``)
        :return: full compound record

        Example::

            p.get_compound_by_cid(2244)  # aspirin
        """
        return self._get(f"compound/cid/{cid}/{frmt.upper()}", frmt=frmt)

    def get_compound_by_name(self, name, frmt="json"):
        """Return the full compound record for a compound name.

        :param str name: compound name (e.g. ``"aspirin"``)
        :param str frmt: response format (default ``"json"``)
        :return: full compound record

        Example::

            p.get_compound_by_name("aspirin")
        """
        return self._get(f"compound/name/{name}/{frmt.upper()}", frmt=frmt)

    # ------------------------------------------------------------------
    # Compound properties, synonyms and descriptions
    # ------------------------------------------------------------------

    def get_properties(self, identifier, namespace="cid", properties=None, frmt="json"):
        """Return computed properties for a compound.

        :param identifier: compound identifier (e.g. CID ``2244`` or name
            ``"aspirin"``)
        :param str namespace: identifier type – one of ``"cid"``, ``"name"``,
            ``"smiles"``, ``"inchi"``, ``"inchikey"`` (default ``"cid"``).
            ``"smiles"`` and ``"inchi"`` identifiers are sent via POST.
        :param properties: property name(s) to retrieve. Either a
            comma-separated string or an iterable (list, tuple, ...) of names
            from :data:`~bioservices.pubchem.COMPOUND_PROPERTIES`. Defaults to
            all properties when ``None``.
        :param str frmt: response format (default ``"json"``)
        :return: dict containing ``PropertyTable`` with the requested properties

        Example::

            p.get_properties(2244, properties=["MolecularFormula", "MolecularWeight"])
            p.get_properties("aspirin", namespace="name", properties="InChIKey,XLogP")
        """
        if properties is None:
            prop_str = ",".join(COMPOUND_PROPERTIES)
        elif isinstance(properties, str):
            prop_str = properties
        else:
            # Any iterable of names; a tuple would otherwise end up in the
            # URL as its repr.
            prop_str = ",".join(properties)
        return self._compound_request(namespace, identifier, f"property/{prop_str}", frmt)

    def get_synonyms(self, identifier, namespace="cid", frmt="json"):
        """Return synonyms for a compound.

        :param identifier: compound identifier
        :param str namespace: identifier type (default ``"cid"``)
        :param str frmt: response format (default ``"json"``)
        :return: dict containing ``InformationList`` with synonym lists

        Example::

            p.get_synonyms(2244)
        """
        return self._compound_request(namespace, identifier, "synonyms", frmt)

    def get_description(self, identifier, namespace="cid", frmt="json"):
        """Return the description for a compound.

        :param identifier: compound identifier
        :param str namespace: identifier type (default ``"cid"``)
        :param str frmt: response format (default ``"json"``)
        :return: dict containing ``InformationList`` with description text

        Example::

            p.get_description(2244)
            p.get_description("aspirin", namespace="name")
        """
        return self._compound_request(namespace, identifier, "description", frmt)

    def get_xrefs(self, identifier, xref_type, namespace="cid", frmt="json"):
        """Return cross-references for a compound.

        :param identifier: compound identifier
        :param xref_type: cross-reference type, one of ``"RegistryID"``,
            ``"RN"``, ``"PubMedID"``, ``"MMDBID"``, ``"PatentID"``,
            ``"WikipediaURL"``, ``"GeneID"``, etc. See
            :data:`~bioservices.pubchem.XREF_TYPES` for the full list. Either
            a (comma-separated) string or an iterable of type names.
        :param str namespace: identifier type (default ``"cid"``).
            ``"smiles"`` and ``"inchi"`` identifiers are sent via POST.
        :param str frmt: response format (default ``"json"``)
        :return: dict containing cross-reference list

        Example::

            p.get_xrefs(2244, "PatentID")
        """
        if not isinstance(xref_type, str):
            xref_type = ",".join(xref_type)
        return self._compound_request(namespace, identifier, f"xrefs/{xref_type}", frmt)

    # ------------------------------------------------------------------
    # Compound cross-domain links
    # ------------------------------------------------------------------

    def get_sids_by_cid(self, cid, frmt="json"):
        """Return substance IDs (SIDs) deposited for a given compound CID.

        :param cid: PubChem compound identifier
        :param str frmt: response format (default ``"json"``)
        :return: dict with ``IdentifierList`` key containing ``SID`` list

        Example::

            p.get_sids_by_cid(2244)
        """
        return self._get(f"compound/cid/{cid}/sids/{frmt.upper()}", frmt=frmt)

    def get_aids_by_cid(self, cid, frmt="json"):
        """Return assay IDs (AIDs) that tested a given compound CID.

        :param cid: PubChem compound identifier
        :param str frmt: response format (default ``"json"``)
        :return: dict with ``IdentifierList`` key containing ``AID`` list

        Example::

            p.get_aids_by_cid(2244)
        """
        return self._get(f"compound/cid/{cid}/aids/{frmt.upper()}", frmt=frmt)

    def get_assay_summary(self, cid, frmt="json"):
        """Return a bioactivity summary for a compound.

        :param cid: PubChem compound identifier
        :param str frmt: response format (default ``"json"``)
        :return: dict containing assay summary data

        Example::

            p.get_assay_summary(2244)
        """
        return self._get(f"compound/cid/{cid}/assaysummary/{frmt.upper()}", frmt=frmt)

    # ------------------------------------------------------------------
    # Substance operations
    # ------------------------------------------------------------------

    def get_substance_by_sid(self, sid, frmt="json"):
        """Return the full substance record for a SID.

        :param sid: PubChem substance identifier
        :param str frmt: response format (default ``"json"``)
        :return: full substance record

        Example::

            p.get_substance_by_sid(100)
        """
        return self._get(f"substance/sid/{sid}/{frmt.upper()}", frmt=frmt)

    def get_cids_by_sid(self, sid, frmt="json"):
        """Return compound CIDs standardised from a given substance SID.

        :param sid: PubChem substance identifier
        :param str frmt: response format (default ``"json"``)
        :return: dict with ``IdentifierList`` key containing ``CID`` list

        Example::

            p.get_cids_by_sid(100)
        """
        return self._get(f"substance/sid/{sid}/cids/{frmt.upper()}", frmt=frmt)

    # ------------------------------------------------------------------
    # Assay operations
    # ------------------------------------------------------------------

    def get_assay(self, aid, frmt="json"):
        """Return the full assay record for an AID.

        :param aid: PubChem assay identifier
        :param str frmt: response format (default ``"json"``)
        :return: full assay record

        Example::

            p.get_assay(1)
        """
        return self._get(f"assay/aid/{aid}/{frmt.upper()}", frmt=frmt)

    def get_assay_description(self, aid, frmt="json"):
        """Return the description section of an assay.

        :param aid: PubChem assay identifier
        :param str frmt: response format (default ``"json"``)
        :return: dict containing assay description

        Example::

            p.get_assay_description(1)
        """
        return self._get(f"assay/aid/{aid}/description/{frmt.upper()}", frmt=frmt)

    def get_cids_by_aid(self, aid, frmt="json"):
        """Return CIDs tested in a given assay.

        :param aid: PubChem assay identifier
        :param str frmt: response format (default ``"json"``)
        :return: dict with ``IdentifierList`` key containing ``CID`` list

        Example::

            p.get_cids_by_aid(1)
        """
        return self._get(f"assay/aid/{aid}/cids/{frmt.upper()}", frmt=frmt)

    def get_sids_by_aid(self, aid, frmt="json"):
        """Return SIDs tested in a given assay.

        :param aid: PubChem assay identifier
        :param str frmt: response format (default ``"json"``)
        :return: dict with ``IdentifierList`` key containing ``SID`` list

        Example::

            p.get_sids_by_aid(1)
        """
        return self._get(f"assay/aid/{aid}/sids/{frmt.upper()}", frmt=frmt)

    # ------------------------------------------------------------------
    # Backward compatibility
    # ------------------------------------------------------------------

    def get_compound_by_smiles(self, identifier, frmt="json"):
        """Return CIDs for a SMILES string.

        .. deprecated::
            Use :meth:`get_cids_by_smiles` instead. This method is kept for
            backward compatibility.

        :param str identifier: SMILES string
        :param str frmt: response format (default ``"json"``)
        :return: dict with ``IdentifierList`` key containing ``CID`` list
        """
        return self.get_cids_by_smiles(identifier, frmt=frmt)