# Source code for bioservices.cog

#
#  This file is part of bioservices software
#
#  Copyright (c) 2013-2023 - EBI-EMBL
#
#  File author(s):
#      Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: https://github.com/cokelaer/bioservices
#  documentation: http://packages.python.org/bioservices
#
##############################################################################
"""Interface to the COG (Clusters of Orthologous Genes) web service

.. topic:: What is COG service?

    :URL: https://www.ncbi.nlm.nih.gov/research/cog/webservices/
    :Citation:

    .. highlights::

        Database of Clusters of Orthologous Genes (COGs)

        -- From COG web site, Jan 2021


"""
from tqdm import tqdm

from bioservices import logger
from bioservices.services import REST

# Label the logger imported from the bioservices package with this module's name.
# NOTE(review): ``logger`` is imported rather than created here, so this
# assignment presumably renames an object shared with other modules -- confirm
# that this is intended.
logger.name = __name__


# Public API of this module: ``from bioservices.cog import *`` exports COG only.
__all__ = ["COG"]


class COG:
    """Interface to the COG service

    Note that in addition to the original COG service from NCBI, this
    interface also helps you in searching for organisms, and retrieves all
    pages in a single command (rather than paginating manually).

    Here is an example of getting the COGs for E. coli. You first need the
    exact matching name. Bioservices provides a helper to search for the
    organism name understood by the COG service (e.g.
    ``Escherichia_coli_K-12_sub_MG1655``, which is not easy to guess)::

        from bioservices import COG
        c = COG()
        c.search_organism('coli')
        # the output of the previous command gives you the name
        c.get_cogs_by_organism('Escherichia_coli_K-12_sub_MG1655')

    """

    _url = "https://www.ncbi.nlm.nih.gov/research/cog/api"

    def __init__(self, verbose=False, cache=False):
        """**Constructor**

        :param bool verbose: forwarded to the underlying REST service.
        :param bool cache: forwarded to the underlying REST service.
        """
        self.services = REST(name="cog", url=COG._url, verbose=verbose, cache=cache)
        # Set to False to disable the progress bar shown while paginating.
        self.show_progress = True

    def _make_progress_bar(self, total):
        """Build the progress bar used while downloading all pages."""
        return tqdm(total=total, disable=not self.show_progress, leave=False)

    def _fetch_page(self, service_name, params, trials=3):
        """Fetch one page, retrying when the answer is not a JSON dictionary.

        :param str service_name: name of the end point (e.g. ``cog``).
        :param dict params: query parameters, including the ``page`` number.
        :param int trials: maximum number of attempts.
        :return: the decoded page (a dictionary).
        :raises TypeError: if no dictionary was obtained after *trials* attempts.
        """
        answer = None
        # sometimes, a 404 is returned instead of the page; try several times.
        for _ in range(trials):
            answer = self.services.http_get(service_name, frmt="json", params=params)
            if isinstance(answer, dict):
                return answer
        raise TypeError(
            f"COG service '{service_name}' did not return a JSON dictionary "
            f"after {trials} attempts (last answer: {answer!r})"
        )

    def _get_all(self, service_name="cog", params=None):
        """Download every page of a query and merge the results.

        :param str service_name: name of the end point (e.g. ``cog``).
        :param dict params: query parameters. The dictionary is not modified;
            any ``page`` entry is ignored since all pages are retrieved.
        :return: the dictionary of the first page whose ``results`` list has
            been extended with the results of all following pages.
        :raises TypeError: if a page could not be retrieved.
        """
        # Work on a copy so that neither the caller's dictionary nor a shared
        # default value is altered by the page counter.
        query = dict(params) if params else {}
        query["page"] = 1

        merged = self._fetch_page(service_name, query)
        pbar = self._make_progress_bar(merged["count"])
        try:
            pbar.update(len(merged["results"]))
            current = merged
            # Only ask for another page when the service advertises one.
            while current.get("next") is not None:
                query["page"] += 1
                current = self._fetch_page(service_name, query)
                merged["results"].extend(current["results"])
                pbar.update(len(current["results"]))
        finally:
            pbar.close()
        return merged

    def _query(self, service_name, params, page=None):
        """Return all pages (``page`` is None) or the requested page only."""
        if page is None:
            return self._get_all(service_name, params=params)
        return self.services.http_get(service_name, frmt="json", params=dict(params, page=page))

    def get_cogs(self, **kwargs):
        """Get COGs.

        Unfortunately, the API sends 10 COGS at a time given a specific page.
        If ``page`` is not provided (or None), all pages are downloaded and
        merged; otherwise only that page is requested.

        :param kwargs: filters understood by the ``cog`` end point, plus an
            optional ``page``.
        :return: dictionary that contains the results, count, previous and
            next page.
        """
        page = kwargs.pop("page", None)
        return self._query("cog", kwargs, page=page)

    def get_cogs_by_gene(self, gene, page=None):
        """Filter COGs by gene tag: MK0280"""
        return self.get_cogs(gene=gene, page=page)

    def get_cogs_by_id(self, cog_id, page=None):
        """Filter COGs by COG ID tag: COG0003"""
        return self.get_cogs(cog=cog_id, page=page)

    def get_cogs_by_assembly_id(self, assembly_id, page=None):
        """Filter COGs by assembly ID: GCA_000007185.1"""
        return self.get_cogs(assembly=assembly_id, page=page)

    def get_cogs_by_organism(self, name, page=None):
        """Filter COGs by organism name: Nitrosopumilus_maritimus_SCM1"""
        return self.get_cogs(organism=name, page=page)

    def get_cogs_by_taxon_id(self, taxon_id, page=None):
        """Filter COGs by taxid: 1229908"""
        return self.get_cogs(taxid=taxon_id, page=page)

    def get_cogs_by_category(self, category, page=None):
        """Filter COGs by Taxonomic Category: ACTINOBACTERIA"""
        return self.get_cogs(category=category, page=page)

    def get_cogs_by_category_id(self, category, page=None):
        """Filter COGs by Taxonomic Category taxid: 651137"""
        return self.get_cogs(cat_taxid=category, page=page)

    def get_cogs_by_protein_name(self, protein, page=None):
        """Filter COGs by Protein name: AJP49128.1"""
        return self.get_cogs(protein=protein, page=page)

    def get_cogs_by_id_and_category(self, cog_id, category, page=None):
        """Filter COGs by COG id and Taxonomy Categories: COG0004 and CYANOBACTERIA"""
        return self.get_cogs(cog=cog_id, category=category, page=page)

    def get_cogs_by_id_and_organism(self, cog_id, organism, page=None):
        """Filter COGs by COG id and organism: COG0004 and Escherichia_coli_K-12_sub_MG1655"""
        # The query key is "organism"; a stray comma in the key would make the
        # request carry an unknown parameter instead of the organism filter.
        return self.get_cogs(cog=cog_id, organism=organism, page=page)

    def get_all_cogs_definition(self, page=None):
        """Get all COG definitions

        If page is set, only that page is returned, otherwise all pages
        are downloaded and merged.
        """
        return self._query("cogdef", {}, page=page)

    def get_cog_definition_by_cog_id(self, cog_id):
        """Get specific COG Definitions by COG: COG0003"""
        return self.services.http_get("cogdef", frmt="json", params={"cog": cog_id})

    def get_cog_definition_by_name(self, cog, page=None):
        """Get specific COG Definitions by name: Thiamin-binding stress-response protein YqgV, UPF0045 family

        If page is set, only that page is returned, otherwise all pages
        are downloaded and merged.
        """
        return self._query("cogdef", {"name": cog}, page=page)

    def get_taxonomic_categories(self, page=None):
        """Get all Taxonomic Categories.

        If page is set, only that page is returned. There are 10 entries per
        page. If page is unset (default), all results are returned.

        ::

            from bioservices import COG
            c = COG()
            names = [x['name'] for x in c.get_taxonomic_categories()['results']]

        """
        return self._query("taxonomy", {}, page=page)

    def get_taxonomic_category_by_name(self, name, page=None):
        """Get specific Taxonomic Category by name

        ::

            c.get_taxonomic_category_by_name("ALPHAPROTEOBACTERIA")

        """
        return self._query("taxonomy", {"name": name}, page=page)

    def search_organism(self, name):
        """Return candidates that match the input name.

        :param str name: search string matched case-insensitively against
            genome names
        :return: list of items. Each item is a dictionary with genome name,
            assembly identifier and taxon identifier.
        """
        # Lower-case both sides so that the match is really case-insensitive.
        needle = name.lower()
        categories = self.get_taxonomic_categories()
        return [
            organism
            for category in categories["results"]
            for organism in category["organisms"]
            if needle in organism["genome_name"].lower()
        ]