# Source code for bioservices.hgnc

#
#  This file is part of bioservices software
#
#  Copyright (c) 2013-2014 - EBI-EMBL
#
#  File author(s):
#      Thomas Cokelaer <cokelaer@ebi.ac.uk>
#
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: https://github.com/cokelaer/bioservices
#  documentation: http://packages.python.org/bioservices
#
##############################################################################
"""Interface to HUGO/HGNC web services

.. topic:: What is HGNC ?

    :URL: http://www.genenames.org
    :Citation:

    .. highlights::

        "The HUGO Gene Nomenclature Committee (HGNC) has assigned unique gene symbols and
        names to over 37,000 human loci, of which around 19,000 are protein coding.
        genenames.org is a curated online repository of HGNC-approved gene nomenclature
        and associated resources including links to genomic, proteomic and phenotypic
        information, as well as dedicated gene family pages."

        -- From HGNC web site, July 2013


"""
from bioservices import REST
from bioservices.xmltools import bs4
import easydev
from bioservices import logger

# Rename the logger object imported from bioservices after this module.
# NOTE(review): the logger is imported rather than created here, so this
# presumably renames an object shared with other modules -- confirm intent.
logger.name = __name__


# HTTPError lives in urllib.error on Python 3 and in urllib2 on Python 2.
# Only a failed import should trigger the fallback; a bare ``except`` would
# also swallow unrelated errors such as KeyboardInterrupt.
try:
    from urllib.error import HTTPError
except ImportError:
    from urllib2 import HTTPError


# Names exported by ``from bioservices.hgnc import *``.
__all__ = ["HGNC", "HGNCDeprecated"]


class HGNC:
    """Wrapper to the genenames web service

    See details at http://www.genenames.org/help/rest-web-service-help

    On construction the ``info`` end point is queried once (see
    :meth:`get_info`); its ``searchableFields`` and ``storedFields`` entries
    are exposed as :attr:`searchable_fields` and :attr:`stored_fields`.

    :param bool verbose: forwarded to the underlying :class:`REST` instance.
    :param bool cache: forwarded to the underlying :class:`REST` instance.

    """

    def __init__(self, verbose=False, cache=False):
        url = "http://rest.genenames.org/"
        self.services = REST("HGNC", url=url, verbose=verbose, cache=cache)

        # The list of valid field names is provided by the service itself and
        # is used later to validate the ``database`` arguments.
        self._info = self.get_info()
        self.searchable_fields = self._info["searchableFields"]
        self.stored_fields = self._info["storedFields"]

    def _get(self, path, frmt):
        """Send a GET request for *path*, asking for the *frmt* content type."""
        headers = self.services.get_headers(content=frmt)
        return self.services.http_get(path, frmt=frmt, headers=headers)

    def get_info(self, frmt="json"):
        """Request information about the service

        Fields are when the server was last updated (lastModified),
        the number of documents (numDoc), which fields can be queried
        using search and fetch (searchableFields) and which fields may
        be returned by fetch (storedFields).

        :param str frmt: requested content type (default ``json``).
        :return: the response of the ``info`` end point.

        """
        return self._get("info", frmt)

    def fetch(self, database, query, frmt="json"):
        """Retrieve particular records from a searchable field

        Returned object is a json object with fields as in
        :attr:`stored_fields`, which is returned from :meth:`get_info` method.

        Only one query at a time. No wild cards are accepted.
        ::

            >>> h = HGNC()
            >>> h.fetch('symbol', 'ZNF3')
            >>> h.fetch('alias_name', 'A-kinase anchor protein, 350kDa')

        :param str database: one of :attr:`searchable_fields`.
        :param str query: the value to look for in that field.
        :param str frmt: requested content type (default ``json``).
        :return: the response of the ``fetch`` end point.
        """
        easydev.check_param_in_list(database, self.searchable_fields)
        return self._get("fetch/{0}/{1}".format(database, query), frmt)

    def search(self, database_or_query=None, query=None, frmt="json"):
        """Search a searchable field (database) for a pattern

        The search request is more powerful than fetch for querying the
        database, but search only returns the fields hgnc_id, symbol and
        score. This is because this tool is mainly intended to query the server
        to find possible entries of interest or to check data (such as your own
        symbols) rather than to fetch information about the genes. If you want
        to retrieve all the data for a set of genes from the search result, the
        user could use the hgnc_id returned by search to then fire off a fetch
        request by hgnc_id.

        :param str database_or_query: the field (database) to search when
            *query* is also given; otherwise the query itself, which is then
            searched in all fields.
        :param str query: the pattern to search for. If it is the only
            argument provided, all fields are searched.
        :param str frmt: requested content type (default ``json``).
        :return: the response of the ``search`` end point.
        :raises ValueError: if neither *database_or_query* nor *query* is
            provided.

        ::

            # Search all searchable fields for the term BRAF
            h.search('BRAF')

            # Return all records that have symbols that start with ZNF
            h.search('symbol', 'ZNF*')

            # Return all records that have symbols that start with ZNF
            # followed by one and only one character (e.g. ZNF3)
            # Nov 2015: does not work, neither here nor in the
            # official documentation
            h.search('symbol', 'ZNF?')

            # search for symbols starting with ZNF that have been approved
            # by HGNC
            h.search('symbol', 'ZNF*+AND+status:Approved')

            # return ZNF3 and ZNF12
            h.search('symbol', 'ZNF3+OR+ZNF12')

            # Return all records that have symbols that start with ZNF which
            # are not approved (ie entry withdrawn)
            h.search('symbol', 'ZNF*+NOT+status:Approved')

        """
        if database_or_query is None and query is None:
            raise ValueError("you must provide at least one parameter")

        if database_or_query is None or query is None:
            # Exactly one value was given (positionally or as ``query=``):
            # treat it as the pattern and search every field.
            pattern = query if database_or_query is None else database_or_query
            url = "search/{0}".format(pattern)
        else:
            database = database_or_query
            easydev.check_param_in_list(database, self.searchable_fields)
            url = "search/{0}/{1}".format(database, query)

        return self._get(url, frmt)


class HGNCDeprecated(REST):
    """Interface to the `HGNC <http://www.genenames.org>`_ service


    ::

        >>> from bioservices.hgnc import HGNCDeprecated
        >>> # Fetch XML document for gene ZAP70
        >>> s = HGNCDeprecated()
        >>> xml = s.get_xml("ZAP70")
        >>> # You can fetch several gene names:
        >>> xml = s.get_xml("ZAP70;INSR")
        >>> # Wrong gene name request returns an empty document
        >>> s.get_xml("wrong")
        []

    For a single name, the following methods are available::

        >>> # get the aliases of a given gene
        >>> print(s.get_aliases("ZAP70"))
        [u'ZAP-70', u'STD']
        >>> # get UniProt accession code
        >>> s.get_xrefs("ZAP70")['UniProt']['xkey']
        'P43403'
        >>> # get XML link to a UniProt cross-reference
        >>> s.get_xrefs("ZAP70", "xml")['UniProt']['link']
        ['http://www.uniprot.org/uniprot/P43403.xml']


    You can access the links of a cross reference as well::

        values = s.get_xrefs("ZAP70")
        s.on_web(values['EntrezGene']['link'][0])


    :references: http://www.avatar.se/HGNC/doc/tutorial.html

    .. warning:: this may not be the official service.

    """

    def __init__(self, verbose=False, cache=False):
        url = "http://www.avatar.se/HGNC/wr/"
        # The first argument of super() must be this class: using another,
        # unrelated class raises TypeError at construction time.
        super(HGNCDeprecated, self).__init__("HGNC", url=url, verbose=verbose, cache=cache)
        self.logging.warning("Service unavailable when testing (Aug 2014). May not work")

        self._always_return_list = False

        # FIXME
        #: Force XML to be checked for unicode consistency see :class:`Service`
        # self._fixing_unicode = True
        # self._fixing_encoding = "utf-8"

    def _set_return(self, mode):
        """Store the boolean *mode* in the ``_always_return_list`` flag."""
        assert mode in [False, True]
        self._always_return_list = mode

    def _get_return(self):
        """Return the flag stored by :meth:`_set_return`."""
        return self._always_return_list

    def get_xml(self, gene):
        """Returns XML of a single gene or list of genes


        :param str gene: a valid gene name. Several gene names can be
            concatenated with the semicolon ``;`` character
            (e.g., 'ZAP70;INSR')
        :return: for a single gene, the response wrapped by ``easyXML``; for
            several genes, the response as returned by ``http_get``. If an
            ``HTTPError`` is raised, an empty ``BeautifulSoup`` document is
            returned instead.


        .. doctest::

            >>> from bioservices.hgnc import HGNCDeprecated
            >>> s = HGNCDeprecated()
            >>> res = s.get_xml("ZAP70")
            >>> res.findAll("alias")
            >>> [x.text for x in res.findAll("alias")]
            [u'ZAP-70', u'STD']

        .. seealso:: :meth:`get_aliases`
        """
        try:
            if ";" in gene:
                # NOTE(review): unlike the single-gene branch, this response
                # is not passed through easyXML, yet mapping_all() calls
                # findAll() on it -- confirm what http_get returns here.
                res = self.http_get("genes/%s" % gene)
            else:
                res = self.http_get("gene/%s.xml" % gene)
                res = self.easyXML(res)
        except HTTPError:
            self.logging.critical("!!BioServices HTTPError caught in HGNC. Probably an invalid gene name")

            # Empty document so that callers can still use findAll()
            res = bs4.BeautifulSoup()
        return res

    def _get_attribute(self, gene, attribute):
        """Return the stripped text of the first *attribute* tag of *gene*.

        :raises IndexError: if the XML document contains no such tag.
        """
        res = self.get_xml(gene)
        matches = res.findAll(attribute)
        # Only the first match is reported, even if the tag occurs several
        # times in the document.
        return matches[0].text.strip()

    def get_aliases(self, gene):
        """Get aliases for a single gene name

        :return: list with the text of every ``alias`` tag.
        """
        res = self.get_xml(gene)
        return [x.text for x in res.findAll("alias")]

    def get_name(self, gene):
        """Get name for a single gene name"""
        return self._get_attribute(gene, "name")

    def get_chromosome(self, gene):
        """Get chromosome for a single gene name"""
        return self._get_attribute(gene, "chromosome")

    def get_previous_symbols(self, gene):
        """Get previous symbols for a single gene name"""
        return self._get_attribute(gene, "previous_symbols")

    def get_withdrawn_symbols(self, gene):
        """Get withdrawn symbols for a single gene name"""
        return self._get_attribute(gene, "withdrawn_symbols")

    def get_previous_names(self, gene):
        """Get previous names for a single gene name"""
        return self._get_attribute(gene, "previous_names")

    def get_xrefs(self, gene, keep="html"):
        """Get the cross references for a given single gene name

        :param str gene: a single gene name.
        :param keep: only links whose ``format`` attribute equals this value
            are kept in the ``link`` entry of each cross reference.
        :return: dictionary keyed by database name (``xdb`` attribute). Each
            value holds the attributes of the cross reference plus a ``link``
            list.

        ::


            >>> databases = s.get_xrefs("ZAP70").keys()

            >>> # get XML link to a UniProt cross-reference
            >>> s.get_xrefs("ZAP70", "xml")['UniProt']['link']
            ['http://www.uniprot.org/uniprot/P43403.xml']


        """
        assert keep in ["html", "xml", "fasta", "rdf", "txt", "gff", None]

        xml = self.get_xml(gene)
        return self._get_xref(xml, keep)

    def _get_xref(self, xml, keep):
        """Build the cross-reference dictionary out of a parsed document.

        :param xml: object providing ``findAll`` (document or ``gene`` tag).
        :param keep: link format to retain (compared with the ``format``
            attribute of each ``link`` tag).
        :return: dictionary ``{xdb: attributes + {"link": [hrefs]}}``.
        """
        # Scan the document only once instead of once per reference.
        xrefs = xml.findAll("xref")

        # First <xref> element found for each database: the links are always
        # read from that one, even if the database appears several times.
        first_by_db = {}
        for xref in xrefs:
            first_by_db.setdefault(xref.attrs["xdb"], xref)

        values = {}
        for xref in xrefs:
            db = xref.attrs["xdb"]
            # When a database appears several times, the attributes of the
            # last occurrence win.
            entry = xref.attrs.copy()
            # Keep only the 'xlink:href' of the links in the requested format
            entry["link"] = [
                link.attrs["xlink:href"]
                for link in first_by_db[db].findAll("link")
                if link.attrs["format"] == keep
            ]
            values[db] = entry

        return values

    def _query_index(self, search, value):
        """Query the gene index and return the attributes of each ``gene`` tag."""
        params = {"search": search, "value": value}
        # note the extra s before ;index.xml
        xml = self.http_get("s;index.xml?" + self.urlencode(params))
        xml = self.easyXML(xml)
        return [gene.attrs for gene in xml.findAll("gene")]

    def lookfor(self, pattern):
        """Finds all genes that starts with a given pattern

        :param str pattern: a string. Could be the wild character `*`
        :return: list of dictionary. Each dictionary contains the 'acc',
            'xlink:href' and 'xlink:title' keys


        .. doctest::

            >>> from bioservices.hgnc import HGNCDeprecated
            >>> s = HGNCDeprecated()
            >>> s.lookfor("ZAP")
            [{'acc': 'HGNC:12858',
            'xlink:href': '/HGNC/wr/gene/ZAP70',
            'xlink:title': 'ZAP70'}]


        This function may be used to count the number of entries::


            len(s.lookfor('*'))

        """
        return self._query_index("symbol", pattern)

    def get_all_names(self):
        """Returns all gene names"""
        entries = self.lookfor("*")
        return [entry["xlink:title"] for entry in entries]

    def mapping(self, value):
        """maps an identifier from a database onto HGNC database

        :param str value: a valid DB:id string (e.g. "UniProt:P36888")
        :return: a list of dictionary with the keys 'acc', 'xlink:href', 'xlink:title'

            >>> value = "UniProt:P43403"
            >>> res = s.mapping(value)
            >>> res[0]['xlink:title']
            'ZAP70'
            >>> res[0]['acc']
            'HGNC:12858'


        .. seealso:: :meth:`mapping_all`
        """
        return self._query_index("xref", value)

    def mapping_all(self, entries=None):
        """Retrieves cross references for more than one entry

        :param entries: list of values entries (e.g., returned by the :meth:`lookfor` method.)
            if not provided, this method looks for all entries.
        :returns: dictionary whose keys are the gene symbols and whose values
            are dictionaries of cross references.

        .. warning:: takes 10 minutes

        """
        from math import ceil

        results = {}

        if entries is None:
            print("First, get all entries")
            entries = self.lookfor("*")

        names = [entry["xlink:title"] for entry in entries]

        # split query in sets of 300 names
        dn = 300
        n = int(ceil(len(names) / float(dn)))
        for i in range(n):
            print("Completed ", i + 1, "/", n)
            query = ";".join(names[i * dn : (i + 1) * dn])
            xml = self.get_xml(query)
            for gene in xml.findAll("gene"):
                res = self._get_xref(gene, None)
                # the accession is not stored: it is available from ['HGNC']['xkey']
                name = gene.attrs["symbol"]
                results[name] = res.copy()
        return results