# Source code for bioservices.biomart

#
#  This file is part of bioservices software
#
#  Copyright (c) 2013-2017 - EBI-EMBL
#
#  File author(s): Nick Weiner and others
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: https://github.com/cokelaer/bioservices
#  documentation: http://packages.python.org/bioservices
#
##############################################################################
"""This module provides a class :class:`~BioMart` that allows easy access
to the BioMart service.


.. topic:: What is BioMart ?

    :URL: http://www.biomart.org/
    :REST: http://www.biomart.org/martservice.html

    .. highlights::

        The BioMart project provides free software and data services to the
        international scientific community in order to foster scientific collaboration
        and facilitate the scientific discovery process. The project adheres to the open
        source philosophy that promotes collaboration and code reuse.

        -- from BioMart March 2013

.. note:: SOAP and REST are available. We use REST for the wrapping.
"""
from functools import wraps
from io import StringIO

from bioservices import REST, BioServicesError, logger

logger.name = __name__
import pandas as pd

__all__ = ["BioMart"]


def require_host(f):
    """Decorator that runs *f* only when the instance has a host configured.

    The service instance is taken from the first positional argument of the
    call. When its ``host`` attribute is ``None``, an error is logged and
    ``None`` is returned without calling *f*; otherwise the call is forwarded
    unchanged and its result returned.
    """

    @wraps(f)
    def wrapper(*args, **kargs):
        service = args[0]
        if service.host is not None:
            return f(*args, **kargs)
        logger.error("You must set the host (e.g. f.host='www.ensembl.org')")
        return None

    return wrapper


class BioMart(REST):
    r"""Interface to the `BioMart <http://www.biomart.org>`_ service

    BioMart is made of different views. Each view corresponds to a specific **MART**.
    For instance the UniProt service has a `BioMart view <http://www.ebi.ac.uk/uniprot/biomart/martview/>`_.

    The registry can help to find the different services available through
    BioMart.

        >>> from bioservices import *
        >>> s = BioMart()
        >>> ret = s.registry() # to get information about existing services

    The registry is a list of dictionaries. Some aliases are available to get
    all the names or databases::

        >>> s.names      # alias to list of valid service names from registry
        >>> "unimart" in s.names
        True

    Once you selected a view, you will want to select a database associated with
    this view and then a dataset. The datasets can be retrieved as follows::

        >>> s.datasets("prod-intermart_1")  # retrieve datasets available for this mart

    The main issue is how to figure out the database name (here **prod-intermart_1**) ?
    Indeed, from the web site, what you see is the **displayName** and you must
    introspect the registry to get this information. In **BioServices**, we provide
    the :meth:`~bioservices.biomart.BioMart.lookfor` method to help you. For instance, to
    retrieve the database name of **interpro**, type::

        >>> s = BioMart(verbose=False)
        >>> s.lookfor("interpro")
        Candidate:
             database: intermart_1
            MART name: prod-intermart_1
          displayName: INTERPRO (EBI UK)
                hosts: www.ebi.ac.uk

    The display name (INTERPRO) corresponds to the MART name
    prod-intermart_1. Let us use it to retrieve the datasets::

        >>> s.datasets("prod-intermart_1")
        ['protein', 'entry', 'uniparc']

    Now that we have the dataset names, we can select one and build a
    query. Queries are XML that contains the dataset name, some
    attributes and filters. The dataset name is one of the element
    returned by the datasets method. Let us suppose that we want to query
    **protein**, we need to add this dataset to the query::

        >>> s.add_dataset_to_xml("protein")

    Then, you can add attributes (one of the keys of the dictionary
    returned by attributes("protein"))::

        >>> s.add_attribute_to_xml("protein_accession")

    Optional filters can be used::

        >>> s.add_filter_to_xml("protein_length_greater_than", 1000)

    Finally, you can retrieve the XML query::

        >>> xml_query = s.get_xml()

    and send the request to biomart::

        >>> res = s.query(xml_query)
        >>> len(res)
        12801
        # print the first 10 accession numbers
        >>> res = res.split("\n")
        >>> for x in res[0:10]: print(x)
        ['P18656',
         'Q81998',
         'O09585',
         'O77624',
         'Q9R3A1',
         'E7QZH5',
         'O46454',
         'Q9T3F4',
         'Q9TCA3',
         'P72759']


    REACTOME example::

        s.lookfor("reactome")
        s.datasets("REACTOME")
        ['interaction', 'complex', 'reaction', 'pathway']

        s.new_query()
        s.add_dataset_to_xml("pathway")
        s.add_filter_to_xml("species_selection", "Homo sapiens")
        s.add_attribute_to_xml("pathway_db_id")
        s.add_attribute_to_xml("_displayname")
        xmlq = s.get_xml()
        res = s.query(xmlq)

    .. note:: the biomart service is slow (in my experience, 2013-2014) so please be patient...

    """
    _xml_example = """<?xml version="1.0" encoding="UTF-8"?>
                <!DOCTYPE Query>
                    <Query virtualSchemaName="default" formatter="CSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6">
                    <Dataset name="mmusculus_gene_ensembl" interface="default">
                    <Filter name="ensembl_gene_id" value="ENSMUSG00000086981"/>
                    <Attribute name="ensembl_gene_id"/>
                    <Attribute name="ensembl_transcript_id"/>
                    <Attribute name="transcript_start"/>
                    <Attribute name="transcript_end"/>
                    <Attribute name="exon_chrom_start"/>
                    <Attribute name="exon_chrom_end"/>
                    </Dataset>
                    </Query>"""

    def __init__(self, host=None, verbose=False, cache=False, secure=False):
        """.. rubric:: Constructor

        The URLs required to use BioMart change quite often. Experience has
        shown that the BioMart class in BioServices may fail. This is usually
        not a BioServices issue but the consequence of API changes on the
        server side.

        Let us take the example of the ensembl biomart. The host is

            www.ensembl.org

        Note that there is no prefix *http*: the service URL is built
        internally as ``http(s)://<host>/biomart/martservice``.

        When no host is given, a short list of default Ensembl hosts is tried
        in order and the first one accepted by the ``host`` setter is used.
        Servers may be busy or take lots of time to initialise (if many MARTS
        are available), so if you know which MART you need, providing a
        specific host (e.g., www.ensembl.org) is recommended.

        :param str host: a valid host (e.g. "www.ensembl.org", "gramene.org").
            If ``None``, the default hosts are tried one after the other.
        :param bool verbose: forwarded to the :class:`REST` constructor.
        :param bool cache: forwarded to the :class:`REST` constructor.
        :param bool secure: if ``True``, the service URL uses ``https``
            instead of ``http``.
        :raises IOError: if no host is provided and none of the default hosts
            could be set.

        List of databases are available in this webpage http://www.biomart.org/community.html

        """
        super(BioMart, self).__init__("BioMart", url=None, verbose=verbose, cache=cache, url_defined_later=True)

        # Caches filled on demand by the corresponding properties.
        self._names = None
        self._marts = None
        self._databases = None
        self._display_names = None
        self._valid_attributes = None
        self._hosts = None
        # _host and _secure must exist before the ``host`` setter is used below.
        self._host = None
        self._secure = secure

        if host is None:
            default_hosts = ["www.ensembl.org", "asia.ensembl.org", "useast.ensembl.org"]
            for candidate in default_hosts:
                # The setter leaves self.host at None when the candidate is rejected.
                self.host = candidate
                if self.host is not None:
                    break
            else:
                raise IOError(
                    "no host provided and none of the default hosts {} is reachable".format(default_hosts)
                )
        else:
            self.host = host
        self._biomartQuery = BioMartQuery()

[docs]    def custom_query(self, **args):
        self._biomartQuery = BioMartQuery(**args)

    def _get_host(self):
        return self._host

    def _set_host(self, host):
        """Setter of the ``host`` property.

        Builds ``http(s)://<host>/biomart/martservice`` (``https`` when the
        instance was created with ``secure=True``) and sends a HEAD request
        to it. The host is accepted only if the answer has status code 200;
        in that case the host and service URL are stored, the cached registry
        data of any previous host is dropped and the caches are warmed up
        again. Otherwise a warning is logged and the current host is left
        untouched.

        :param str host: host name without the ``http`` prefix
            (e.g. "www.ensembl.org").
        """
        import requests

        scheme = "https" if self._secure else "http"
        url = "{}://{}/biomart/martservice".format(scheme, host)
        try:
            response = requests.head(url)
        except requests.exceptions.RequestException as err:
            # A connection failure means "not reachable"; it must not abort
            # callers that try several hosts in a row.
            self.logging.warning("host {} is not reachable ({})".format(host, err))
            return
        if response.status_code != 200:
            self.logging.warning("host {} is not reachable ".format(host))
            return

        self._host = host
        self.url = url
        # Cached registry data belongs to the previous host: reset it so that
        # _init() and the properties query the new host.
        self._names = None
        self._marts = None
        self._databases = None
        self._display_names = None
        self._valid_attributes = None
        self._hosts = None
        self._init()

    host = property(_get_host, _set_host, doc="host of the BioMart service in use (None if not set)")

    def _set_format(self, format):
        self.format = format

    def _init(self):
        temp = self.logging.level
        self.logging.setLevel("ERROR")
        _ = self.lookfor("uniprot", verbose=False)
        _ = self.valid_attributes
        self.logging.setLevel(temp)

[docs]    @require_host
    def registry(self):
        """to retrieve registry information

         the XML contains list of children called MartURLLocation made
         of attributes. We parse the xml to return a list of dictionary.
         each dictionary correspond to one MART.

        aliases to some keys are provided: names, databases, displayNames

        """
        ret = self.http_get("?type=registry", frmt="xml")
        import xml.etree.ElementTree as ET

        root = ET.fromstring(ret)
        ret = [x.attrib for x in root]
        return ret

[docs]    @require_host
    def datasets(self, mart, raw=False):
        """to retrieve datasets available for a mart:

        :param str mart: e.g. ensembl. see :attr:`names` for a list of valid
            MART names the mart is the database. see lookfor method or
            databases attributes

        >>> s = BioMart(verbose=False)
        >>> s.datasets("prod-intermart_1")
        ['protein', 'entry', 'uniparc']

        """
        if mart not in self.names:
            raise BioServicesError("Provided mart name (%s) is not valid. see 'names' attribute" % mart)
        ret = self.http_get("?type=datasets&mart=%s" % mart, frmt="txt")

        if raw is False:
            try:
                ret2 = [x.split("\t") for x in ret.split("\n") if len(x.strip())]
                ret = [x[1] for x in ret2]
            except Exception:
                ret = ["?"]
        return ret

[docs]    def get_datasets(self, mart):
        """Retrieve datasets with description"""
        if mart not in self.names:
            raise BioServicesError("Provided mart name (%s) is not valid. see 'names' attribute" % mart)

        ret = self.http_get("?type=datasets&mart=%s" % mart, frmt="txt")
        df = pd.read_csv(
            StringIO(ret),
            sep="\t",
            header=None,
            usecols=[1, 2],
            names=["name", "description"],
        )
        return df

[docs]    @require_host
    def attributes(self, dataset):
        """to retrieve attributes available for a dataset:

        :param str dataset: e.g. oanatinus_gene_ensembl

        """
        # assert dataset in self.names
        if dataset not in [x for k in self.valid_attributes.keys() for x in self.valid_attributes[k]]:
            raise ValueError("provided dataset (%s) is not found. see valid_attributes" % dataset)
        ret = self.http_get("?type=attributes&dataset=%s" % dataset, frmt="txt")

        ret = [x for x in ret.split("\n") if len(x)]
        results = {}
        for line in ret:
            key = line.split("\t")[0]
            results[key] = line.split("\t")[1:]
        return results

[docs]    @require_host
    def filters(self, dataset):
        r"""to retrieve filters available for a dataset:

        :param str dataset: e.g. oanatinus_gene_ensembl

        ::

            >>> s.filters("uniprot").split("\n")[1].split("\t")
            >>> s.filters("pathway")["species_selection"]
            [Arabidopsis thaliana,Bos taurus,Caenorhabditis elegans,Canis familiaris,Danio
            rerio,Dictyostelium discoideum,Drosophila melanogaster,Escherichia coli,Gallus
            gallus,Homo sapiens,Mus musculus,Mycobacterium tuberculosis,Oryza
            sativa,Plasmodium falciparum,Rattus norvegicus,Saccharomyces
            cerevisiae,Schizosaccharomyces pombe,Staphylococcus aureus N315,Sus
            scrofa,Taeniopygia guttata ,Xenopus tropicalis]

        """
        valid_attributes = sorted([x for k in self.valid_attributes.keys() for x in self.valid_attributes[k]])
        if dataset not in valid_attributes:
            raise ValueError(f"provided dataset ({dataset}) is not found. see valid_attributes {valid_attributes}")
        ret = self.http_get("?type=filters&dataset=%s" % dataset, frmt="txt")
        ret = [x for x in ret.split("\n") if len(x)]
        results = {}
        for line in ret:
            key = line.split("\t")[0]
            results[key] = line.split("\t")[1:]
        return results

[docs]    @require_host
    def configuration(self, dataset):
        """to retrieve configuration available for a dataset:

        :param str dataset: e.g. oanatinus_gene_ensembl

        """
        ret = self.http_get("?type=configuration&dataset=%s" % dataset, frmt="xml")
        import bs4

        return bs4.BeautifulSoup(ret, "xml")

[docs]    @require_host
    def version(self, mart):
        """Returns version of a **mart**

        :param str mart: e.g. ensembl

        """
        ret = self.http_get("?type=version&mart=%s" % mart, frmt="xml")
        return ret.strip()

[docs]    @require_host
    def new_query(self):
        """Reset the current query, clearing all datasets, filters and attributes.

        Call this before building a new BioMart XML query from scratch.

        Example::

            >>> from bioservices import BioMart
            >>> s = BioMart(host="www.ensembl.org", verbose=False)
            >>> s.new_query()
            >>> s.add_dataset_to_xml("mmusculus_gene_ensembl")
            >>> s.add_attribute_to_xml("ensembl_gene_id")
            >>> xml = s.get_xml()

        """
        self._biomartQuery.reset()

[docs]    @require_host
    def query(self, xmlq):
        """Send a query to biomart

        The query must be formatted in a XML format which looks like (
        example from https://gist.github.com/keithshep/7776579)::

            <?xml version="1.0" encoding="UTF-8"?>
                <!DOCTYPE Query>
                    <Query virtualSchemaName="default" formatter="CSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6">
                    <Dataset name="mmusculus_gene_ensembl" interface="default">
                    <Filter name="ensembl_gene_id" value="ENSMUSG00000086981"/>
                    <Attribute name="ensembl_gene_id"/>
                    <Attribute name="ensembl_transcript_id"/>
                    <Attribute name="transcript_start"/>
                    <Attribute name="transcript_end"/>
                    <Attribute name="exon_chrom_start"/>
                    <Attribute name="exon_chrom_end"/>
                    </Dataset>
                    </Query>

        .. warning:: the input XML must be valid. THere is no validation made
            in thiss method.
        """
        ret = self.http_post(None, frmt=None, data={"query": xmlq.strip()}, headers={})
        return ret

[docs]    @require_host
    def add_attribute_to_xml(self, name, dataset=None):
        attr = self.create_attribute(name, dataset)
        self._biomartQuery.add_attribute(attr)

[docs]    @require_host
    def add_filter_to_xml(self, name, value, dataset=None):
        filt = self.create_filter(name, value, dataset)
        self._biomartQuery.add_filter(filt)

[docs]    @require_host
    def add_dataset_to_xml(self, dataset):
        self.attributes(dataset)
        #    raise BioServicesError("invalid dataset names provided. Check names attribute")
        self._biomartQuery.add_dataset(dataset)

[docs]    @require_host
    def get_xml(self):
        return self._biomartQuery.get_xml()

[docs]    @require_host
    def create_filter(self, name, value, dataset=None):
        if dataset:
            valid_filters = self.filters(dataset).keys()
            if name not in valid_filters:
                raise BioServicesError("Invalid filter name. ")
        if isinstance(value, list):
            value = ",".join(str(v) for v in value)
        value = str(value)
        _filter = ""
        if "=" in value:
            _filter = """        <Filter name = "%s" %s/>""" % (name, value)
        else:
            _filter = """        <Filter name = "%s" value = "%s"/>""" % (name, value)
        return _filter

[docs]    @require_host
    def create_attribute(self, name, dataset=None):
        # s.attributes(dataset)
        # valid dataset
        if dataset:
            valid_attributes = self.attributes(dataset).keys()
            if name not in valid_attributes:
                raise BioServicesError("Invalid attribute name. ")
        attrib = """        <Attribute name = "%s" />""" % name
        return attrib

    @require_host
    def _get_names(self):
        """Return the ``name`` of every registry entry (cached after the first call)."""
        if self._names is None:
            self._names = [entry["name"] for entry in self.registry()]
        return self._names

    # These are MART names (the values accepted by datasets()), not datasets.
    names = property(_get_names, doc="list of valid MART names")

    @require_host
    def _get_displayNames(self):
        """Return the ``displayName`` of every registry entry (cached after the first call)."""
        if self._display_names is None:
            self._display_names = [entry["displayName"] for entry in self.registry()]
        return self._display_names

    displayNames = property(_get_displayNames, doc="list of MART display names")

    @require_host
    def _get_databases(self):
        """Return the ``database`` of every registry entry (cached after the first call).

        Entries without a ``database`` key are reported as ``"?"``.
        """
        if self._databases is None:
            self._databases = [entry.get("database", "?") for entry in self.registry()]
        return self._databases

    databases = property(_get_databases, doc="list of MART database names")

    @require_host
    def _get_marts(self):
        """Return the registry as a DataFrame (cached after the first call).

        The frame has the columns ``database``, ``displayName`` and ``name``.
        A column missing from the registry is filled with missing values
        instead of raising, in line with the ``databases`` property, which
        also tolerates registry entries without a ``database`` key.
        """
        if self._marts is None:
            columns = ["database", "displayName", "name"]
            # reindex() keeps the requested column order and tolerates a
            # missing column, unlike a plain column selection.
            self._marts = pd.DataFrame(self.registry()).reindex(columns=columns)
        return self._marts

    marts = property(_get_marts, doc="list of marts")

    @require_host
    def _get_hosts(self):
        """Return the ``host`` of every registry entry (cached after the first call).

        Entries without a ``host`` key are reported as ``"?"``.
        """
        if self._hosts is not None:
            return self._hosts
        self._hosts = [entry.get("host", "?") for entry in self.registry()]
        return self._hosts

    hosts = property(_get_hosts, doc="list of valid hosts")

    @require_host
    def _get_valid_attributes(
        self,
    ):
        """Return the dataset names of every MART (cached after the first call).

        One ``?type=datasets`` query per MART name is sent in a single
        ``http_get`` call. For each MART, the second tab-separated field of
        every non-blank line of its answer is kept.

        :return: dictionary mapping each MART name to the list of its
            dataset names, or to ``"?"`` when its answer could not be parsed.
        """
        if self._valid_attributes is not None:
            return self._valid_attributes

        threshold_key = "general.async_threshold"
        # Save the VALUE, not the list: the list is modified in place just
        # below, so keeping a reference to it would not allow restoring it.
        previous_threshold = self.settings.params[threshold_key][0]
        # Presumably keeps this batch of queries below the asynchronous
        # threshold -- TODO confirm against the REST settings.
        self.settings.params[threshold_key][0] = 10000
        try:
            queries = ["?type=datasets&mart=%s" % name for name in self.names]
            results = self.http_get(queries, frmt="txt")
        finally:
            # Restore the user's setting even if the request fails.
            self.settings.params[threshold_key][0] = previous_threshold

        res = {}
        for i, name in enumerate(self.names):
            try:
                res[name] = [x.split("\t")[1] for x in results[i].split("\n") if len(x.strip()) > 1]
            except Exception:
                # Best effort: an unparsable answer is reported as unknown.
                res[name] = "?"
        self._valid_attributes = res
        return self._valid_attributes

    valid_attributes = property(
        _get_valid_attributes, doc="dictionary mapping each MART name to its list of dataset names"
    )

[docs]    @require_host
    def lookfor(self, pattern, verbose=True):
        """Search the registry for MARTs whose name, database, or displayName matches *pattern*.

        :param str pattern: case-insensitive substring to search for
            (e.g., ``"interpro"``, ``"ensembl"``).
        :param bool verbose: if ``True`` (default), print each matching entry.

        Example::

            >>> from bioservices import BioMart
            >>> s = BioMart(host="www.ensembl.org", verbose=False)
            >>> s.lookfor("ensembl")

        """
        for a, x, y, z in zip(self.hosts, self.databases, self.names, self.displayNames):
            found = False
            if pattern.lower() in x.lower():
                found = True
            if pattern.lower() in y.lower():
                found = True
            if pattern.lower() in z.lower():
                found = True
            if found is True and verbose is True:
                print("Candidate:")
                print("     database: %s " % x)
                print("    MART name: %s " % y)
                print("  displayName: %s " % z)
                print("        hosts: %s " % a)


class BioMartQuery(object):
    """Incremental builder of BioMart XML queries.

    A dataset line, filter lines and attribute lines are collected one by
    one; :meth:`get_xml` assembles them between a header and a footer into
    the final XML document.

    Normally used indirectly through :class:`BioMart` helper methods
    (:meth:`~BioMart.add_dataset_to_xml`, :meth:`~BioMart.add_filter_to_xml`,
    :meth:`~BioMart.add_attribute_to_xml`, :meth:`~BioMart.get_xml`).
    """

    def __init__(
        self,
        version="1.0",
        virtualScheme="default",
        formatter="TSV",
        header=0,
        unique=0,
        configVer="0.6",
    ):
        """Prepare the XML header and footer and start with an empty query.

        :param version: value of the XML declaration ``version``.
        :param virtualScheme: value of ``virtualSchemaName``.
        :param formatter: value of ``formatter``.
        :param header: value of ``header``.
        :param unique: value of ``uniqueRows``.
        :param configVer: value of ``datasetConfigVersion``.
        """
        template = (
            '<?xml version="%(version)s" encoding="UTF-8"?>\n'
            "<!DOCTYPE Query>\n"
            '<Query  virtualSchemaName = "%(virtualSchemaName)s" formatter = "%(formatter)s"\n'
            'header = "%(header)s" uniqueRows = "%(uniqueRows)s" count = ""\n'
            'datasetConfigVersion = "%(configVersion)s" >\n'
        )
        self.header = template % {
            "version": version,
            "virtualSchemaName": virtualScheme,
            "formatter": formatter,
            "header": header,
            "uniqueRows": unique,
            "configVersion": configVer,
        }
        self.footer = "    </Dataset>\n</Query>"
        self.reset()

    def add_filter(self, filter):
        """Append an already formatted filter line to the query."""
        self.filters.append(filter)

    def add_attribute(self, attribute):
        """Append an already formatted attribute line to the query."""
        self.attributes.append(attribute)

    def add_dataset(self, dataset):
        """Set the dataset of the query, replacing any dataset set before."""
        self.dataset = """    <Dataset name = "%s" interface = "default" >""" % dataset

    def reset(self):
        """Forget the dataset, the filters and the attributes."""
        self.dataset = None
        self.filters = []
        self.attributes = []

    def get_xml(self):
        """Return the complete XML query.

        The document is made of the header, the dataset line followed by an
        empty line, one line per filter, one line per attribute and the
        footer.

        :raises BioServicesError: if no dataset has been set.
        """
        if self.dataset is None:
            raise BioServicesError("data set must be set. Use add_dataset method")
        pieces = [self.header, self.dataset + "\n\n"]
        pieces.extend(line + "\n" for line in self.filters)
        pieces.extend(line + "\n" for line in self.attributes)
        pieces.append(self.footer)
        return "".join(pieces)