#
# This file is part of bioservices software
#
# Copyright (c) 2013-2017 - EBI-EMBL
#
# File author(s): Nick Weiner and others
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# http://www.gnu.org/licenses/gpl-3.0.html
#
# website: https://github.com/cokelaer/bioservices
# documentation: http://packages.python.org/bioservices
#
##############################################################################
"""This module provides a class :class:`~BioMart` that allows an easy access
to all the BioMart services.
.. topic:: What is BioMart ?
:URL: http://www.biomart.org/
:REST: http://www.biomart.org/martservice.html
.. highlights::
The BioMart project provides free software and data services to the
international scientific community in order to foster scientific collaboration
and facilitate the scientific discovery process. The project adheres to the open
source philosophy that promotes collaboration and code reuse.
-- from BioMart March 2013
.. note:: SOAP and REST are available. We use REST for the wrapping.
"""
from io import StringIO
from bioservices import REST, BioServicesError
from functools import wraps
from bioservices import logger
logger.name = __name__
import pandas as pd
__all__ = ["BioMart"]
def require_host(f):
    """Decorator that skips the wrapped callable while no host is set.

    The first positional argument (the instance) is inspected. When its
    ``host`` attribute is ``None``, a hint is printed and ``None`` is
    returned without calling *f*. Otherwise *f* is called with the original
    arguments and its result is returned.
    """

    @wraps(f)
    def wrapper(*args, **kargs):
        instance = args[0]
        if instance.host is not None:
            return f(*args, **kargs)
        print("You must set the host (e.g. f.host='www.ensembl.org' ")
        return None

    return wrapper
[docs]class BioMart(REST):
r"""Interface to the `BioMart <http://www.biomart.org>`_ service
BioMart is made of different views. Each view corresponds to a specific **MART**.
For instance the UniProt service has a `BioMart view <http://www.ebi.ac.uk/uniprot/biomart/martview/>`_.
The registry can help to find the different services available through
BioMart.
>>> from bioservices import *
>>> s = BioMart()
>>> ret = s.registry() # to get information about existing services
The registry is a list of dictionaries. Some aliases are available to get
all the names or databases::
>>> s.names # alias to list of valid service names from registry
>>> "unimart" in s.names
True
Once you selected a view, you will want to select a database associated with
this view and then a dataset. The datasets can be retrieved as follows::
>>> s.datasets("prod-intermart_1") # retrieve datasets available for this mart
The main issue is how to figure out the database name (here **prod-intermart_1**) ?
Indeed, from the web site, what you see is the **displayName** and you must
introspect the registry to get this information. In **BioServices**, we provide
the :meth:`~bioservices.biomart.BioMart.lookfor` method to help you. For instance, to
retrieve the database name of **interpro**, type::
>>> s = BioMart(verbose=False)
>>> s.lookfor("interpro")
Candidate:
database: intermart_1
MART name: prod-intermart_1
displayName: INTERPRO (EBI UK)
hosts: www.ebi.ac.uk
The display name (INTERPRO) corresponds to the MART name
prod-intermart_1. Let us use it to retrieve the datasets::
>>> s.datasets("prod-intermart_1")
['protein', 'entry', 'uniparc']
Now that we have the dataset names, we can select one and build a
query. Queries are XML that contains the dataset name, some
attributes and filters. The dataset name is one of the element
returned by the datasets method. Let us suppose that we want to query
**protein**, we need to add this dataset to the query::
>>> s.add_dataset_to_xml("protein")
Then, you can add attributes (one of the keys of the dictionary
returned by attributes("protein"))::
>>> s.add_attribute_to_xml("protein_accession")
Optional filters can be used::
>>> s.add_filter_to_xml("protein_length_greater_than", 1000)
Finally, you can retrieve the XML query::
>>> xml_query = s.get_xml()
and send the request to biomart::
>>> res = s.query(xml_query)
>>> len(res)
12801
# print the first 10 accession numbers
>>> res = res.split("\n")
>>> for x in res[0:10]: print(x)
['P18656',
'Q81998',
'O09585',
'O77624',
'Q9R3A1',
'E7QZH5',
'O46454',
'Q9T3F4',
'Q9TCA3',
'P72759']
REACTOME example::
s.lookfor("reactome")
s.datasets("REACTOME")
['interaction', 'complex', 'reaction', 'pathway']
s.new_query()
s.add_dataset_to_xml("pathway")
s.add_filter_to_xml("species_selection", "Homo sapiens")
s.add_attribute_to_xml("pathway_db_id")
s.add_attribute_to_xml("_displayname")
xmlq = s.get_xml()
res = s.query(xmlq)
.. note:: the biomart service is slow (in my experience, 2013-2014) so please be patient...
"""
# Sample query document: one dataset (mmusculus_gene_ensembl), one filter on
# ensembl_gene_id and six attributes, using the CSV formatter.
_xml_example = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName="default" formatter="CSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6">
<Dataset name="mmusculus_gene_ensembl" interface="default">
<Filter name="ensembl_gene_id" value="ENSMUSG00000086981"/>
<Attribute name="ensembl_gene_id"/>
<Attribute name="ensembl_transcript_id"/>
<Attribute name="transcript_start"/>
<Attribute name="transcript_end"/>
<Attribute name="exon_chrom_start"/>
<Attribute name="exon_chrom_end"/>
</Dataset>
</Query>"""
def __init__(self, host=None, verbose=False, cache=False, secure=False):
    """.. rubric:: Constructor

    The URLs serving BioMart change fairly often on the server side, so a
    host that worked in the past may stop working.

    When *host* is given it is assigned to :attr:`host`. Otherwise the
    default hosts www.ensembl.org, asia.ensembl.org and useast.ensembl.org
    are tried in that order until one of them is accepted. Passing a host
    you already know (e.g. "www.ensembl.org") avoids probing servers you
    do not need.

    The host is given without the *http* prefix.

    :param str host: a valid host (e.g. "www.ensembl.org", gramene.org)
    :param bool verbose: forwarded to the parent constructor
    :param bool cache: forwarded to the parent constructor
    :param bool secure: stored as ``_secure`` for later use when the
        service URL is built
    :raises IOError: if no host is given and none of the default hosts
        ends up being set
    """
    super(BioMart, self).__init__(
        "BioMart",
        url="undefined",
        verbose=verbose,
        cache=cache,
        url_defined_later=True,
    )

    # Lazily filled caches behind the public properties.
    self._names = None
    self._marts = None
    self._databases = None
    self._display_names = None
    self._valid_attributes = None
    self._hosts = None
    self._host = None
    self._secure = secure

    if host is not None:
        self.host = host
    else:
        hosts = ["www.ensembl.org", "asia.ensembl.org", "useast.ensembl.org"]
        for candidate in hosts:
            # The host setter leaves ``self.host`` at None when the
            # candidate is rejected, so move on to the next one.
            self.host = candidate
            if self.host is not None:
                break
        else:
            raise IOError("no host provided and no default hosts {} not reachable".format(hosts))

    self._biomartQuery = BioMartQuery()
def custom_query(self, **args):
    """Replace the current query builder with a newly configured one.

    :param args: keyword arguments passed unchanged to :class:`BioMartQuery`
    """
    builder = BioMartQuery(**args)
    self._biomartQuery = builder
def _get_host(self):
return self._host
def _set_host(self, host):
import requests
secure = ""
if self._secure:
secure = "s"
url = "http{}://{}/biomart/martservice".format(secure, host)
request = requests.head(url)
if request.status_code in [200]:
self._host = host
self.url = url
self._init()
else:
self.logging.warning("host {} is not reachable ".format(host))
host = property(_get_host, _set_host)
def _set_format(self, format):
self.format = format
def _init(self):
temp = self.logging.level
self.logging.setLevel("ERROR")
_ = self.lookfor("uniprot", verbose=False)
_ = self.valid_attributes
self.logging.setLevel(temp)
@require_host
def registry(self):
    """Retrieve the registry information.

    The ``?type=registry`` response is parsed as XML. Each child of the
    document (a ``MartURLLocation`` element) describes one MART through its
    attributes.

    Aliases to some keys are provided as properties: names, databases,
    displayNames.

    :return: list with the attribute mapping of every child element
    """
    raw = self.http_get("?type=registry", frmt="xml")
    tree = self.easyXML(raw)
    marts = []
    for child in tree.getchildren():
        marts.append(child.attrib)
    return marts
@require_host
def datasets(self, mart, raw=False):
    """Retrieve the datasets available for a mart.

    :param str mart: e.g. ensembl. See :attr:`names` for the list of valid
        MART names; :meth:`lookfor` helps to find one.
    :param bool raw: if left to False, the tab-separated response is
        reduced to the list of values found in the second column of each
        non-blank line. Otherwise the response is returned untouched.
    :return: list of dataset names, ``["?"]`` if the response could not be
        parsed, or the raw response when *raw* is not False
    :raises BioServicesError: if *mart* is not in :attr:`names`

    ::

        >>> s = BioMart(verbose=False)
        >>> s.datasets("prod-intermart_1")
        ['protein', 'entry', 'uniparc']

    """
    if mart not in self.names:
        raise BioServicesError("Provided mart name (%s) is not valid. see 'names' attribute" % mart)
    ret = self.http_get("?type=datasets&mart=%s" % mart, frmt="txt")
    if raw is False:
        try:
            rows = [line.split("\t") for line in ret.split("\n") if line.strip()]
            ret = [row[1] for row in rows]
        except (AttributeError, IndexError):
            # Response is not text, or a line has no second column.
            # Only these parse failures are turned into the placeholder;
            # anything else (e.g. KeyboardInterrupt) now propagates.
            ret = ["?"]
    return ret
@require_host
def get_datasets(self, mart):
    """Retrieve the datasets of a mart together with their description.

    :param str mart: a valid MART name (see :attr:`names`)
    :return: a pandas DataFrame with the columns ``name`` and
        ``description``, read from the second and third tab-separated
        fields of the response
    :raises BioServicesError: if *mart* is not in :attr:`names`
    """
    if mart not in self.names:
        raise BioServicesError("Provided mart name (%s) is not valid. see 'names' attribute" % mart)
    ret = self.http_get("?type=datasets&mart=%s" % mart, frmt="txt")
    # pandas is imported at module level; no local import needed.
    df = pd.read_csv(
        StringIO(ret),
        sep="\t",
        header=None,
        usecols=[1, 2],
        names=["name", "description"],
    )
    return df
@require_host
def attributes(self, dataset):
    """Retrieve the attributes available for a dataset.

    :param str dataset: e.g. oanatinus_gene_ensembl
    :return: dictionary keyed by the first tab-separated field of every
        non-empty response line; the value is the list of remaining fields
    :raises ValueError: if *dataset* is not listed in any entry of
        :attr:`valid_attributes`
    """
    known = self.valid_attributes
    if not any(candidate == dataset for mart in known.keys() for candidate in known[mart]):
        raise ValueError("provided dataset (%s) is not found. see valid_attributes" % dataset)

    text = self.http_get("?type=attributes&dataset=%s" % dataset, frmt="txt")
    results = {}
    for line in text.split("\n"):
        if not line:
            continue
        key, *fields = line.split("\t")
        results[key] = fields
    return results
@require_host
def filters(self, dataset):
    r"""Retrieve the filters available for a dataset.

    :param str dataset: e.g. oanatinus_gene_ensembl
    :return: dictionary keyed by the first tab-separated field of every
        non-empty response line; the value is the list of remaining fields
    :raises ValueError: if *dataset* is not listed in any entry of
        :attr:`valid_attributes`

    ::

        >>> s.filters("pathway")["species_selection"]

    """
    known = self.valid_attributes
    if not any(candidate == dataset for mart in known.keys() for candidate in known[mart]):
        raise ValueError("provided dataset (%s) is not found. see valid_attributes" % dataset)

    text = self.http_get("?type=filters&dataset=%s" % dataset, frmt="txt")
    rows = (line.split("\t") for line in text.split("\n") if line)
    return {row[0]: row[1:] for row in rows}
@require_host
def configuration(self, dataset):
    """Retrieve the configuration available for a dataset.

    :param str dataset: e.g. oanatinus_gene_ensembl
    :return: the ``?type=configuration`` response passed through
        :meth:`easyXML`
    """
    response = self.http_get("?type=configuration&dataset=%s" % dataset, frmt="xml")
    return self.easyXML(response)
@require_host
def version(self, mart):
    """Return the version of a **mart**.

    :param str mart: e.g. ensembl
    :return: ``root.strip()`` of the parsed ``?type=version`` response
    """
    response = self.http_get("?type=version&mart=%s" % mart, frmt="xml")
    parsed = self.easyXML(response)
    return parsed.root.strip()
@require_host
def new_query(self):
    """Start a new query by resetting the internal query builder."""
    builder = self._biomartQuery
    builder.reset()
@require_host
def query(self, xmlq):
    """Send a query to biomart.

    The query is an XML document that looks like this
    (example from https://gist.github.com/keithshep/7776579)::

        <?xml version="1.0" encoding="UTF-8"?>
        <!DOCTYPE Query>
        <Query virtualSchemaName="default" formatter="CSV" header="0" uniqueRows="0" count="" datasetConfigVersion="0.6">
        <Dataset name="mmusculus_gene_ensembl" interface="default">
        <Filter name="ensembl_gene_id" value="ENSMUSG00000086981"/>
        <Attribute name="ensembl_gene_id"/>
        <Attribute name="ensembl_transcript_id"/>
        <Attribute name="transcript_start"/>
        <Attribute name="transcript_end"/>
        <Attribute name="exon_chrom_start"/>
        <Attribute name="exon_chrom_end"/>
        </Dataset>
        </Query>

    .. warning:: the input XML must be valid. No validation is performed
        in this method.

    :param str xmlq: the XML query; surrounding whitespace is stripped
        before it is posted as the ``query`` form field
    :return: whatever ``http_post`` returns for the request
    """
    payload = {"query": xmlq.strip()}
    return self.http_post(None, frmt=None, data=payload, headers={})
@require_host
def add_attribute_to_xml(self, name, dataset=None):
    """Build an attribute element and append it to the current query.

    :param str name: attribute name
    :param str dataset: optional dataset, forwarded to
        :meth:`create_attribute`
    """
    self._biomartQuery.add_attribute(self.create_attribute(name, dataset))
@require_host
def add_filter_to_xml(self, name, value, dataset=None):
    """Build a filter element and append it to the current query.

    :param str name: filter name
    :param value: filter value, forwarded to :meth:`create_filter`
    :param str dataset: optional dataset, forwarded to
        :meth:`create_filter`
    """
    self._biomartQuery.add_filter(self.create_filter(name, value, dataset))
@require_host
def add_dataset_to_xml(self, dataset):
    """Set the dataset of the current query.

    :meth:`attributes` is called first and its result discarded, so any
    error it raises for *dataset* propagates before the query is touched.

    :param str dataset: dataset name
    """
    _ = self.attributes(dataset)
    builder = self._biomartQuery
    builder.add_dataset(dataset)
@require_host
def get_xml(self):
    """Return the XML text of the query built so far."""
    builder = self._biomartQuery
    return builder.get_xml()
@require_host
def create_filter(self, name, value, dataset=None):
    """Return the XML ``Filter`` element for *name* and *value*.

    :param str name: filter name
    :param value: filter value. Non-string values (e.g. the integer
        ``1000``) are converted with ``str``. If the text contains ``=``
        it is inserted verbatim as the attribute part of the element;
        otherwise it becomes the ``value`` attribute.
    :param str dataset: if provided, *name* must be one of the keys
        returned by :meth:`filters` for that dataset
    :return: the element as a string
    :raises BioServicesError: if *dataset* is given and *name* is not one
        of its filters
    """
    if dataset:
        valid_filters = self.filters(dataset).keys()
        if name not in valid_filters:
            raise BioServicesError("Invalid filter name. ")

    # Numeric values are legitimate filter values; without this conversion
    # the membership test below raises TypeError on an int.
    value = str(value)
    if "=" in value:
        return """ <Filter name = "%s" %s/>""" % (name, value)
    return """ <Filter name = "%s" value = "%s"/>""" % (name, value)
@require_host
def create_attribute(self, name, dataset=None):
    """Return the XML ``Attribute`` element for *name*.

    :param str name: attribute name
    :param str dataset: if provided, *name* must be one of the keys
        returned by :meth:`attributes` for that dataset
    :return: the element as a string
    :raises BioServicesError: if *dataset* is given and *name* is not one
        of its attributes
    """
    if dataset and name not in self.attributes(dataset).keys():
        raise BioServicesError("Invalid attribute name. ")
    return """ <Attribute name = "%s" />""" % name
@require_host
def _get_names(self):
    """Return the ``name`` entry of every registry item (cached)."""
    if self._names is None:
        # The comprehension already builds a new list; no extra copy needed.
        self._names = [entry["name"] for entry in self.registry()]
    return self._names


names = property(_get_names, doc="list of valid MART names")
@require_host
def _get_displayNames(self):
    """Return the ``displayName`` entry of every registry item (cached)."""
    if self._display_names is None:
        # The comprehension already builds a new list; no extra copy needed.
        self._display_names = [entry["displayName"] for entry in self.registry()]
    return self._display_names


displayNames = property(_get_displayNames, doc="list of MART display names")
@require_host
def _get_databases(self):
    """Return the ``database`` entry of every registry item (cached).

    Registry items without a ``database`` key contribute ``"?"``.
    """
    if self._databases is None:
        self._databases = [entry.get("database", "?") for entry in self.registry()]
    return self._databases


databases = property(_get_databases, doc="list of MART database names")
@require_host
def _get_marts(self):
    """Return the registry as a DataFrame (cached).

    Only the ``database``, ``displayName`` and ``name`` columns are kept.
    """
    if self._marts is not None:
        return self._marts
    columns = ["database", "displayName", "name"]
    self._marts = pd.DataFrame(self.registry())[columns]
    return self._marts


marts = property(_get_marts, doc="list of marts")
@require_host
def _get_hosts(self):
    """Return the ``host`` entry of every registry item (cached).

    Registry items without a ``host`` key contribute ``"?"``.
    """
    if self._hosts is not None:
        return self._hosts
    self._hosts = [entry.get("host", "?") for entry in self.registry()]
    return self._hosts


hosts = property(_get_hosts, doc="list of valid hosts")
@require_host
def _get_valid_attributes(self):
    """Return the datasets of every MART, keyed by MART name (cached).

    One ``?type=datasets`` query per entry of :attr:`names` is sent in a
    single ``http_get`` call. For each MART the value is the list of
    second tab-separated fields of its response lines, or the placeholder
    string ``"?"`` when that response could not be parsed.

    :return: dictionary mapping MART name to its dataset names
    """
    if self._valid_attributes is None:
        names = self.names
        queries = ["?type=datasets&mart=%s" % name for name in names]

        # NOTE(review): raising the threshold presumably keeps http_get from
        # switching to its asynchronous code path for this list of queries
        # (an earlier comment said that path was not Python 3 compatible);
        # confirm against the REST base class.
        threshold = self.settings.params["general.async_threshold"]
        previous = threshold[0]
        threshold[0] = 10000
        try:
            results = self.http_get(queries, frmt="txt")
        finally:
            # Restore the saved *value*. Keeping only a reference to the
            # settings entry would hand back the already-mutated list.
            threshold[0] = previous

        res = {}
        for i, name in enumerate(names):
            try:
                res[name] = [x.split("\t")[1] for x in results[i].split("\n") if len(x.strip()) > 1]
            except (AttributeError, IndexError, TypeError):
                # Non-text or missing response, or a line without a second
                # column: keep the historical "?" placeholder.
                res[name] = "?"
        self._valid_attributes = res
    return self._valid_attributes


valid_attributes = property(_get_valid_attributes, doc="dictionary mapping each MART name to its valid datasets")
@require_host
def lookfor(self, pattern, verbose=True):
    """Print the registry entries whose names contain *pattern*.

    The match is a case-insensitive substring test against the database,
    the MART name and the display name of every entry. Nothing is
    returned; matching entries are only printed, and only when *verbose*
    is True.

    :param str pattern: text to look for
    :param bool verbose: print the candidates found
    """
    entries = zip(self.hosts, self.databases, self.names, self.displayNames)
    for host, database, name, display_name in entries:
        needle = pattern.lower()
        # A list (not a generator) so that every field is lowered on each
        # pass, whether or not an earlier field already matched.
        matched = any([needle in field.lower() for field in (database, name, display_name)])
        if not (matched and verbose is True):
            continue
        print("Candidate:")
        print(" database: %s " % database)
        print(" MART name: %s " % name)
        print(" displayName: %s " % display_name)
        print(" hosts: %s " % host)
class BioMartQuery(object):
    """Incremental builder for the XML text of a BioMart query.

    The document is made of a header (fixed at construction time), one
    dataset element, any number of filter and attribute lines, and a
    footer closing the dataset and the query.
    """

    def __init__(
        self,
        version="1.0",
        virtualScheme="default",
        formatter="TSV",
        header=0,
        unique=0,
        configVer="0.6",
    ):
        """Prepare the header and footer and start with an empty query.

        :param version: value of the XML declaration ``version``
        :param virtualScheme: value of ``virtualSchemaName``
        :param formatter: value of ``formatter``
        :param header: value of ``header``
        :param unique: value of ``uniqueRows``
        :param configVer: value of ``datasetConfigVersion``
        """
        params = dict(
            version=version,
            virtualSchemaName=virtualScheme,
            formatter=formatter,
            header=header,
            uniqueRows=unique,
            configVersion=configVer,
        )
        template = """<?xml version="%(version)s" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "%(virtualSchemaName)s" formatter = "%(formatter)s"
header = "%(header)s" uniqueRows = "%(uniqueRows)s" count = ""
datasetConfigVersion = "%(configVersion)s" >\n"""
        self.header = template % params
        self.footer = " </Dataset>\n</Query>"
        self.reset()

    def add_filter(self, filter):
        """Append an already formatted filter line to the query."""
        self.filters.append(filter)

    def add_attribute(self, attribute):
        """Append an already formatted attribute line to the query."""
        self.attributes.append(attribute)

    def add_dataset(self, dataset):
        """Set the dataset element from the dataset name *dataset*."""
        self.dataset = """ <Dataset name = "%s" interface = "default" >""" % dataset

    def reset(self):
        """Forget the dataset and every filter and attribute."""
        self.dataset = None
        self.filters = []
        self.attributes = []

    def get_xml(self):
        """Return the complete XML text of the query.

        :raises BioServicesError: if no dataset has been set
        """
        if self.dataset is None:
            raise BioServicesError("data set must be set. Use add_dataset method")
        pieces = [self.header, self.dataset + "\n\n"]
        pieces.extend(line + "\n" for line in self.filters)
        pieces.extend(line + "\n" for line in self.attributes)
        pieces.append(self.footer)
        return "".join(pieces)