#
# This file is part of bioservices software
#
# Copyright (c) 2013-2014 - EMBL-EBI
#
# File author(s):
#
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# http://www.gnu.org/licenses/gpl-3.0.html
#
# website: https://github.com/cokelaer/bioservices
# documentation: http://packages.python.org/bioservices
#
##############################################################################
"""Interface to the NCBI BLAST URL API
.. topic:: What is the NCBI BLAST URL API?
:URL: https://blast.ncbi.nlm.nih.gov/
:API: https://ncbi.github.io/blast-cloud/dev/api.html
.. highlights::
"NCBI BLAST finds regions of similarity between biological sequences.
The program compares nucleotide or protein sequences to sequence
databases and calculates the statistical significance."
-- NCBI BLAST documentation
Unlike :class:`~bioservices.ncbiblast.NCBIblast`, which wraps the EBI
mirror of NCBI BLAST, this class submits jobs **directly to NCBI's own
BLAST server**. This gives access to NCBI databases (``nt``, ``nr``,
``refseq_genomic``, …) under their native GenBank accession format,
with no EBI-specific database name translation required.
Typical usage::
from bioservices import NCBIBlastAPI
b = NCBIBlastAPI()
rid, rtoe = b.run(
program="blastn",
database="nt",
sequence="ATGAAAGCAATTTTCGTACTGAAAGGTTTT",
email="you@example.org",
)
b.wait(rid, rtoe)
xml = b.get_result(rid) # BLAST XML string
"""
import re
import time
from bioservices import logger
from bioservices.services import REST
logger.name = __name__
__all__ = ["NCBIBlastAPI"]
# NCBI recommends ≤3 requests/sec without an API key, ≤10/sec with one.
_MIN_POLL_INTERVAL = 15 # seconds — NCBI asks clients not to poll more often
[docs]class NCBIBlastAPI:
"""Interface to NCBI BLAST via NCBI's own URL API.
Jobs are submitted with :meth:`run`, polled with :meth:`get_status` or
:meth:`wait`, and results retrieved with :meth:`get_result`.
:param bool verbose: print debug messages (default False).
:param api_key: NCBI API key. Raises the rate limit from 3 to 10
requests per second. Obtain one at
https://www.ncbi.nlm.nih.gov/account/
Common databases
----------------
===================== ================================================
``nt`` NCBI nucleotide collection (all GenBank + RefSeq)
``nr`` NCBI non-redundant protein sequences
``refseq_genomic`` RefSeq genomic sequences
``refseq_rna`` RefSeq RNA sequences
``refseq_protein`` RefSeq protein sequences
``swissprot`` UniProtKB/Swiss-Prot
``pdbaa`` PDB protein sequences
``pdbnt`` PDB nucleotide sequences
``env_nt`` Environmental nucleotide sequences (metagenomics)
===================== ================================================
Example::
from bioservices import NCBIBlastAPI
b = NCBIBlastAPI()
rid, rtoe = b.run(
program="blastn",
database="nt",
sequence="ATGAAAGCAATTTTCGTACTGAAAGGTTTT",
email="you@example.org",
hitlist_size=10,
)
b.wait(rid, rtoe)
xml_text = b.get_result(rid)
"""
_url = "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
_nucleotide_example = "ATGAAAGCAATTTTCGTACTGAAAGGTTTTGTTGGTTTTTTTTCGTTTTTGAATC"
_protein_example = "MDSTNVRSGMKSRKKKPKTTVIDDDDDCMTCSACQSKLVKISDITKVSLDYINTMRGNTLACAACGSSLKLLNDFAS"
_programs = ["blastn", "blastp", "blastx", "tblastn", "tblastx"]
def __init__(self, verbose=False, api_key=None):
self.services = REST(name="NCBIBlastAPI", url=self._url, verbose=verbose)
self.api_key = api_key
self.check_interval = _MIN_POLL_INTERVAL
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
[docs] def run(self, program, database, sequence, email, evalue="1e-10", hitlist_size=100, **kwargs):
"""Submit a BLAST job to NCBI and return the request identifier.
:param str program: BLAST program — one of ``blastn``, ``blastp``,
``blastx``, ``tblastn``, ``tblastx``.
:param str database: target database (e.g. ``"nt"``, ``"nr"``).
:param str sequence: query sequence in FASTA or bare sequence format.
:param str email: contact address forwarded to NCBI (required by their
usage policy).
:param str evalue: E-value threshold (default ``"1e-10"``).
:param int hitlist_size: maximum number of hits to return (default 100).
:param kwargs: additional NCBI BLAST parameters forwarded verbatim,
e.g. ``WORD_SIZE``, ``FILTER``, ``GAPCOSTS``, ``MATRIX_NAME``,
``MEGABLAST``.
:returns: ``(rid, rtoe)`` — the NCBI request ID and estimated wait
time in seconds.
:rtype: tuple[str, int]
Example::
rid, rtoe = b.run(
program="blastn",
database="nt",
sequence="ATGAAAGCAATTTTCGTACTGAAAGGTTTT",
email="you@example.org",
)
"""
if program not in self._programs:
raise ValueError(f"Invalid program '{program}'. Choose from: {self._programs}")
params = {
"CMD": "Put",
"PROGRAM": program,
"DATABASE": database,
"QUERY": sequence,
"HITLIST_SIZE": hitlist_size,
"EXPECT": evalue,
"EMAIL": email,
"TOOL": "bioservices",
"FORMAT_TYPE": "XML",
}
if self.api_key:
params["api_key"] = self.api_key
params.update(kwargs)
response = self.services.session.post(self._url, data=params)
response.raise_for_status()
return self._parse_submission(response.text)
[docs] def get_status(self, rid):
"""Return the current status of a submitted job.
:param str rid: request ID returned by :meth:`run`.
:returns: one of ``"WAITING"``, ``"READY"``, ``"FAILED"``,
``"UNKNOWN"``.
:rtype: str
"""
params = {
"CMD": "Get",
"FORMAT_OBJECT": "SearchInfo",
"RID": rid,
}
if self.api_key:
params["api_key"] = self.api_key
response = self.services.session.get(self._url, params=params)
response.raise_for_status()
return self._parse_status(response.text)
[docs] def get_result(self, rid, format_type="XML"):
"""Retrieve results for a finished job.
:param str rid: request ID returned by :meth:`run`.
:param str format_type: output format. ``"XML"`` (default) returns
standard BLAST XML; ``"Text"`` returns the pairwise text report;
``"Tabular"`` returns tab-separated hits; ``"JSON2"`` returns
JSON.
:returns: result content as a string.
:rtype: str
:raises RuntimeError: if the job is not yet ready.
"""
status = self.get_status(rid)
if status == "WAITING":
raise RuntimeError(f"Job {rid} is still running. Call wait() first.")
if status == "FAILED":
raise RuntimeError(f"Job {rid} failed on NCBI's servers.")
params = {
"CMD": "Get",
"RID": rid,
"FORMAT_TYPE": format_type,
}
if self.api_key:
params["api_key"] = self.api_key
response = self.services.session.get(self._url, params=params)
response.raise_for_status()
return response.text
[docs] def wait(self, rid, rtoe=None, timeout=600):
"""Block until the job identified by *rid* is finished.
:param str rid: request ID returned by :meth:`run`.
:param int rtoe: estimated wait time in seconds returned by
:meth:`run`. When provided, the first poll is delayed by
*rtoe* seconds so NCBI is not hit unnecessarily early.
:param int timeout: maximum number of seconds to wait before giving
up and returning ``"TIMEOUT"`` (default: 600 s / 10 min).
Set to ``None`` to wait indefinitely.
:returns: final status string (``"READY"``, ``"FAILED"``,
``"UNKNOWN"``, or ``"TIMEOUT"``).
:rtype: str
"""
if rtoe:
self.services.logging.info(f"Waiting {rtoe}s before first poll (RTOE from NCBI)…")
time.sleep(max(rtoe, self.check_interval))
elapsed = 0
while timeout is None or elapsed < timeout:
status = self.get_status(rid)
self.services.logging.info(f"Job {rid}: {status} ({elapsed}s elapsed)")
if status in ("READY", "FAILED", "UNKNOWN"):
return status
time.sleep(self.check_interval)
elapsed += self.check_interval
self.services.logging.warning(f"Job {rid} timed out after {timeout}s.")
return "TIMEOUT"
# ------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------
@staticmethod
def _parse_submission(html):
"""Extract RID and RTOE from the NCBI submission response."""
rid_match = re.search(r"RID\s*=\s*(\S+)", html)
rtoe_match = re.search(r"RTOE\s*=\s*(\d+)", html)
if not rid_match:
raise RuntimeError(
"Could not extract RID from NCBI BLAST response. "
"NCBI may be temporarily unavailable or the request was malformed."
)
rid = rid_match.group(1)
rtoe = int(rtoe_match.group(1)) if rtoe_match else _MIN_POLL_INTERVAL
return rid, rtoe
@staticmethod
def _parse_status(html):
"""Extract job status from the NCBI status-check response."""
match = re.search(r"Status\s*=\s*(\w+)", html)
if not match:
return "UNKNOWN"
return match.group(1).upper()