#
# This file is part of bioservices software
#
# Copyright (c) 2013-2014 - EBI-EMBL
#
# File author(s):
# Thomas Cokelaer <cokelaer@ebi.ac.uk>
# https://github.com/cokelaer/bioservices
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# http://www.gnu.org/licenses/gpl-3.0.html
#
# website: https://github.com/cokelaer/bioservices
# documentation: http://bioservices.readthedocs.io
#
##############################################################################
"""Interface to the STRING protein interaction database web service.

.. topic:: What is STRING?

    :URL: https://string-db.org
    :REST: https://string-db.org/api

    .. highlights::

        STRING is a database of known and predicted protein-protein interactions.
        The interactions include direct (physical) and indirect (functional)
        associations; they stem from computational prediction, from knowledge
        transfer between organisms, and from interactions aggregated from other
        (primary) databases. STRING covers proteins from thousands of organisms.

        -- string-db.org home page

    The Bioconductor R package ``STRINGdb`` provides a similar interface to the
    STRING database. This module provides an equivalent Python interface.

    :Reference: Szklarczyk D, et al. The STRING database in 2023: protein–protein
        association networks and functional enrichment analyses for any sequenced
        genome of interest. Nucleic Acids Res. 2023;51(D1):D638-D646.
        doi:10.1093/nar/gkac1000

"""
from bioservices import logger
from bioservices.services import REST
# Rename the logger object imported from bioservices after this module.
logger.name = __name__
# Public API of this module: only the STRING class is exported.
__all__ = ["STRING"]
class STRING:
    """Interface to the `STRING <https://string-db.org>`_ database.

    STRING is a database of known and predicted protein-protein interactions.
    It covers both direct (physical) and indirect (functional) associations
    derived from genomic context, high-throughput experiments, co-expression,
    and the literature.

    Every query method accepts ``identifiers`` either as a single string or
    as an iterable (list, tuple, set, generator, ...) of identifiers. Inside
    a string, several identifiers may be separated by a carriage return
    (``"\\r"``) or by the literal text ``%0d``.

    ::

        >>> from bioservices import STRING
        >>> s = STRING()
        >>> interactions = s.get_interactions("ZAP70", species=9606)
        >>> partners = s.get_interaction_partners("ZAP70", species=9606)

    """

    _url = "https://string-db.org/api"

    def __init__(self, verbose=True, cache=False):
        """**Constructor**

        :param bool verbose: set to False to prevent informative messages
        :param bool cache: set to True to enable caching of requests
        """
        self.services = REST(
            name="STRING",
            url=self._url,
            verbose=verbose,
            cache=cache,
            url_defined_later=True,
        )

    def _identifiers_to_str(self, identifiers):
        """Convert identifiers to a carriage-return-separated string.

        The STRING API requires identifiers to be separated by a carriage
        return character ``\\r`` (``%0d`` once URL-encoded) in GET query
        strings, as documented in the official STRING API examples.

        :param identifiers: a single string, or any iterable of identifiers
            (list, tuple, set, generator, ...). Items are converted with
            :func:`str`, so non-string identifiers are accepted. Any other
            scalar is converted with :func:`str`.
        :return: the identifiers joined with ``"\\r"``.
        :rtype: str
        """
        if isinstance(identifiers, (str, bytes)):
            joined = str(identifiers)
        else:
            try:
                items = iter(identifiers)
            except TypeError:
                # Not iterable (e.g. an int): treat it as one identifier.
                joined = str(identifiers)
            else:
                joined = "\r".join(str(item) for item in items)
        # The HTTP layer percent-encodes "%" itself, so a literal "%0d" typed
        # by the caller would reach the server as "%250d". Translate it to
        # the real separator so that form of input works as documented.
        return joined.replace("%0d", "\r").replace("%0D", "\r")

    def _build_params(self, identifiers, **optional):
        """Build the query parameters shared by all STRING endpoints.

        :param identifiers: identifier(s), see :meth:`_identifiers_to_str`.
        :param optional: extra query parameters; those whose value is
            ``None`` are left out of the result.
        :return: dict with the ``identifiers`` key plus the given options.
        :rtype: dict
        """
        params = {"identifiers": self._identifiers_to_str(identifiers)}
        for key, value in optional.items():
            if value is not None:
                params[key] = value
        return params

    @staticmethod
    def _unwrap_single(res):
        """Return the only element of a one-item list, else *res* unchanged."""
        if isinstance(res, list) and len(res) == 1:
            return res[0]
        return res

    def _get(self, endpoint, params):
        """Issue a raw GET request to the STRING API.

        Unlike :meth:`services.http_get`, this method does **not** add an
        ``Accept`` header. Sending ``Accept: application/json`` to the STRING
        server causes the request to hang indefinitely; the server returns
        JSON when the ``/json/`` path prefix is used, regardless of headers.

        :param str endpoint: path below the API root, e.g. ``"json/network"``.
        :param dict params: query-string parameters.
        :return: the decoded JSON payload.
        :raises requests.HTTPError: if the server answers with an error status.
        """
        import requests as _requests

        url = f"{self._url}/{endpoint}"
        # NOTE(review): assumes the REST helper exposes its timeout as
        # ``TIMEOUT`` -- confirm against bioservices.services. When the
        # attribute is missing the request is sent without a timeout, which
        # is what this method did before.
        timeout = getattr(getattr(self, "services", None), "TIMEOUT", None)
        resp = _requests.get(url, params=params, timeout=timeout)
        resp.raise_for_status()
        return resp.json()

    def get_version(self):
        """Return the current STRING API version information.

        :return: dict with version details.

        ::

            >>> from bioservices import STRING
            >>> s = STRING()
            >>> ver = s.get_version()
            >>> "string_version" in ver
            True

        """
        return self._unwrap_single(self._get("json/version", params={}))

    def get_string_ids(self, identifiers, species=None, limit=1, echo_query=True, caller_identity=None):
        """Resolve identifiers to STRING identifiers.

        Maps gene/protein names or other identifiers to their STRING IDs.

        :param identifiers: identifier(s) to resolve. Multiple identifiers
            should be provided as a list, or as one string separated by
            ``"\\r"`` or ``%0d``.
        :param int species: NCBI taxonomy ID. For example, 9606 for *Homo sapiens*.
            If ``None``, STRING will search across all species.
        :param int limit: maximum number of results per input identifier.
            Default is 1 (best match).
        :param bool echo_query: if True, include the query identifier in the response.
        :param str caller_identity: optional application name for tracking.
        :return: list of dicts with STRING identifier mappings.

        ::

            >>> from bioservices import STRING
            >>> s = STRING()
            >>> res = s.get_string_ids("ZAP70", species=9606)
            >>> res[0]["stringId"]
            '9606.ENSP00000379990'

        """
        params = self._build_params(
            identifiers,
            echo_query=1 if echo_query else 0,
            limit=limit,
            species=species,
            caller_identity=caller_identity or None,
        )
        return self._get("json/get_string_ids", params=params)

    def get_interactions(
        self,
        identifiers,
        species=None,
        required_score=None,
        network_type="functional",
        add_nodes=0,
        show_query_node_labels=0,
        caller_identity=None,
    ):
        """Retrieve protein-protein interactions for the given identifiers.

        Returns the STRING interaction network for a set of proteins. Each
        interaction record includes scores for different evidence channels
        (neighbourhood, co-occurrence, co-expression, experimental, database,
        text-mining) as well as a combined interaction score.

        :param identifiers: gene/protein name(s). Provide a list, or one
            string with ``"\\r"`` or ``%0d`` as separator.
        :param int species: NCBI taxonomy ID (e.g. 9606 for human). Required
            when identifiers are gene symbols.
        :param int required_score: minimum combined interaction score (0–1000).
            Interactions below this threshold are excluded.
        :param str network_type: either ``"functional"`` (default) or
            ``"physical"``.
        :param int add_nodes: number of additional white-list nodes to add to
            the network. A value of 0 (default) is not sent to the server.
        :param int show_query_node_labels: set to 1 to display labels for input
            nodes even when they are not directly connected.
        :param str caller_identity: optional application name for tracking.
        :return: list of dicts, each representing one interaction with scores.

        ::

            >>> from bioservices import STRING
            >>> s = STRING()
            >>> res = s.get_interactions("ZAP70", species=9606)
            >>> len(res) > 0
            True

        """
        # Only send add_nodes / show_query_node_labels when explicitly requested;
        # for a single-protein query the STRING API automatically sets add_nodes=10
        # when the parameter is absent, returning the interaction neighbourhood.
        # Sending add_nodes=0 would override that and produce an empty result.
        params = self._build_params(
            identifiers,
            network_type=network_type,
            species=species,
            required_score=required_score,
            add_nodes=add_nodes or None,
            show_query_node_labels=show_query_node_labels or None,
            caller_identity=caller_identity or None,
        )
        return self._get("json/network", params=params)

    def get_network(
        self,
        identifiers,
        species=None,
        required_score=None,
        network_type="functional",
        add_nodes=0,
        show_query_node_labels=0,
        caller_identity=None,
    ):
        """Retrieve protein-protein interactions for the given identifiers.

        This is an alias for :meth:`get_interactions`.

        :param identifiers: gene/protein name(s). Provide a list, or one
            string with ``"\\r"`` or ``%0d`` as separator.
        :param int species: NCBI taxonomy ID (e.g. 9606 for human).
        :param int required_score: minimum combined interaction score (0–1000).
        :param str network_type: either ``"functional"`` (default) or
            ``"physical"``.
        :param int add_nodes: number of additional white-list nodes to add to
            the network.
        :param int show_query_node_labels: set to 1 to display labels for input
            nodes.
        :param str caller_identity: optional application name for tracking.
        :return: list of dicts, each representing one interaction with scores.

        ::

            >>> from bioservices import STRING
            >>> s = STRING()
            >>> res = s.get_network(["TP53", "BRCA1"], species=9606)

        """
        return self.get_interactions(
            identifiers,
            species=species,
            required_score=required_score,
            network_type=network_type,
            add_nodes=add_nodes,
            show_query_node_labels=show_query_node_labels,
            caller_identity=caller_identity,
        )

    def get_interaction_partners(
        self,
        identifiers,
        species=None,
        required_score=None,
        limit=None,
        network_type="functional",
        caller_identity=None,
    ):
        """Retrieve interaction partners for the given proteins.

        Returns proteins that interact with the query proteins. Compared to
        :meth:`get_interactions`, this method returns partners even if they are
        not in the original query set.

        :param identifiers: gene/protein name(s). Provide a list, or one
            string with ``"\\r"`` or ``%0d`` as separator.
        :param int species: NCBI taxonomy ID (e.g. 9606 for human).
        :param int required_score: minimum combined interaction score (0–1000).
        :param int limit: maximum number of interaction partners to return per
            input protein.
        :param str network_type: either ``"functional"`` (default) or
            ``"physical"``.
        :param str caller_identity: optional application name for tracking.
        :return: list of dicts, each representing one interaction.

        ::

            >>> from bioservices import STRING
            >>> s = STRING()
            >>> partners = s.get_interaction_partners("ZAP70", species=9606, limit=5)
            >>> len(partners) > 0
            True

        """
        params = self._build_params(
            identifiers,
            network_type=network_type,
            species=species,
            required_score=required_score,
            limit=limit,
            caller_identity=caller_identity or None,
        )
        return self._get("json/interaction_partners", params=params)

    def get_homology(self, identifiers, species=None, species_b=None, required_score=None, caller_identity=None):
        """Retrieve homology data for a set of proteins.

        Returns homologous protein pairs between the query species and
        ``species_b`` (or within the query species if ``species_b`` is not
        given).

        :param identifiers: gene/protein name(s). Provide a list, or one
            string with ``"\\r"`` or ``%0d`` as separator.
        :param int species: NCBI taxonomy ID of the query species.
        :param int species_b: NCBI taxonomy ID of the second species. If
            ``None``, homologs are retrieved within ``species``.
        :param int required_score: minimum combined interaction score (0–1000).
        :param str caller_identity: optional application name for tracking.
        :return: list of dicts describing homology relationships.

        ::

            >>> from bioservices import STRING
            >>> s = STRING()
            >>> res = s.get_homology("ZAP70", species=9606, species_b=10090)

        """
        params = self._build_params(
            identifiers,
            species=species,
            species_b=species_b,
            required_score=required_score,
            caller_identity=caller_identity or None,
        )
        return self._get("json/homology", params=params)

    def get_enrichment(self, identifiers, species=None, background_string_identifiers=None, caller_identity=None):
        """Perform functional enrichment analysis on a set of proteins.

        Tests whether the input proteins are significantly enriched for
        Gene Ontology (GO) terms, KEGG pathways, Pfam domains, InterPro
        signatures, and other annotation categories.

        :param identifiers: gene/protein name(s). Provide a list, or one
            string with ``"\\r"`` or ``%0d`` as separator.
        :param int species: NCBI taxonomy ID (e.g. 9606 for human). Required
            when identifiers are gene symbols.
        :param background_string_identifiers: optional set of proteins to use
            as the statistical background. Defaults to the entire proteome.
            Accepts the same forms as ``identifiers``.
        :param str caller_identity: optional application name for tracking.
        :return: list of dicts, each representing an enriched annotation term
            with fields such as ``category``, ``term``, ``description``,
            ``number_of_genes``, ``p_value``, and ``fdr``.

        ::

            >>> from bioservices import STRING
            >>> s = STRING()
            >>> res = s.get_enrichment(["ZAP70", "LCK", "CD3E", "CD3D"], species=9606)
            >>> len(res) > 0
            True

        """
        background = None
        if background_string_identifiers is not None:
            background = self._identifiers_to_str(background_string_identifiers)
        params = self._build_params(
            identifiers,
            species=species,
            background_string_identifiers=background,
            caller_identity=caller_identity or None,
        )
        return self._get("json/enrichment", params=params)

    def get_functional_annotation(self, identifiers, species=None, allow_pubmed=0, caller_identity=None):
        """Get functional annotations for a set of proteins.

        Returns GO terms, KEGG pathway membership, and other annotations
        for the queried proteins.

        :param identifiers: gene/protein name(s). Provide a list, or one
            string with ``"\\r"`` or ``%0d`` as separator.
        :param int species: NCBI taxonomy ID (e.g. 9606 for human).
        :param int allow_pubmed: include PubMed references (0 or 1, default: 0).
        :param str caller_identity: optional application name for tracking.
        :return: list of functional annotation records.
        :rtype: list

        ::

            >>> from bioservices import STRING
            >>> s = STRING()
            >>> res = s.get_functional_annotation("TP53", species=9606)

        """
        params = self._build_params(
            identifiers,
            allow_pubmed=allow_pubmed,
            species=species,
            caller_identity=caller_identity or None,
        )
        return self._get("json/functional_annotation", params=params)

    def get_ppi_enrichment(
        self, identifiers, species=None, required_score=None, background_string_identifiers=None, caller_identity=None
    ):
        """Test whether the input proteins are enriched in interactions.

        Returns a single record indicating the observed number of interactions,
        expected number, *p*-value, and the average interaction score for the
        input protein set.

        :param identifiers: gene/protein name(s). Provide a list, or one
            string with ``"\\r"`` or ``%0d`` as separator.
        :param int species: NCBI taxonomy ID (e.g. 9606 for human).
        :param int required_score: minimum combined interaction score (0–1000).
            If None, uses STRING default.
        :param background_string_identifiers: optional background gene set for
            enrichment calculation. Accepts the same forms as ``identifiers``.
        :param str caller_identity: optional application name for tracking.
        :return: dict with keys ``number_of_nodes``, ``number_of_edges``,
            ``average_node_degree``, ``local_clustering_coefficient``,
            ``expected_number_of_edges``, and ``p_value``. If the server does
            not answer with exactly one record, the raw response is returned.

        ::

            >>> from bioservices import STRING
            >>> s = STRING()
            >>> res = s.get_ppi_enrichment(["ZAP70", "LCK", "CD3E"], species=9606)
            >>> "p_value" in res
            True

        """
        background = None
        if background_string_identifiers is not None:
            background = self._identifiers_to_str(background_string_identifiers)
        params = self._build_params(
            identifiers,
            species=species,
            required_score=required_score,
            background_string_identifiers=background,
            caller_identity=caller_identity or None,
        )
        return self._unwrap_single(self._get("json/ppi_enrichment", params=params))