#
# This file is part of bioservices software
#
# Copyright (c) 2013-2014 - EBI-EMBL
#
# File author(s):
# https://github.com/cokelaer/bioservices
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# http://www.gnu.org/licenses/gpl-3.0.html
#
# source: http://github.com/cokelaer/bioservices
# documentation: http://packages.python.org/bioservices
#
##############################################################################
"""Interface to PRIDE web service
.. topic:: What is PRIDE ?
:URL: http://www.ebi.ac.uk/pride/ws/archive/v2
.. highlights::
The PRIDE PRoteomics IDEntifications database is a centralized,
standards compliant, public data repository for proteomics data,
including protein and peptide identifications, post-translational
modifications and supporting spectral evidence.
-- From PRIDE web site, Jan 2015
"""
import tqdm
from bioservices import logger
from bioservices.services import REST
# Set the name of the logger object imported from bioservices to this
# module's name.
logger.name = __name__
# Names exported by a star-import of this module.
__all__ = ["PRIDE"]
class PRIDE:
    """Interface to the `PRIDE <https://www.ebi.ac.uk/pride/ws/archive/v2>`_ service

    ::

        from bioservices import PRIDE
        p = PRIDE()
        p.get_peptide_evidence(project_accession="PRD000001")

    .. versionchanged:: 1.10.1
        Due to new API:

        - the method project_count was dropped.
        - get_project_list was renamed in get_project_files
        - get_assays, get_assay_count, get_assay_count_project_accession,
          get_assay_list were dropped in v2
        - get_protein_list, get_protein_count, get_protein_count_assay,
          get_protein_list_assay replaced by get_protein_evidences method
        - get_peptide_list_assay, get_peptide_count, get_peptide_list,
          get_peptide_list_sequence, get_peptide_count_assay replaced
          by get_peptide_evidence.
    """

    _url = "https://www.ebi.ac.uk/pride/ws/archive/v2"

    def __init__(self, verbose=False, cache=False):
        """**Constructor**

        :param bool verbose: set to False to prevent informative messages
        :param bool cache: set to True to use caching. Not recommended for
            this service that evolves a lot
        """
        self.services = REST(name="PRIDE", url=PRIDE._url, verbose=verbose, cache=cache)

    def get_project(self, identifier):
        """Retrieve project information by accession

        This method does not perform any search; it only looks up a single
        project by its accession.

        :param str identifier: a valid PRIDE identifier e.g., PRD000001
        :return: the response of the ``projects/<identifier>`` request; if the
            request answers 400 or 404, an empty dictionary {} is returned

        .. doctest::

            >>> from bioservices import PRIDE
            >>> p = PRIDE()
            >>> res = p.get_project("PRD000001")
            >>> res['title']
            'COFRADIC proteome of unstimulated human blood platelets'
        """
        res = self.services.http_get(f"projects/{identifier}")
        # http_get hands back the bare status code for these failures.
        if res in (400, 404):
            logger.warning(f"Nothing found for {identifier}. may be this is not a valid identifier. Use get_projects")
            return {}
        return res

    def get_projects(self, pageSize=100, max_pages=1e9):
        """Retrieve all PRIDE projects, paginating automatically.

        Pages are requested one after the other (starting from page 0) until
        the service has no more results or *max_pages* pages were fetched.

        :param int pageSize: number of results per page (default 100)
        :param max_pages: maximum number of pages to fetch (default: all pages)
        :return: a list of project dictionaries. If a request returns
            something that is neither a list nor a dictionary (e.g. an error
            status code), a warning is logged and the projects collected so
            far are returned.
        """
        results = []
        # range() already enforces the max_pages limit; no extra check needed.
        for page in tqdm.tqdm(range(int(max_pages))):
            res = self.services.http_get("projects", params={"pageSize": pageSize, "page": page})
            if isinstance(res, list):
                # Plain-list answer: an empty or short page marks the end.
                if not res:
                    break
                results.extend(res)
                if len(res) < pageSize:
                    break
            elif isinstance(res, dict):
                # Paginated answer: projects live under _embedded/projects and
                # the overall count under page/totalElements.
                projects = res.get("_embedded", {}).get("projects", [])
                results.extend(projects)
                total = res.get("page", {}).get("totalElements", 0)
                if len(results) >= total or not projects:
                    break
            else:
                # Typically a status code returned on failure (get_project
                # checks for 400/404); stop instead of raising AttributeError.
                logger.warning(f"Unexpected response while fetching page {page} of projects: {res!r}")
                break
        return results

    def get_projects_count(self):
        """Return the number of projects reported by the ``projects`` endpoint.

        .. note:: If the service answers with a bare list, the length of that
            single response is returned (this may cover one page only, not
            the overall total). Otherwise ``page.totalElements`` from the
            response is returned.
        """
        res = self.services.http_get("projects")
        if isinstance(res, list):
            return len(res)
        return res["page"]["totalElements"]

    def get_project_files(
        self,
        accession,
        pageSize=100,
        page=0,
        sortConditions=None,
        sortDirection="DESC",
        filters="",
    ):
        """List the files of a project, optionally sorted and filtered

        :param str accession: the accession number to look for
        :param int pageSize: how many results to return per page
        :param int page: which page (starting from 0) of the result to return
        :param str sortConditions: field(s) to sort by; several fields can be
            separated by comma. Example: submission_date,project_title
        :param str sortDirection: the sorting order (ASC or DESC)
        :param str filters: Parameters to filter the search results. The structure of
            the filter is: field1==value1, field2==value2. Example accession==PRD000001
        :return: the ``"list"`` entry of the response when there is one,
            otherwise the response unchanged

        ::

            >>> p = PRIDE()
            >>> results = p.get_project_files(accession="PRD000001", pageSize=10, page=1)

        In v1.10.1 due to new PRIDE API, the method **get_file_count** was dropped. You can use::

            len(results['_embedded']['files'])

        Similarly the **get_file_list** method was dropped since all results are
        stored in the output of this method
        """
        params = {
            "pageSize": pageSize,
            "page": page,
            "sortDirection": sortDirection,
            "sortConditions": sortConditions,
            "filter": filters,
        }
        res = self.services.http_get(f"projects/{accession}/files", params=params)
        # Best effort: unwrap the "list" entry when the response has one.
        # KeyError -> dict without that key; TypeError -> list or status code.
        try:
            return res["list"]
        except (KeyError, TypeError):
            return res

    def get_protein_evidences(
        self,
        project_accession=None,
        assay_accession=None,
        reported_accession=None,
        pageSize=100,
        page=0,
        sortDirection="DESC",
        sortConditions="projectAccession",
    ):
        """Get all proteins evidence

        :param str project_accession: filter by PRIDE project accession (optional)
        :param str assay_accession: filter by assay accession (optional)
        :param str reported_accession: filter by reported protein accession (optional)
        :param int pageSize: how many results to return per page (default 100)
        :param int page: which page (starting from 0) of the result to return
        :param str sortConditions: field(s) to sort by, comma-separated
            (default ``"projectAccession"``)
        :param str sortDirection: the sorting order (``"ASC"`` or ``"DESC"``)
        :return: the response of the ``proteinevidences`` request

        ::

            p.get_protein_evidences()['_embedded']['proteinevidences']
        """
        # Only filters that were actually provided are sent to the service.
        params = {}
        if project_accession:
            params["projectAccession"] = project_accession
        if assay_accession:  # pragma: no cover
            params["assayAccession"] = assay_accession
        if reported_accession:  # pragma: no cover
            params["reportedAccession"] = reported_accession
        params["pageSize"] = pageSize
        params["page"] = page
        params["sortConditions"] = sortConditions
        params["sortDirection"] = sortDirection
        return self.services.http_get("proteinevidences", params=params)

    def get_peptide_evidence(
        self,
        project_accession=None,
        assay_accession=None,
        protein_accession=None,
        peptide_evidence_accession=None,
        peptide_sequence=None,
        pageSize=100,
        page=0,
        sortDirection="DESC",
        sortConditions="projectAccession",
    ):
        """Get the peptide evidences matching the given filters.

        :param str project_accession: filter by PRIDE project accession (optional)
        :param str assay_accession: filter by assay accession (optional)
        :param str protein_accession: filter by protein accession (optional)
        :param str peptide_evidence_accession: filter by peptide evidence accession (optional)
        :param str peptide_sequence: filter by peptide sequence (optional)
        :param int pageSize: how many results to return per page (default 100)
        :param int page: which page (starting from 0) of the result to return
        :param str sortConditions: field(s) to sort by, comma-separated
            (default ``"projectAccession"``)
        :param str sortDirection: the sorting order (``"ASC"`` or ``"DESC"``)
        :return: the response of the ``peptideevidences`` request

        Retrieving data from an accession should be fast::

            p.get_peptide_evidence(protein_accession="Q8IX30")

        but other methods may be slow::

            p.get_peptide_evidence(peptide_sequence="CQGSPGASKAMLSCNR")
        """
        # Only filters that were actually provided are sent to the service.
        params = {}
        if project_accession:
            params["projectAccession"] = project_accession
        if assay_accession:  # pragma: no cover
            params["assayAccession"] = assay_accession
        if protein_accession:  # pragma: no cover
            params["proteinAccession"] = protein_accession
        if peptide_evidence_accession:  # pragma: no cover
            params["peptideEvidenceAccession"] = peptide_evidence_accession
        if peptide_sequence:  # pragma: no cover
            params["peptideSequence"] = peptide_sequence
        params["pageSize"] = pageSize
        params["page"] = page
        params["sortConditions"] = sortConditions
        # Forward the sort order as get_protein_evidences does; it used to be
        # accepted by the signature but never sent.
        params["sortDirection"] = sortDirection
        return self.services.http_get("peptideevidences", params=params)

    def get_stats(self, name):
        """Retrieve statistics by name.

        :param str name: statistics name (e.g., ``"SUBMISSIONS_PER_YEAR"``)
        :return: statistics data for the given name

        ::

            p.get_stats("SUBMISSIONS_PER_YEAR")
        """
        return self.services.http_get(f"stats/{name}")