Source code for bioservices.pride

#
#  This file is part of bioservices software
#
#  Copyright (c) 2013-2014 - EBI-EMBL
#
#  File author(s):
#      https://github.com/cokelaer/bioservices
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  source: http://github.com/cokelaer/bioservices
#  documentation: http://packages.python.org/bioservices
#
##############################################################################

"""Interface to PRIDE web service

.. topic:: What is PRIDE ?

    :URL: http://www.ebi.ac.uk/pride/ws/archive/v2

    .. highlights::

         The PRIDE PRoteomics IDEntifications database is a centralized,
         standards compliant, public data repository for proteomics data,
         including protein and peptide identifications, post-translational
         modifications and supporting spectral evidence.

        -- From PRIDE web site, Jan 2015


"""
import tqdm

from bioservices import logger
from bioservices.services import REST

logger.name = __name__


__all__ = ["PRIDE"]


[docs]class PRIDE: """Interface to the `PRIDE <https://www.ebi.ac.uk/pride/ws/archive/v2>`_ service :: from bioservices import PRIDE p = PRIDE() p.get_peptide_evidence(projectAccession) .. versionchanged:: 1.10.1 Due to new API: - the method project_count was dropped. - get_project_list was renamed in get_project_files - get_assays, get_assay_count, get_assay_count_project_accession, get_assay_list were dropped in v2 - get_protein_list, get_protein_count, get_protein_count_assay, get_protein_list, get_protein_list_assay replaced by get_protein_evidences method - get_peptide_list_assay, get_peptide_count, get_peptide_list, get_peptide_list_sequence, get_peptide_count_assay replaced by get_peptide_evidence. """ _url = "https://www.ebi.ac.uk/pride/ws/archive/v2" def __init__(self, verbose=False, cache=False): """**Constructor** :param bool verbose: set to False to prevent informative messages :param bool cache: set to True to use caching. Not recommended for this service that evolves a lot """ self.services = REST(name="PRIDE", url=PRIDE._url, verbose=verbose, cache=cache)
[docs] def get_project(self, identifier): """Retrieve project information by accession List of PRIDE Archive Projects. The following method does not allow performing search; for search functionality you will need to use the search/projects. The result list is Paginated using the pageSize and page. :param str identifier: a valid PRIDE identifier e.g., PRD000001 :return: if identifier is invalid, returns an empty dictionary {} .. doctest:: >>> from bioservices import PRIDE >>> p = PRIDE() >>> res = p.get_project("PRD000001") >>> res['title'] 'COFRADIC proteome of unstimulated human blood platelets' """ res = self.services.http_get(f"projects/{identifier}") if res in (400, 404): logger.warning(f"Nothing found for {identifier}. may be this is not a valid identifier. Use get_projects") return {} return res
[docs] def get_projects(self, pageSize=100, max_pages=1e9): """Retrieve all PRIDE projects, paginating automatically. :param int pageSize: number of results per page (default 100) :param max_pages: maximum number of pages to fetch (default: all pages) :return: a list of project dictionaries """ results = [] for page in tqdm.tqdm(range(int(max_pages))): res = self.services.http_get("projects", params={"pageSize": pageSize, "page": page}) if isinstance(res, list): if not res: break results.extend(res) if len(res) < pageSize: break else: projects = res.get("_embedded", {}).get("projects", []) results.extend(projects) total = res.get("page", {}).get("totalElements", 0) if len(results) >= total or not projects: break if page + 1 >= max_pages: break return results
[docs] def get_projects_count(self): """Return total number of projects. .. note:: When the API returns a paginated list (new format), this method returns the count for the first page only, not the total across all pages. """ res = self.services.http_get("projects") if isinstance(res, list): return len(res) return res["page"]["totalElements"]
[docs] def get_project_files(self, accession, pageSize=100, page=0, sortConditions=None, sortDirection="DESC", filters=""): """list projects or given criteria :param str accession: the accession number to look for :param int pageSize: how many results to return per page :param int page: which page (starting from 0) of the result to return :param str sortConditions: default is submission_date but more fields can be separated by comma and passed. Example: submission_date,project_title :param str sortDirection: the sorting order (ASC or DESC) :param str filters: Parameters to filter the search results. The structure of the filter is: field1==value1, field2==value2. Example accession==PRD000001 :: >>> p = PRIDE() >>> results = p.get_project_files(accession="PRD000001", pageSize=10, page=1) In v1.10.1 due to new PRIDE API, the method **get_file_count** was dropped. You can use:: len(results['_embedded']['files']) Similarly the **get_file_list** method was dropped since all results are stored in the output of this method """ params = { "pageSize": pageSize, "page": page, "sortDirection": sortDirection, "sortConditions": sortConditions, "filter": filters, } res = self.services.http_get(f"projects/{accession}/files", params=params) try: res = res["list"] except Exception: pass return res
[docs] def get_protein_evidences( self, project_accession=None, assay_accession=None, reported_accession=None, pageSize=100, page=0, sortDirection="DESC", sortConditions="projectAccession", ): """Get all proteins evidence :param str project_accession: filter by PRIDE project accession (optional) :param str assay_accession: filter by assay accession (optional) :param str reported_accession: filter by reported protein accession (optional) :param int pageSize: how many results to return per page (default 100) :param int page: which page (starting from 0) of the result to return :param str sortConditions: field(s) to sort by, comma-separated (default ``"projectAccession"``) :param str sortDirection: the sorting order (``"ASC"`` or ``"DESC"``) :: p.get_protein_evidences()['_embedded']['proteinevidences'] """ params = {} if project_accession: params["projectAccession"] = project_accession if assay_accession: # pragma: no cover params["assayAccession"] = assay_accession if reported_accession: # pragma: no cover params["reportedAccession"] = reported_accession params["pageSize"] = pageSize params["page"] = page params["sortConditions"] = sortConditions params["sortDirection"] = sortDirection res = self.services.http_get("proteinevidences", params=params) return res
[docs] def get_peptide_evidence( self, project_accession=None, assay_accession=None, protein_accession=None, peptide_evidence_accession=None, peptide_sequence=None, pageSize=100, page=0, sortDirection="DESC", sortConditions="projectAccession", ): """Get all the peptide evidences for a specific protein evidence. :param str project_accession: filter by PRIDE project accession (optional) :param str assay_accession: filter by assay accession (optional) :param str protein_accession: filter by protein accession (optional) :param str peptide_evidence_accession: filter by peptide evidence accession (optional) :param str peptide_sequence: filter by peptide sequence (optional) :param int pageSize: how many results to return per page (default 100) :param int page: which page (starting from 0) of the result to return :param str sortConditions: field(s) to sort by, comma-separated (default ``"projectAccession"``) :param str sortDirection: the sorting order (``"ASC"`` or ``"DESC"``) Retrieving data from project accession should be fast:: p.get_peptide_evidence(protein_accession="Q8IX30") but other methods may be slow:: p.get_peptide_evidence(peptide_sequence="CQGSPGASKAMLSCNR") """ params = {} if project_accession: params["projectAccession"] = project_accession if assay_accession: # pragma: no cover params["assayAccession"] = assay_accession if protein_accession: # pragma: no cover params["proteinAccession"] = protein_accession if peptide_evidence_accession: # pragma: no cover params["peptideEvidenceAccession"] = peptide_evidence_accession if peptide_sequence: # pragma: no cover params["peptideSequence"] = peptide_sequence params["pageSize"] = pageSize params["page"] = page params["sortConditions"] = sortConditions res = self.services.http_get("peptideevidences", params=params) return res
[docs] def get_stats(self, name): """Retrieve statistics by name. :param str name: statistics name (e.g., ``"SUBMISSIONS_PER_YEAR"``) :return: statistics data for the given name :: p.get_stats("SUBMISSIONS_PER_YEAR") """ res = self.services.http_get(f"stats/{name}") return res