Source code for bioservices.apps.fasta

try:
    from collections import OrderedDict
except:
    # for python 2.6
    from ordereddict import OrderedDict
from easydev.decorators import ifpylab, ifpandas

__all__ = ["FASTA", "MultiFASTA"]


[docs]class MultiFASTA(object): """Class to manipulate several several FASTA items Here, we load some FASTA using UniProt web service:: >>> from bioservices import MultiFASTA >>> mf = MultiFASTA() >>> mf.load_fasta("P43408") >>> mf.load_fasta("P21318") You can then get back to your accession entries as follows :: >>> mf.ids ['P43408', 'P21318'] And the sequences in the same order can be accessed:: >>> len(mf) 2 Each FASTA is stored in :attr:`fasta`, which is a dictionary where each values is an instance of :class:`FASTA`:: >>> print(mf._fasta["P43408"].fasta) >sp|P43408|KADA_METIG Adenylate kinase OS=Methanotorris igneus GN=adkA PE=1 SV=2 MKNKVVVVTGVPGVGGTTLTQKTIEKLKEEGIEYKMVNFGTVMFEVAKEEGLVEDRDQMR KLDPDTQKRIQKLAGRKIAEMAKESNVIVDTHSTVKTPKGYLAGLPIWVLEELNPDIIVI VETSSDEILMRRLGDATRNRDIELTSDIDEHQFMNRCAAMAYGVLTGATVKIIKNRDGLL DKAVEELISVLK The most convenient way to access to all data is to use the dataframe attribute:: >>> mf.df.Sequence .. plot:: :width: 50% :include-source: >>> from bioservices.apps import MultiFASTA >>> f = MultiFASTA() >>> f.load_fasta(["P43403", "P43410"]) >>> f.df.Size.hist() """ def __init__(self): # fetch the sequence using this attribute self._fasta_fetcher = FASTA() # an ordered dictionary to store the fasta contents # not python2.6 compatible self._fasta = OrderedDict() def __len__(self): return len(self._fasta) def _get_fasta(self): return self._fasta fasta = property(_get_fasta, doc="Returns all FASTA instances ") def _get_ids(self): return [f for f in self._fasta.keys()] ids = property(_get_ids, doc="returns list of keys/accession identifiers")
[docs] def load_fasta(self, ids): """Loads a single FASTA file into the dictionary """ if isinstance(ids, str): ids = [ids] for id_ in ids: self._fasta_fetcher.load(id_) # create a new instance of FASTA and save fasta data f = FASTA() f._fasta = self._fasta_fetcher._fasta[:] # append in the ordered dictionary self._fasta[id_] = f print("%s loaded" % id_)
[docs] def save_fasta(self, filename): """Save all FASTA into a file""" fh = open(filename, "w") for f in self._fasta.values(): fh.write(f.fasta) fh.close()
[docs] def read_fasta(self, filename): """Load several FASTA from a filename""" fh = open(filename, "r") data = fh.read() fh.close() # we split according to ">2 character for thisfasta in data.split(">")[1:]: f = FASTA() f._fasta = f._interpret(thisfasta) if f.accession != None and f.accession not in self.ids: self._fasta[f.accession] = f else: print("Accession %s is already in the ids list or could not be interpreted. skipped" % str(f.accession))
@ifpandas def _get_df(self): import pandas as pd df = pd.concat([self.fasta[id_].df for id_ in self.fasta.keys()]) df.reset_index(inplace=True) return df df = property(_get_df) @ifpylab def hist_size(self, **kargs): import pylab self.df.Size.hist(**kargs) pylab.title("Histogram length of the sequences") pylab.xlabel("Length")
[docs]class FASTA(object): """Dedicated class to manipulates FASTA sequence(s) Here is a FASTA file example:: >sp|P43408|KADA_METIG Adenylate kinase OS=Methanotorris igneus GN=adkA PE=1 SV=2 MKNKVVVVTGVPGVGGTTLTQKTIEKLKEEGIEYKMVNFGTVMFEVAKEEGLVEDRDQMR KLDPDTQKRIQKLAGRKIAEMAKESNVIVDTHSTVKTPKGYLAGLPIWVLEELNPDIIVI VETSSDEILMRRLGDATRNRDIELTSDIDEHQFMNRCAAMAYGVLTGATVKIIKNRDGLL DKAVEELISVLK The format is made of a header and a sequence. Any FASTA can be read and the pair of header/sequence retrieved from the :attr:`sequence` and :attr:`header` attributes. However, headers differ from one database to another one and interpretation is not implemented except for SWISS-PROT. Identifiers can be retrieved whatsoever. You can read a FASTA sequence from a local file or download one from UniProt .. doctest:: >>> from bioservices.apps.fasta import FASTA >>> f = FASTA() >>> f.load("P43403") >>> acc = f.accession # the accession (P43403) >>> fasta = f.fasta # raw FASTA string >>> seq = f.sequence # the sequence itself >>> header = f.header # the header itself >>> identifier = f.identifier You can also get a dataframe also using Pandas library.:: >>> f.df The columns stored in the dataframe encompase: * **Accession** that is taken from the header (e.g., P43403 from uniprot) * **Sequence**, a copy of the FASTA sequence * **Size**, the length of the sequence. * **Database**, the database type found in the header (e.g., sp for SWISS-PROT; see below for a list of database and their header format). * Some column such as **Organism** are filled only for some database * **Identififers** is the begining of the header. .. seealso:: :class:`MultiFASTA` for multi FASTA manipulation. .. addedversion: 1.2.0 maybe merged with :class:`MulitFASTA` class List of identifiers corresponding to different databases. ================================= ==================================== ================================= ==================================== GenBank gi|gi-number|gb|accession|locus EMBL Data Library gi|gi-number|emb|accession|locus DDBJ, DNA Database of Japan gi|gi-number|dbj|accession|locus NBRF PIR pir||entry Protein Research Foundation prf||name SWISS-PROT sp|accession|name Brookhaven Protein Data Bank (1) pdb|entry|chain Brookhaven Protein Data Bank (2) entry:chain|PDBID|CHAIN|SEQUENCE Patents pat|country|number GenInfo Backbone Id bbs|number General database identifier gnl|database|identifier NCBI Reference Sequence ref|accession|locus Local Sequence identifier lcl|identifier ================================= ==================================== The :meth::`load_fasta` relies on UniProt service. """ known_dbtypes = ["sp", "gi"] def __init__(self): self._fasta = None def _get_fasta(self): return self._fasta fasta = property(_get_fasta, doc="returns FASTA content") # for all types def _get_sequence(self): if self.fasta: return "".join(self.fasta.split("\n")[1:]) else: raise ValueError("You need to load a fasta sequence first using get_fasta or read_fasta") sequence = property(_get_sequence, doc="returns the sequence only") # for all types def _get_header(self): if self.fasta: return self.fasta.split("\n")[0] else: raise ValueError("You need to load a fasta sequence first using get_fasta or read_fasta") header = property(_get_header, doc="returns header only") def _get_dbtype(self): dbtype = self.header.split("|")[0].replace(">", "") return dbtype dbtype = property(_get_dbtype) # for all types def _get_identifier(self): return self.header.split(" ")[0] identifier = property(_get_identifier) def _get_entry(self): return self.header.split("|")[2].split(" ")[0] entry = property(_get_entry, doc="returns entry only") # swiss prot only def _get_accession(self): if self.dbtype == "sp": #header = self.header return self.identifier.split("|")[1] elif self.dbtype == "gi": return self.identifier.split("|")[1] accession = property(_get_accession) # swiss prot only def _get_name_sp(self): if self.dbtype == "sp": header = self.header return header.split(" ")[0].split("|")[2] name = property(_get_name_sp) @ifpandas def _get_df(self): import pandas as pd df = pd.DataFrame({ "Identifiers": [self.identifier], "Accession": [self.accession], "Entry": [self.entry], "Database": [self.dbtype], "Organism": [self.organism], "PE": [self.PE], "SV": [self.SV], "Sequence": [self.sequence], "Header": [self.header], "Size": [len(self.sequence)]}) return df df = property(_get_df) def _get_info_from_header(self, prefix): if prefix not in self.header: return None # finds the prefix index = self.header.index(prefix+"=") # remove it name = self.header[index:][3:] # figure out if there is anothe = sign to split the string # otherwise, the prefix we looked for is the last one anyway if "=" in name: name = name.split("=")[0] # here each = sign in FASTA is preceded by 2 characters that we must remove name = name[0:-2] name = name.strip() else: name = name.strip() return name def _get_gene_name(self): return self._get_info_from_header("GN") gene_name = property(_get_gene_name, doc="returns gene name from GN keyword found in the header if any") def _get_organism(self): return self._get_info_from_header("OS") organism = property(_get_organism, doc="returns organism from OS keyword found in the header if any") def _get_PE(self): pe = self._get_info_from_header("PE") if pe is not None: return int(pe) PE = property(_get_PE, doc="returns PE keyword found in the header if any") def _get_SV(self): sv = self._get_info_from_header("SV") if sv is not None: return int(sv) SV = property(_get_SV, doc="returns SV keyword found in the header if any") def __str__(self): str_ = self.fasta return str_
[docs] def get_fasta(self, id_): """Fetches FASTA from uniprot and loads into attrbiute :attr:`fasta` :param str id_: a given uniprot identifier :returns: the FASTA contents """ print("get_fasta is deprecated. Use load_fasta instead") from bioservices import UniProt u = UniProt(verbose=False) res = u.retrieve(id_, frmt="fasta") self._fasta = res[:] return res
[docs] def load(self, id_): self.load_fasta(id_)
[docs] def load_fasta(self, id_): """Fetches FASTA from uniprot and loads into attribute :attr:`fasta` :param str id_: a given uniprot identifier :returns: nothing .. note:: same as :meth:`get_fasta` but returns nothing """ # save fasta into attributes fasta from bioservices import UniProt u = UniProt(verbose=False) try: res = u.retrieve(id_, frmt="fasta") # some entries in uniprot are valid but obsolet and return empty string if res == "": raise Exception self._fasta = res[:] except: pass
[docs] def save_fasta(self, filename): """Save FASTA file into a filename :param str data: the FASTA contents :param str filename: where to save it """ if self._fasta == None: raise ValueError("No fasta was read or downloaded. Nothing to save.") fh = open(filename, "w") fh.write(self._fasta) fh.close()
[docs] def read_fasta(self, filename): """Reads a FASTA file and loads it Type:: >>> f = FASTA() >>> f.read_fasta(filename) >>> f.fasta :return: nothing .. warning:: If more than one FASTA is contained in the file, an error is raised """ fh = open(filename, "r") data = fh.read() fh.close() # Is there more than one sequence ? data = data.split(">")[1:] if len(data)>1 or len(data)==0: raise ValueError("""Only one sequence expected to be found. Found %s. Please use MultiFASTA class instead""" % len(data)) self._data = data if data.count(">sp|")>1: raise ValueError("""It looks like your FASTA file contains more than one FASTA. You must use MultiFASTA class instead""") self._fasta = data[:] self._fasta = self._fasta[0] if self.dbtype not in self.known_dbtypes: print("Only sp and gi header are recognised so far but sequence and header are loaded")
def _interpret(self, data): # cleanup the data in case of empty spaces or \n characters return data