try:
from collections import OrderedDict
except:
# for python 2.6
from ordereddict import OrderedDict
from easydev.decorators import ifpylab, ifpandas
__all__ = ["FASTA", "MultiFASTA"]
[docs]class MultiFASTA(object):
"""Class to manipulate several several FASTA items
Here, we load some FASTA using UniProt web service::
>>> from bioservices import MultiFASTA
>>> mf = MultiFASTA()
>>> mf.load_fasta("P43408")
>>> mf.load_fasta("P21318")
You can then get back to your accession entries as follows ::
>>> mf.ids
['P43408', 'P21318']
And the sequences in the same order can be accessed::
>>> len(mf)
2
Each FASTA is stored in :attr:`fasta`, which is a dictionary where each
values is an instance of :class:`FASTA`::
>>> print(mf._fasta["P43408"].fasta)
>sp|P43408|KADA_METIG Adenylate kinase OS=Methanotorris igneus GN=adkA PE=1 SV=2
MKNKVVVVTGVPGVGGTTLTQKTIEKLKEEGIEYKMVNFGTVMFEVAKEEGLVEDRDQMR
KLDPDTQKRIQKLAGRKIAEMAKESNVIVDTHSTVKTPKGYLAGLPIWVLEELNPDIIVI
VETSSDEILMRRLGDATRNRDIELTSDIDEHQFMNRCAAMAYGVLTGATVKIIKNRDGLL
DKAVEELISVLK
The most convenient way to access to all data is to use the dataframe attribute::
>>> mf.df.Sequence
.. plot::
:width: 50%
:include-source:
>>> from bioservices.apps import MultiFASTA
>>> f = MultiFASTA()
>>> f.load_fasta(["P43403", "P43410"])
>>> f.df.Size.hist()
"""
def __init__(self):
# fetch the sequence using this attribute
self._fasta_fetcher = FASTA()
# an ordered dictionary to store the fasta contents
# not python2.6 compatible
self._fasta = OrderedDict()
def __len__(self):
return len(self._fasta)
def _get_fasta(self):
return self._fasta
fasta = property(_get_fasta, doc="Returns all FASTA instances ")
def _get_ids(self):
return [f for f in self._fasta.keys()]
ids = property(_get_ids, doc="returns list of keys/accession identifiers")
[docs] def load_fasta(self, ids):
"""Loads a single FASTA file into the dictionary"""
if isinstance(ids, str):
ids = [ids]
for id_ in ids:
self._fasta_fetcher.load(id_)
# create a new instance of FASTA and save fasta data
f = FASTA()
f._fasta = self._fasta_fetcher._fasta[:]
# append in the ordered dictionary
self._fasta[id_] = f
print("%s loaded" % id_)
[docs] def save_fasta(self, filename):
"""Save all FASTA into a file"""
fh = open(filename, "w")
for f in self._fasta.values():
fh.write(f.fasta)
fh.close()
[docs] def read_fasta(self, filename):
"""Load several FASTA from a filename"""
fh = open(filename, "r")
data = fh.read()
fh.close()
# we split according to ">2 character
for thisfasta in data.split(">")[1:]:
f = FASTA()
f._fasta = f._interpret(thisfasta)
if f.accession != None and f.accession not in self.ids:
self._fasta[f.accession] = f
else:
print("Accession %s is already in the ids list or could not be interpreted. skipped" % str(f.accession))
@ifpandas
def _get_df(self):
import pandas as pd
df = pd.concat([self.fasta[id_].df for id_ in self.fasta.keys()])
df.reset_index(inplace=True)
return df
df = property(_get_df)
@ifpylab
def hist_size(self, **kargs):
import pylab
self.df.Size.hist(**kargs)
pylab.title("Histogram length of the sequences")
pylab.xlabel("Length")
[docs]class FASTA(object):
"""Dedicated class to manipulates FASTA sequence(s)
Here is a FASTA file example::
>sp|P43408|KADA_METIG Adenylate kinase OS=Methanotorris igneus GN=adkA PE=1 SV=2
MKNKVVVVTGVPGVGGTTLTQKTIEKLKEEGIEYKMVNFGTVMFEVAKEEGLVEDRDQMR
KLDPDTQKRIQKLAGRKIAEMAKESNVIVDTHSTVKTPKGYLAGLPIWVLEELNPDIIVI
VETSSDEILMRRLGDATRNRDIELTSDIDEHQFMNRCAAMAYGVLTGATVKIIKNRDGLL
DKAVEELISVLK
The format is made of a header and a sequence. Any FASTA can be read
and the pair of header/sequence retrieved from the :attr:`sequence` and
:attr:`header` attributes. However, headers differ from one database to
another one and interpretation is not implemented except for SWISS-PROT.
Identifiers can be retrieved whatsoever.
You can read a FASTA sequence from a local file or download one from UniProt
.. doctest::
>>> from bioservices.apps.fasta import FASTA
>>> f = FASTA()
>>> f.load("P43403")
>>> acc = f.accession # the accession (P43403)
>>> fasta = f.fasta # raw FASTA string
>>> seq = f.sequence # the sequence itself
>>> header = f.header # the header itself
>>> identifier = f.identifier
You can also get a dataframe also using Pandas library.::
>>> f.df
The columns stored in the dataframe encompase:
* **Accession** that is taken from the header (e.g., P43403 from uniprot)
* **Sequence**, a copy of the FASTA sequence
* **Size**, the length of the sequence.
* **Database**, the database type found in the header (e.g., sp for
SWISS-PROT; see below for a list of database and their header format).
* Some column such as **Organism** are filled only for some database
* **Identififers** is the begining of the header.
.. seealso:: :class:`MultiFASTA` for multi FASTA manipulation.
.. addedversion: 1.2.0 maybe merged with :class:`MulitFASTA` class
List of identifiers corresponding to different databases.
================================= ====================================
================================= ====================================
GenBank gi|gi-number|gb|accession|locus
EMBL Data Library gi|gi-number|emb|accession|locus
DDBJ, DNA Database of Japan gi|gi-number|dbj|accession|locus
NBRF PIR pir||entry
Protein Research Foundation prf||name
SWISS-PROT sp|accession|name
Brookhaven Protein Data Bank (1) pdb|entry|chain
Brookhaven Protein Data Bank (2) entry:chain|PDBID|CHAIN|SEQUENCE
Patents pat|country|number
GenInfo Backbone Id bbs|number
General database identifier gnl|database|identifier
NCBI Reference Sequence ref|accession|locus
Local Sequence identifier lcl|identifier
================================= ====================================
The :meth::`load_fasta` relies on UniProt service.
"""
known_dbtypes = ["sp", "gi"]
def __init__(self):
self._fasta = None
def _get_fasta(self):
return self._fasta
fasta = property(_get_fasta, doc="returns FASTA content")
# for all types
def _get_sequence(self):
if self.fasta:
return "".join(self.fasta.split("\n")[1:])
else:
raise ValueError("You need to load a fasta sequence first using get_fasta or read_fasta")
sequence = property(_get_sequence, doc="returns the sequence only")
# for all types
def _get_header(self):
if self.fasta:
return self.fasta.split("\n")[0]
else:
raise ValueError("You need to load a fasta sequence first using get_fasta or read_fasta")
header = property(_get_header, doc="returns header only")
def _get_dbtype(self):
dbtype = self.header.split("|")[0].replace(">", "")
return dbtype
dbtype = property(_get_dbtype)
# for all types
def _get_identifier(self):
return self.header.split(" ")[0]
identifier = property(_get_identifier)
def _get_entry(self):
return self.header.split("|")[2].split(" ")[0]
entry = property(_get_entry, doc="returns entry only")
# swiss prot only
def _get_accession(self):
if self.dbtype == "sp":
# header = self.header
return self.identifier.split("|")[1]
elif self.dbtype == "gi":
return self.identifier.split("|")[1]
accession = property(_get_accession)
# swiss prot only
def _get_name_sp(self):
if self.dbtype == "sp":
header = self.header
return header.split(" ")[0].split("|")[2]
name = property(_get_name_sp)
@ifpandas
def _get_df(self):
import pandas as pd
df = pd.DataFrame(
{
"Identifiers": [self.identifier],
"Accession": [self.accession],
"Entry": [self.entry],
"Database": [self.dbtype],
"Organism": [self.organism],
"PE": [self.PE],
"SV": [self.SV],
"Sequence": [self.sequence],
"Header": [self.header],
"Size": [len(self.sequence)],
}
)
return df
df = property(_get_df)
def _get_info_from_header(self, prefix):
if prefix not in self.header:
return None
# finds the prefix
index = self.header.index(prefix + "=")
# remove it
name = self.header[index:][3:]
# figure out if there is anothe = sign to split the string
# otherwise, the prefix we looked for is the last one anyway
if "=" in name:
name = name.split("=")[0]
# here each = sign in FASTA is preceded by 2 characters that we must remove
name = name[0:-2]
name = name.strip()
else:
name = name.strip()
return name
def _get_gene_name(self):
return self._get_info_from_header("GN")
gene_name = property(
_get_gene_name,
doc="returns gene name from GN keyword found in the header if any",
)
def _get_organism(self):
return self._get_info_from_header("OS")
organism = property(_get_organism, doc="returns organism from OS keyword found in the header if any")
def _get_PE(self):
pe = self._get_info_from_header("PE")
if pe is not None:
return int(pe)
PE = property(_get_PE, doc="returns PE keyword found in the header if any")
def _get_SV(self):
sv = self._get_info_from_header("SV")
if sv is not None:
return int(sv)
SV = property(_get_SV, doc="returns SV keyword found in the header if any")
def __str__(self):
str_ = self.fasta
return str_
[docs] def get_fasta(self, id_):
"""Fetches FASTA from uniprot and loads into attrbiute :attr:`fasta`
:param str id_: a given uniprot identifier
:returns: the FASTA contents
"""
print("get_fasta is deprecated. Use load_fasta instead")
from bioservices import UniProt
u = UniProt(verbose=False)
res = u.retrieve(id_, frmt="fasta")
self._fasta = res[:]
return res
[docs] def load(self, id_):
self.load_fasta(id_)
[docs] def load_fasta(self, id_):
"""Fetches FASTA from uniprot and loads into attribute :attr:`fasta`
:param str id_: a given uniprot identifier
:returns: nothing
.. note:: same as :meth:`get_fasta` but returns nothing
"""
# save fasta into attributes fasta
from bioservices import UniProt
u = UniProt(verbose=False)
try:
res = u.retrieve(id_, frmt="fasta")
# some entries in uniprot are valid but obsolet and return empty string
if res == "":
raise Exception
self._fasta = res[:]
except:
pass
[docs] def save_fasta(self, filename):
"""Save FASTA file into a filename
:param str data: the FASTA contents
:param str filename: where to save it
"""
if self._fasta == None:
raise ValueError("No fasta was read or downloaded. Nothing to save.")
fh = open(filename, "w")
fh.write(self._fasta)
fh.close()
[docs] def read_fasta(self, filename):
"""Reads a FASTA file and loads it
Type::
>>> f = FASTA()
>>> f.read_fasta(filename)
>>> f.fasta
:return: nothing
.. warning:: If more than one FASTA is contained in the file, an error is raised
"""
fh = open(filename, "r")
data = fh.read()
fh.close()
# Is there more than one sequence ?
data = data.split(">")[1:]
if len(data) > 1 or len(data) == 0:
raise ValueError(
"""Only one sequence expected to be found. Found %s. Please use MultiFASTA class instead""" % len(data)
)
self._data = data
if data.count(">sp|") > 1:
raise ValueError(
"""It looks like your FASTA file contains more than
one FASTA. You must use MultiFASTA class instead"""
)
self._fasta = data[:]
self._fasta = self._fasta[0]
if self.dbtype not in self.known_dbtypes:
print("Only sp and gi header are recognised so far but sequence and header are loaded")
def _interpret(self, data):
# cleanup the data in case of empty spaces or \n characters
return data