# Source code for bioservices.xmltools

#
#  This file is part of bioservices software
#
#  Copyright (c) 2013-2014 - EMBL-EBI
#
#  File author(s):
#
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  website: https://github.com/cokelaer/bioservices
#  documentation: http://packages.python.org/bioservices
#
##############################################################################
"""This module includes common tools to manipulate XML files"""
import xml.etree.ElementTree as ET
import bs4

try:
    from urllib.request import urlopen
    from urllib.error import HTTPError
except:
    from urllib2 import urlopen, HTTPError

# Public names exported by a star-import of this module.
__all__ = ["easyXML", "readXML"]


class easyXML(object):
    """Class to ease the introspection of XML documents.

    This class uses the standard :mod:`xml.etree.ElementTree` module as well
    as the BeautifulSoup package to help introspecting XML documents.

    ::

        >>> from bioservices import *
        >>> n = ncbiblast.NCBIblast()
        >>> res = n.getParameters() # res is an instance of easyXML
        >>> # You can retrieve XML from this instance of easyXML and print the content
        >>> # in a more human-readable way.
        >>> res.soup.find_all('id') # a BeautifulSoup instance is available
        >>> res.root # and the root using xml.etree.ElementTree

    There is a getitem so you can type::

        res['id']

    which is equivalent to::

        res.soup.find_all('id')

    There are also the aliases :meth:`findAll` and :meth:`prettify`.

    """

    def __init__(self, data, encoding="utf-8"):
        """.. rubric:: Constructor

        :param data: the XML document itself (a string, or the raw bytes
            read from a response), not a URL or a filename.
        :param encoding: accepted for backward compatibility; it is not used
            by this constructor.

        The data parameter must contain the XML document. If you
        have an URL instead, use :class:`readXML`

        """
        self.data = data[:]

        try:
            self.root = ET.fromstring(self.data)
        except Exception:
            # Best effort: when the document cannot be parsed by ElementTree,
            # keep the raw data as the root so that the BeautifulSoup based
            # helpers remain usable.
            self.root = self.data[:]

        # The BeautifulSoup tree is only built on first access to ``soup``.
        self._soup = None

    def getchildren(self):
        """Return the direct children of the root XML element as a list.

        The children are obtained by iterating over :attr:`root`. If the
        document could not be parsed, :attr:`root` holds the raw data and
        the items of that raw data are returned instead.
        """
        return list(self.root)

    def _get_soup(self):
        # Build the BeautifulSoup tree once and cache it.
        if self._soup is None:
            self._soup = bs4.BeautifulSoup(self.data, "lxml")
        return self._soup

    soup = property(_get_soup, doc="Returns the beautiful soup instance")

    def findAll(self, *args, **kwargs):
        """Alias to ``self.soup.find_all``; arguments are passed through."""
        return self.soup.find_all(*args, **kwargs)

    def prettify(self, *args, **kwargs):
        """Alias to ``self.soup.prettify``; arguments are passed through."""
        return self.soup.prettify(*args, **kwargs)

    def __str__(self):
        return self.soup.prettify()

    def __getitem__(self, i):
        return self.findAll(i)


class readXML(easyXML):
    """Read XML from a URL and convert it to a beautifulsoup data structure

    easyXML accepts as input a string containing the XML document. This
    class accepts a URL instead: the content found at that URL is downloaded
    and handed over to easyXML, from which this class inherits.

    .. seealso:: :class:`easyXML`

    """

    def __init__(self, url, encoding="utf-8"):
        """.. rubric:: Constructor

        :param url: location of the XML document, opened with ``urlopen``.
        :param encoding: forwarded unchanged to the :class:`easyXML`
            constructor.

        """
        response = urlopen(url)
        try:
            self.data = response.read()
        finally:
            # Always release the underlying connection, even if reading fails.
            response.close()
        super(readXML, self).__init__(self.data, encoding)


class XMLObjectify(object):
    """Parse an XML document with ``lxml.objectify``.

    The parsed tree is stored in :attr:`root` and the object given to the
    constructor is kept in :attr:`obj`.
    """

    def __init__(self, obj):
        """obj can be easyXML data set

        :param obj: either an object exposing the XML document through a
            ``data`` attribute (e.g. an :class:`easyXML` instance), or the
            XML document itself.
        """
        # lxml is imported here so that it is only needed when this class
        # is actually instantiated.
        from lxml import objectify

        try:
            self.root = objectify.fromstring(obj.data)
        except Exception:
            # Fall back to treating obj itself as the XML document.
            self.root = objectify.fromstring(obj)
        self.obj = obj

    def __str__(self):
        """Return the tag of each child of the root, one per line."""
        return "".join(child.tag + "\n" for child in self.root.getchildren())