#!/usr/bin/env python3
#
#
# Copyright (c) 2018, Hiroyuki Ohsaki.
# All rights reserved.
#
# $Id: htmltree.py,v 1.8 2018/10/09 01:25:24 ohsaki Exp ohsaki $
#

from collections import defaultdict, namedtuple
import html
import io
import re
import sys

from perlcompat import die, warn, getopts
import tbdump

def read_tokens(stream=sys.stdin):
    """Yield tags and text runs, one at a time, from the lines of STREAM.

    Trailing whitespace (including the newline) is stripped from every line
    and the lines are concatenated without a separator, so a tag may span
    several lines.  Whenever the accumulated input contains a `<` followed
    later by a `>`, the non-empty text in front of the `<` is yielded first
    and then the tag itself (from `<` through the first following `>`).
    Whatever is left over once STREAM is exhausted is yielded as a final
    token if it is not empty.
    """
    pending = ''
    for raw in stream:
        pending += raw.rstrip()
        while True:
            start = pending.find('<')
            if start < 0:
                break
            end = pending.find('>', start)
            if end < 0:
                # The tag is not complete yet; wait for more input.
                break
            if start > 0:
                yield pending[:start]
            yield pending[start:end + 1]
            pending = pending[end + 1:]
    if pending:
        yield pending

class Node:
    """A node of a simple HTML tree: either an element or a text leaf.

    Element nodes carry a lowercased tag name in TAG, their attributes in
    ATTR and their children in CHILD.  Text nodes carry their content in
    TEXT and have no tag.  PARENT is the enclosing node (None for the root).
    """

    # Splits a tag token into: closing slash, tag name, raw attribute text
    # and self-closing slash.  The attribute part is non-greedy so that the
    # trailing `/` of `<br/>` ends up in the last group instead of being
    # swallowed by the attribute text.
    _TAG_RE = re.compile(r'<(/?)([^\s/>]+)(.*?)(/?)>', re.DOTALL)

    # Matches name=value pairs whose value is double-quoted, single-quoted
    # or unquoted.  Names may contain `-`, `:` and `.` (e.g. `data-id`).
    _ATTR_RE = re.compile(
        r'''([\w:.-]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))''')

    # HTML void elements (plus the obsolete `param`): they never have
    # content, so the parser must not descend into them.
    _VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link',
        'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self,
                 text=None,
                 tag=None,
                 attr=None,
                 child=None,
                 parent=None,
                 src=None):
        """Create a node; if SRC is given, parse it into this node.

        TEXT is the content of a text node, TAG/ATTR/CHILD describe an
        element node and PARENT is the enclosing node.  SRC may be a string
        of HTML or a text stream; any other value is ignored.
        """
        self.text = text
        self.tag = tag
        # An empty/missing ATTR becomes a defaultdict (kept for backward
        # compatibility: indexing a missing attribute does not raise).
        self.attr = attr if attr else defaultdict(dict)
        self.child = child if child else []
        self.parent = parent
        if isinstance(src, str):
            self.parse_string(src)
        elif isinstance(src, io.TextIOBase):
            # Accept any text stream (files, sys.stdin, io.StringIO, ...).
            self.parse(src)

    def is_leaf(self):
        """Return True if this node is a text node."""
        return self.text is not None

    def parse(self, stream=sys.stdin):
        """Read all lines from STREAM and parse those lines as an HTML tree.

        This node becomes the root (its tag is set to 'root') and the parsed
        nodes are appended to its children.  Return this node.
        """
        self.tag = 'root'
        self._build(read_tokens(stream))
        return self

    def _build(self, tokens):
        """Attach the nodes described by the iterable TOKENS below this node."""
        current = self
        for token in tokens:
            m = self._TAG_RE.match(token)
            if m is None:  # text
                leaf = Node(text=html.unescape(token), parent=current)
                current.child.append(leaf)
                continue
            closing, tag, arg, self_closing = m.groups()
            tag = tag.lower()
            if closing:
                current = self._close(current, tag)
                continue
            attr = {
                key: dquoted or squoted or bare
                for key, dquoted, squoted, bare in self._ATTR_RE.findall(arg)
            }
            child = Node(tag=tag, attr=attr, parent=current)
            current.child.append(child)
            # Do not descend into elements that cannot have content:
            # `<x/>`, void elements, and declarations/comments/processing
            # instructions such as `<!DOCTYPE ...>` or `<?xml ...?>`.
            if not (self_closing or tag in self._VOID_TAGS
                    or tag[0] in '!?'):
                current = child

    @staticmethod
    def _close(current, tag):
        """Return the node that is current after closing TAG at CURRENT."""
        node = current
        while node is not None and node.tag != tag:
            node = node.parent
        if node is None:
            # No open element with that name: ignore the stray closing tag
            # rather than jumping back to the root.
            return current
        return node.parent if node.parent is not None else node

    def parse_string(self, buf):
        """Parse a string BUF as an HTML tree.  Return the root node (this
        node)."""
        return self.parse(io.StringIO(buf))

    def dump(self, level=0):
        """Pretty print the HTML tree rooted at this node to standard output.

        LEVEL is the nesting depth; each level is indented by two spaces.
        """
        indent = '  ' * level
        if self.is_leaf():
            print(f"{indent}'{self.text}'")
        else:
            attrs = [f'{key}="{val}"' for key, val in self.attr.items()]
            attr = ' '.join(attrs)
            if attr != '':
                attr = ' ' + attr
            print(f'{indent}<{self.tag}{attr}>')
            for child in self.child:
                child.dump(level + 1)
            print(f'{indent}</{self.tag}>')

    def _as_string(self):
        """Yield the text of every text node below this node, in order."""
        if self.is_leaf():
            yield self.text
        else:
            for child in self.child:
                yield from child._as_string()

    def as_string(self):
        """Return a plain text representation of the HTML tree rooted at this
        node, i.e. the concatenation of all of its text nodes."""
        return ''.join(self._as_string())

    def _matches(self, criteria, conditions):
        """Return True if this node itself satisfies CRITERIA/CONDITIONS."""
        if not isinstance(criteria, str):  # function
            return bool(criteria(self))
        if self.tag != criteria:
            return False
        for cond in conditions:
            key, sep, val = cond.partition('=')
            if not sep:
                # A bare 'key' only requires the attribute to be present.
                if key not in self.attr:
                    return False
            elif self.attr.get(key) != val:
                return False
        return True

    def find(self, criteria, *kargs):
        """Traverse the tree depth-first, starting from this node, and return
        the first node matching CRITERIA, or None if there is none.

        CRITERIA is either a tag name or a function taking a node and
        returning a true value for a match.  When CRITERIA is a tag name,
        every extra argument must also hold: 'key=value' requires attribute
        KEY to equal VALUE, and a bare 'key' requires attribute KEY to be
        present.  Extra arguments are not used when CRITERIA is a function.
        """
        if self._matches(criteria, kargs):
            return self
        for child in self.child:
            found = child.find(criteria, *kargs)
            if found is not None:
                return found
        return None

def main():
    """Parse a small built-in HTML sample, then print its tree and its text."""
    sample = """<html>
<body class="foo">
This is <i id="bar">a</i> <b>sample</b> text.
</body>
</html>"""
    # Passing the string as SRC makes the constructor parse it right away.
    tree = Node(src=sample)
    tree.dump()
    plain = tree.as_string()
    print(plain)

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()