#!/usr/bin/env python3
#
#
# Copyright (c) 2018, Hiroyuki Ohsaki.
# All rights reserved.
#
# $Id: htmltree.py,v 1.8 2018/10/09 01:25:24 ohsaki Exp ohsaki $
#

from collections import defaultdict, namedtuple
import html
import io
import re
import sys

# perlcompat and tbdump are not referenced anywhere in this module.  They are
# still imported when available (tbdump presumably for an import-time side
# effect -- TODO confirm), but a missing package no longer prevents importing
# this module.
try:
    from perlcompat import die, warn, getopts
except ImportError:
    pass
try:
    import tbdump
except ImportError:
    pass

# A complete tag token: optional leading '/', tag name, raw attribute text,
# optional trailing '/' (self-closing).  The attribute part is non-greedy so
# that the trailing '/' is captured by its own group instead of being
# swallowed by the attribute text.
_TAG_RE = re.compile(r'<(/?)([^\s/>]+)(.*?)(/?)>', re.DOTALL)

# A single NAME=VALUE attribute; VALUE may be double-quoted, single-quoted or
# unquoted.  NAME may contain any character except whitespace, '=', '/', '>'
# and quotes, so names such as "data-id" are kept intact.
_ATTR_RE = re.compile(
    r'''([^\s=/>"']+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))''')

# HTML elements that never have content and therefore never get a closing tag.
_VOID_TAGS = frozenset({
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link',
    'meta', 'param', 'source', 'track', 'wbr',
})


def read_tokens(stream=None):
    """A generator function for reading a single token from STREAM.

    STREAM is iterated line by line (sys.stdin when STREAM is None).  Every
    '<...>' span is yielded as one tag token and every non-empty piece of
    text between tags as one text token.  Trailing whitespace of each line
    is dropped and lines are concatenated, so a tag or a text may span
    several lines.
    """
    if stream is None:
        stream = sys.stdin
    buf = ''
    for line in stream:
        # A '<' left over in BUF has no '>' after it yet, i.e. a tag may
        # continue on this line.  Keep the line break as a space so that
        # '<a\nhref=...>' does not collapse into '<ahref=...>'.
        if '<' in buf:
            buf += ' '
        buf += line.rstrip()
        while True:
            left = buf.find('<')
            right = buf.find('>', left)
            if not 0 <= left < right:
                break
            if left > 0:
                yield buf[:left]
            yield buf[left:right + 1]
            buf = buf[right + 1:]
    if buf:
        yield buf


class Node:
    """A node of an HTML tree.

    A node is either a text leaf (TEXT is not None) or an element with a
    TAG name, an ATTR mapping, and a list of CHILD nodes.  PARENT is the
    containing node, or None.
    """

    def __init__(self,
                 text=None,
                 tag=None,
                 attr=None,
                 child=None,
                 parent=None,
                 src=None):
        """Create a node; if SRC is given, parse it into this node.

        SRC may be a string containing HTML or a text stream (any
        io.TextIOBase such as an opened file or io.StringIO).  Any other
        value of SRC is ignored.
        """
        self.text = text
        self.tag = tag
        self.attr = attr if attr else defaultdict(dict)
        self.child = child if child else []
        self.parent = parent
        if isinstance(src, str):
            self.parse_string(src)
        elif isinstance(src, io.TextIOBase):
            self.parse(src)

    def is_leaf(self):
        """Return True if this node is a text node."""
        return self.text is not None

    def parse(self, stream=None):
        """Read all lines from STREAM and parse those lines as an HTML tree.

        This node becomes the root (its tag is set to 'root') and the parsed
        nodes are appended below it.  STREAM defaults to sys.stdin.  Return
        this node.
        """
        self.tag = 'root'
        current = self
        for token in read_tokens(stream):
            m = _TAG_RE.fullmatch(token)
            if m is None:
                # text
                node = Node(text=html.unescape(token), parent=current)
                current.child.append(node)
                continue
            head, tag, arg, tail = m.groups()
            tag = tag.lower()
            if head:
                # closing tag: leave the nearest enclosing element of that
                # name.  A closing tag matching no open element is ignored
                # rather than discarding the current position.
                ancestor = current
                while ancestor is not None and ancestor.tag != tag:
                    ancestor = ancestor.parent
                if ancestor is not None:
                    if ancestor.parent is not None:
                        current = ancestor.parent
                    else:
                        current = ancestor
                continue
            # opening tag
            attr = {
                key: dquoted or squoted or bare
                for key, dquoted, squoted, bare in _ATTR_RE.findall(arg)
            }
            child = Node(tag=tag, attr=attr, parent=current)
            current.child.append(child)
            # Self-closing tags, void elements, and declarations/comments
            # ('<!...>', '<?...>') have no content, so do not descend.
            childless = (tail or tag in _VOID_TAGS
                         or tag.startswith(('!', '?')))
            if not childless:
                current = child
        return self

    def parse_string(self, buf):
        """Parse a string BUF as an HTML tree.

        Return the root node (this node)."""
        return self.parse(io.StringIO(buf))

    def dump(self, level=0):
        """Pretty print the HTML tree rooted at this node.

        LEVEL is the nesting depth used for indentation."""
        indent = ' ' * level
        if self.is_leaf():
            print(f"{indent}'{self.text}'")
            return
        attr = ''.join(f' {key}="{val}"' for key, val in self.attr.items())
        print(f'{indent}<{self.tag}{attr}>')
        for child in self.child:
            child.dump(level + 1)
        print(f'{indent}</{self.tag}>')

    def _as_string(self):
        """Yield the text of every text node below this node, in order."""
        if self.is_leaf():
            yield self.text
        else:
            for child in self.child:
                yield from child._as_string()

    def as_string(self):
        """Return a plain text representation of the HTML tree rooted at
        this node."""
        return ''.join(self._as_string())

    def _matches(self, criteria, specs):
        """Return True if this node itself satisfies CRITERIA and SPECS."""
        if not isinstance(criteria, str):
            # function
            return bool(criteria(self))
        if self.tag != criteria:
            return False
        for spec in specs:
            key, sep, val = spec.partition('=')
            if key not in self.attr:
                return False
            # A spec without '=' only requires the attribute to be present.
            if sep and self.attr[key] != val:
                return False
        return True

    def find(self, criteria, *kargs):
        """Traverse the HTML tree starting from this node to look for a node
        matching a given criteria.

        CRITERIA is either a tag name or a function taking a node and
        returning a true value on match.  When CRITERIA is a tag name, every
        extra argument is an attribute constraint: 'KEY=VALUE' requires the
        attribute KEY to equal VALUE, and 'KEY' requires it to be present.
        Return the first matching node in document order, or None.
        """
        if self._matches(criteria, kargs):
            return self
        for child in self.child:
            found = child.find(criteria, *kargs)
            if found is not None:
                return found
        return None


def main():
    buf = """ This is a sample text. """
    root = Node()
    root.parse_string(buf)
    root.dump()
    print(root.as_string())


if __name__ == "__main__":
    main()