Source code for ndb_adapter.html_parser
from html.parser import HTMLParser
from typing import List, Dict
class NDBHtmlParser(HTMLParser):
    """HTML parser that turns a document into a tree of :class:`Tag` nodes.

    Start tags are pushed on a stack of open elements; every end tag pops
    the most recently opened element and attaches it to its parent. The
    finished tree can be searched with :meth:`find_one` / :meth:`find_all`.
    """

    # Characters deleted from text nodes before the text is stored.
    # TODO: extend with the remaining unwanted unicode characters.
    _DROPPED_CHARS = {ord('\xc5'): '', ord('\xa0'): '', ord('\n'): '',
                      ord('\t'): '', ord('\r'): '', ord('\f'): ''}

    def __init__(self):
        """Default constructor"""
        super().__init__()
        self.__tree = None
        self.__elementsStack = []

    def error(self, message: str):
        """Ignore parser error messages

        :param message: message string
        :type message: str
        :return: None
        """

    def analyze(self, data: str) -> None:
        """Parse an html string and build the tag tree

        Any tree from a previous call is discarded first.

        :param data: html string
        :type data: str
        :return: None
        """
        self.__tree = None
        self.__elementsStack = []
        self.feed(data)
        self.close()
        # Elements that were never closed are still on the stack: the
        # innermost one (top of the stack) becomes the tree top, replacing
        # whatever handle_endtag stored there.
        if self.__elementsStack:
            self.__tree = self.__elementsStack.pop()

    def get_tree(self) -> 'Tag':
        """Get the tree top element

        :rtype: 'Tag'
        :return: Tree top, or None when nothing has been parsed
        """
        return self.__tree

    def find_one(self, name: str =None, after: 'Tag'=None, before: 'Tag'=None, params: dict=None) -> 'Tag':
        """Get the first tree node matching the criteria

        The walk starts at ``after`` itself (or at the tree top when
        ``after`` is None) and stops when ``before`` is reached; ``before``
        is never returned.

        :param name: node name (default value = None)
        :type name: str
        :param after: node to start the search from (default value = None)
        :type after: Tag
        :param before: node to stop the search at (default value = None)
        :type before: Tag
        :param params: node parameters (default value = None)
        :type params: dict
        :return: searched node or None when nothing matches
        :rtype: 'Tag'
        :raises TypeError: when after/before is not a Tag or params is not a dict
        """
        self._check_search_args(after, before, params)
        return next(self._iter_matches(name, after, before, params), None)

    def find_all(self, name: str=None, after: 'Tag'=None, before: 'Tag'=None, params: dict=None) -> List['Tag']:
        """Get all tree nodes matching the criteria

        The walk starts at ``after`` itself (or at the tree top when
        ``after`` is None) and stops when ``before`` is reached; ``before``
        is never returned.

        :param name: node name (default value = None)
        :type name: str
        :param after: node to start the search from (default value = None)
        :type after: Tag
        :param before: node to stop the search at (default value = None)
        :type before: Tag
        :param params: node parameters (default value = None)
        :type params: dict
        :return: searched nodes list (empty when nothing matches)
        :rtype: List[Tag]
        :raises TypeError: when after/before is not a Tag or params is not a dict
        """
        self._check_search_args(after, before, params)
        return list(self._iter_matches(name, after, before, params))

    @staticmethod
    def _check_search_args(after, before, params) -> None:
        """Validate the arguments shared by find_one and find_all

        Falsy values (e.g. None) are accepted without a type check.

        :raises TypeError: when a given argument has the wrong type
        """
        if after and not isinstance(after, Tag):
            raise TypeError("After is not instance of Tag class")
        if before and not isinstance(before, Tag):
            raise TypeError("Before is not instance of Tag class")
        if params and not isinstance(params, dict):
            raise TypeError("Params is not instance of dictionary class")

    def _iter_matches(self, name, after, before, params):
        """Yield, in document order, every node matching name and params"""
        start = self.__tree if after is None else after
        # Sentinel whose only successor is the start node, so that the
        # start node itself is the first candidate.
        cursor = Tag(name="root", next_sib=start)
        while True:
            try:
                cursor = next(cursor)
            except StopIteration:
                return
            if before is not None and before == cursor:
                return
            if name and cursor.name != name:
                continue
            if cursor.has_attr(params):
                yield cursor

    def handle_starttag(self, tag, attrs) -> None:
        """Handle a start tag: push a new node on the stack of open nodes

        :param tag: tag string
        :param attrs: attributes as (name, value) pairs
        :return: None
        """
        self.__elementsStack.append(Tag(tag, attrs=dict(attrs)))

    def handle_endtag(self, tag) -> None:
        """Handle an end tag: close the innermost open node and add it to the tree

        NOTE: ``tag`` is not compared with the node being closed; the most
        recently opened node is closed whatever name is given.

        :param tag: tag ending
        :return: None
        """
        if not self.__elementsStack:
            # End tag without any open element: nothing to close.
            return
        closed = self.__elementsStack.pop()
        if self.__elementsStack:
            self.__elementsStack[-1].add_child(closed)
        elif not self.__tree:
            # First top-level element becomes the tree top.
            self.__tree = closed
        else:
            # Later top-level elements are attached under the first one.
            self.__tree.add_child(closed)

    def handle_data(self, data) -> None:
        """Handle text: append the cleaned text to the innermost open node

        Text found while no element is open is ignored.

        :param data: data inside tag
        :return: None
        """
        cleaned = data.translate(self._DROPPED_CHARS).strip()
        if self.__elementsStack:
            self.__elementsStack[-1].data += cleaned
class Tag(object):
    """Node of the html tree

    A tag stores its name, text and attributes together with links to its
    parent, siblings and children. ``next(tag)`` steps to the following tag
    in document order and iterating over a tag yields all following tags.
    """

    def __init__(self, name: str, data: str='', attrs: dict=None, parent: 'Tag'=None, next_sib: 'Tag'=None,
                 prev_sib: 'Tag'=None, children: List['Tag']=None):
        """Default constructor

        :param name: tag name
        :type name: str
        :param data: tag data (default value = '')
        :type data: str
        :param attrs: tag attributes (default value = None, stored as empty dict)
        :type attrs: dict
        :param parent: parent tag
        :type parent: Tag
        :param next_sib: next sibling tag
        :type next_sib: Tag
        :param prev_sib: previous sibling tag
        :type prev_sib: Tag
        :param children: list of children; each one gets this tag as parent
            and is linked to its neighbours in the list
        :type children: List[Tag]
        """
        self.name = name  # type: str
        self.data = data  # type: str
        # A dict (not a list) so that the default agrees with has_attr/__str__.
        self.attrs = {} if attrs is None else attrs  # type: dict
        self.parent = parent  # type: Tag
        self.next_sib = next_sib  # type: Tag
        self.prev_sib = prev_sib  # type: Tag
        self.children = [] if children is None else children  # type: List[Tag]
        previous = None
        for child in self.children:
            child.parent = self
            # Same sibling wiring as add_child, otherwise next()/prev()
            # could not move between children given to the constructor.
            if previous is not None:
                previous.next_sib, child.prev_sib = child, previous
            previous = child

    def has_attr(self, params: dict) -> bool:
        """Check if the tag has at least one of the given attributes

        :param params: attribute name -> expected value; empty or None matches any tag
        :type params: dict
        :return: True when any name/value pair of params equals an attribute of the tag
        """
        if not params:
            return True
        return any(key in self.attrs and params[key] == self.attrs[key] for key in params)

    def add_child(self, child: 'Tag') -> None:
        """Append a child to the tag and link it with the previous last child

        :param child: tag to add
        :type child: Tag
        :return: None
        """
        child.parent = self
        if self.children:
            last = self.children[-1]
            last.next_sib, child.prev_sib = child, last
        self.children.append(child)

    def prev(self) -> 'Tag':
        """Previous tag: the previous sibling, otherwise the parent

        :return: previous tag
        :rtype: Tag
        :raises StopIteration: when the tag has neither previous sibling nor parent
        """
        if self.prev_sib:
            return self.prev_sib
        if self.parent:
            return self.parent
        raise StopIteration

    def __next__(self) -> 'Tag':
        """Next tag in document order

        The first child is preferred, then the next sibling, then the next
        sibling of the closest ancestor that has one.

        :return: next tag
        :rtype: Tag
        :raises StopIteration: when no tag follows this one
        """
        if self.children:
            return self.children[0]
        if self.next_sib:
            return self.next_sib
        ancestor = self.parent
        while ancestor:
            if ancestor.next_sib:
                return ancestor.next_sib
            ancestor = ancestor.parent
        raise StopIteration

    next = __next__

    def next_data(self) -> str:
        """Next tag data

        :return: next tag data, '' when there is no next tag
        :rtype: str
        """
        try:
            following = self.next()
        except StopIteration:
            # No following tag: return the documented empty string
            # instead of leaking StopIteration to the caller.
            return ''
        return following.data if following else ''

    def prev_data(self) -> str:
        """Previous tag data

        :return: previous tag data, '' when there is no previous tag
        :rtype: str
        """
        try:
            preceding = self.prev()
        except StopIteration:
            return ''
        return preceding.data if preceding else ''

    def __repr__(self) -> str:
        """Tag to string"""
        return self.__str__()

    def __iter__(self):
        """Iterate over all tags following this one in document order

        ``__next__`` keeps no position, so returning ``self`` here would
        yield the same tag forever; the walk is done step by step instead.
        """
        node = self
        while True:
            try:
                node = node.__next__()
            except StopIteration:
                return
            yield node

    def __str__(self) -> str:
        """Tag to string: opening tag with attributes, data, children, closing tag

        :return: tag as string
        :rtype: str
        """
        pieces = ["<", str(self.name)]
        for key in self.attrs:
            pieces.append(" " + str(key) + "=\"" + str(self.attrs[key]) + "\"")
        pieces.append(">" + str(self.data))
        pieces.extend(str(child) for child in self.children)
        pieces.append("</" + str(self.name) + ">")
        return "".join(pieces)