bmc_hub/parse_test.py

67 lines
2.1 KiB
Python

import sys
import re
# Matches only genuine div tags: an opening "<div" must be followed by
# whitespace, "/" or ">" (so "<divider>" is ignored), and a closing tag may
# carry whitespace before ">" ("</div >"). HTML tag names are case-insensitive.
_DIV_TAG_RE = re.compile(r"(?P<close></div\s*>)|<div(?=[\s/>])", re.IGNORECASE)


def get_balanced_div(html, start_idx):
    """Locate the end of the balanced ``<div>...</div>`` starting at *start_idx*.

    Scans *html* from *start_idx*, counting opening and closing div tags, and
    stops at the closing tag that brings the nesting depth back to zero.

    Args:
        html: The markup to scan.
        start_idx: Index to start scanning from; normally the position of the
            opening ``<div`` whose matching ``</div>`` is wanted.

    Returns:
        A ``(start_idx, end)`` tuple where ``end`` is the index just past the
        matching closing tag, so ``html[start_idx:end]`` is the whole element.
        ``end`` is ``-1`` when no balanced element exists (no div tags, a
        missing closing tag, or a closing tag that appears before any opening
        tag).

    Note:
        This is a lightweight tag counter, not an HTML parser: div tags that
        appear inside comments, scripts or attribute values are counted too.
    """
    depth = 0
    for tag in _DIV_TAG_RE.finditer(html, start_idx):
        if tag.group("close") is None:
            depth += 1
            continue

        depth -= 1
        if depth == 0:
            return start_idx, tag.end()
        if depth < 0:
            # A closing tag with no opening tag before it: the markup from
            # start_idx is not a balanced element, so do not let a later,
            # unrelated "<div" bring the counter back to zero.
            break
    return start_idx, -1
# Load the template under test. The context manager closes the file handle
# promptly, and the explicit encoding keeps the result independent of the
# platform's locale default.
with open('app/modules/sag/templates/detail.html', encoding='utf-8') as template_file:
    html = template_file.read()
def extract_widget(html, data_module_name):
    """Cut the ``<div>`` carrying ``data-module="<data_module_name>"`` out of *html*.

    Args:
        html: The markup to search.
        data_module_name: Exact value of the ``data-module`` attribute that
            identifies the widget. It is matched literally, not as a regex.

    Returns:
        A ``(widget, remaining_html)`` tuple. ``widget`` is the full element
        (opening tag through its matching ``</div>``) and ``remaining_html``
        is *html* with that element removed. When no such div exists, or its
        closing tag cannot be found, the result is ``("", html)`` with *html*
        unchanged.
    """
    # re.escape keeps names containing regex metacharacters literal, and the
    # lookbehind stops the pattern from matching a longer attribute name that
    # merely ends in "data-module" (e.g. "x-data-module").
    opening_tag = re.compile(
        r'<div\s[^>]*(?<![\w-])data-module="'
        + re.escape(data_module_name)
        + r'"[^>]*>'
    )
    match = opening_tag.search(html)
    if match is None:
        return "", html

    start, end = get_balanced_div(html, match.start())
    if end == -1:
        # Unbalanced markup: slicing with -1 would silently return and remove
        # the wrong span, so report "not found" instead.
        return "", html

    return html[start:end], html[:start] + html[end:]
# Extract the assignment card.
# It has no data-module attribute, but it directly follows the `<!-- Assignment Card -->` comment.
def extract_by_comment(html, comment_str):
    """Cut out a marker comment together with the first ``<div>`` that follows it.

    Args:
        html: The markup to search.
        comment_str: The exact marker text to look for, e.g.
            ``'<!-- Assignment Card -->'``.

    Returns:
        A ``(widget, remaining_html)`` tuple. ``widget`` runs from the start
        of the marker through the matching ``</div>`` of the first div after
        it, and ``remaining_html`` is *html* with that span removed. When the
        marker is missing, no div follows it, or the div is never closed, the
        result is ``("", html)`` with *html* unchanged.
    """
    comment_start = html.find(comment_str)
    if comment_start == -1:
        return "", html

    # Start looking after the marker so that "<div" text inside the marker
    # itself is never mistaken for the widget's opening tag.
    div_start = html.find('<div', comment_start + len(comment_str))
    if div_start == -1:
        return "", html

    _, end = get_balanced_div(html, div_start)
    if end == -1:
        # Unbalanced markup: slicing with -1 would silently return and remove
        # the wrong span, so report "not found" instead.
        return "", html

    widget = html[comment_start:end]  # include the comment
    return widget, html[:comment_start] + html[end:]
def extract_block_by_id(html, id_name):
    """Cut the ``<div>`` whose ``id`` attribute equals *id_name* out of *html*.

    Args:
        html: The markup to search.
        id_name: Exact value of the ``id`` attribute. It is matched literally,
            not as a regex.

    Returns:
        A ``(widget, remaining_html)`` tuple. ``widget`` is the full element
        (opening tag through its matching ``</div>``) and ``remaining_html``
        is *html* with that element removed. When no such div exists, or its
        closing tag cannot be found, the result is ``("", html)`` with *html*
        unchanged.
    """
    # The lookbehind keeps attributes that merely end in "id" (data-id,
    # aria-rowid, ...) from matching; re.escape keeps the id literal.
    opening_tag = re.compile(
        r'<div\s[^>]*(?<![\w-])id="' + re.escape(id_name) + r'"[^>]*>'
    )
    match = opening_tag.search(html)
    if match is None:
        return "", html

    start, end = get_balanced_div(html, match.start())
    if end == -1:
        # Unbalanced markup: slicing with -1 would silently return and remove
        # the wrong span, so report "not found" instead.
        return "", html

    return html[start:end], html[:start] + html[end:]
# Test extractions
# Smoke-test the extractors against the loaded template. Every call is given
# the same module-level `html`; the remainder each extractor returns is
# discarded as `_`, so the three extractions do not affect one another.
# The extractors return "" when they find no match, so a printed length of 0
# indicates the marker or attribute was not found.

# The assignment card is located via its marker comment.
ass, _ = extract_by_comment(html, '<!-- Assignment Card -->')
print(f"Assignment widget len: {len(ass)}")
# The customer and reminder widgets are located via their data-module value.
cust, _ = extract_widget(html, "customers")
print(f"Customer widget len: {len(cust)}")
rem, _ = extract_widget(html, "reminders")
print(f"Reminders widget len: {len(rem)}")