# bmc_hub/parse_test.py

import sys
import re

# Matches either an opening ``<div`` or a closing ``</div>`` tag.
# The lookahead requires whitespace, ``/`` or ``>`` right after ``<div`` so that
# longer tag names (``<divider>``, ``<div-card>``) are not counted as divs; the
# closing form tolerates whitespace before ``>``. HTML tag names are
# case-insensitive, hence IGNORECASE.
_DIV_TAG_RE = re.compile(r'<div(?=[\s/>])|</div\s*>', re.IGNORECASE)


def get_balanced_div(html, start_idx):
    """Locate the end of the balanced ``<div>...</div>`` region at *start_idx*.

    Scans *html* from *start_idx* onwards, adding one to a nesting depth for
    every opening ``<div`` tag and subtracting one for every closing
    ``</div>`` tag.

    Args:
        html: The markup to scan.
        start_idx: Index at which scanning begins (normally the position of
            the opening ``<div`` whose extent is wanted).

    Returns:
        A ``(start_idx, end)`` tuple. ``end`` is the index just past the
        closing ``</div>`` that brings the depth back to zero, so
        ``html[start_idx:end]`` is the balanced region. ``end`` is ``-1``
        when no such closing tag exists.
    """
    depth = 0
    # A single left-to-right pass over every div tag at or after start_idx.
    for tag in _DIV_TAG_RE.finditer(html, start_idx):
        if tag.group().startswith('</'):
            depth -= 1
            if depth == 0:
                return start_idx, tag.end()
        else:
            depth += 1
    return start_idx, -1

# Load the template into memory. The path is relative to the current working
# directory. The context manager closes the file handle deterministically, and
# the explicit encoding avoids depending on the platform default.
with open('app/modules/sag/templates/detail.html', encoding='utf-8') as template_file:
    html = template_file.read()

def extract_widget(html, data_module_name):
    """Cut the ``<div>`` tagged ``data-module="<data_module_name>"`` out of *html*.

    Args:
        html: The markup to search.
        data_module_name: Exact value of the ``data-module`` attribute of the
            div to extract. It is matched literally (regex metacharacters in
            the name have no special meaning).

    Returns:
        A ``(widget, remaining_html)`` tuple: the extracted div (including
        its nested content) and *html* with that div removed. If no matching
        div exists, or its closing tag cannot be found, ``("", html)`` is
        returned with *html* unchanged.
    """
    # ``\s`` ensures we match the attribute itself rather than the tail of a
    # longer attribute name; re.escape keeps the name from being read as regex.
    pattern = rf'<div[^>]*\sdata-module="{re.escape(data_module_name)}"[^>]*>'
    match = re.search(pattern, html)
    if match is None:
        return "", html
    start, end = get_balanced_div(html, match.start())
    if end == -1:
        # Unbalanced markup: slicing with -1 would silently drop the last
        # character and corrupt both results, so report "not found" instead.
        return "", html
    return html[start:end], html[:start] + html[end:]

# Extract the assignment card.
# It has no data-module attribute, but we know it follows the
# `<!-- Assignment Card -->` comment.
def extract_by_comment(html, comment_str):
    """Cut the first ``<div>`` that follows *comment_str* out of *html*.

    Args:
        html: The markup to search.
        comment_str: Literal marker text (e.g. an HTML comment) that precedes
            the div to extract.

    Returns:
        A ``(widget, remaining_html)`` tuple. ``widget`` spans from the start
        of the marker through the end of the balanced div, so the marker is
        included; ``remaining_html`` is *html* with that span removed. If the
        marker is missing, no ``<div`` follows it, or the div is never
        closed, ``("", html)`` is returned with *html* unchanged.
    """
    c_start = html.find(comment_str)
    if c_start == -1:
        return "", html
    # Begin looking after the marker so that a "<div" appearing inside the
    # marker text itself is not mistaken for the widget.
    div_start = html.find('<div', c_start + len(comment_str))
    if div_start == -1:
        return "", html
    _, end = get_balanced_div(html, div_start)
    if end == -1:
        # Unbalanced markup: slicing with -1 would silently drop the last
        # character and corrupt both results, so report "not found" instead.
        return "", html
    return html[c_start:end], html[:c_start] + html[end:]

def extract_block_by_id(html, id_name):
    """Cut the ``<div>`` whose ``id`` attribute equals *id_name* out of *html*.

    Args:
        html: The markup to search.
        id_name: Exact value of the ``id`` attribute of the div to extract.
            It is matched literally (regex metacharacters in the name have no
            special meaning).

    Returns:
        A ``(widget, remaining_html)`` tuple: the extracted div (including
        its nested content) and *html* with that div removed. If no matching
        div exists, or its closing tag cannot be found, ``("", html)`` is
        returned with *html* unchanged.
    """
    # ``\s`` before ``id`` prevents attributes that merely end in "id"
    # (e.g. data-id="...") from matching; re.escape keeps the value literal.
    pattern = rf'<div[^>]*\sid="{re.escape(id_name)}"[^>]*>'
    match = re.search(pattern, html)
    if match is None:
        return "", html
    start, end = get_balanced_div(html, match.start())
    if end == -1:
        # Unbalanced markup: slicing with -1 would silently drop the last
        # character and corrupt both results, so report "not found" instead.
        return "", html
    return html[start:end], html[:start] + html[end:]

# Smoke checks: run each extractor against the loaded template and print the
# size of the fragment it returned. Each call works on the original `html`;
# the second element of every result (the remaining markup) is discarded.

# Assignment card, located via the HTML comment that precedes it.
ass, _ = extract_by_comment(html=html, comment_str='<!-- Assignment Card -->')
print(f"Assignment widget len: {len(ass)}")

# Customers widget, located via its data-module attribute.
cust, _ = extract_widget(html=html, data_module_name="customers")
print(f"Customer widget len: {len(cust)}")

# Reminders widget, located via its data-module attribute.
rem, _ = extract_widget(html=html, data_module_name="reminders")
print(f"Reminders widget len: {len(rem)}")