"""Extract widget ``<div>`` blocks from the SAG detail template.

Each ``extract_*`` helper locates the opening ``<div`` of a widget, finds its
matching ``</div>`` by counting nested divs, and returns the widget markup
together with the HTML that remains once the widget is cut out.
"""
import sys
import re

TEMPLATE_PATH = 'app/modules/sag/templates/detail.html'

# NOTE(review): these two literals were missing from the reviewed text and
# were reconstructed from the original "+ 4" / "+ 6" cursor offsets
# (len('<div') == 4, len('</div>') == 6) -- confirm against the real file.
_DIV_OPEN = '<div'
_DIV_CLOSE = '</div>'


def get_balanced_div(html, start_idx):
    """Find the end of the ``<div>`` element that starts at *start_idx*.

    Scans forward from *start_idx*, counting ``<div`` openings and ``</div>``
    closings. Matching on the bare ``<div`` prefix is what lets opening tags
    carry arbitrary attributes.

    Args:
        html: The markup to scan.
        start_idx: Index of the opening ``<div`` of the element.

    Returns:
        ``(start_idx, end)`` where ``end`` is the index just past the
        matching ``</div>``, or ``(start_idx, -1)`` when the markup ends
        before the element is balanced.
    """
    i = start_idx
    tag_count = 0
    while i < len(html):
        next_open = html.find(_DIV_OPEN, i)
        next_close = html.find(_DIV_CLOSE, i)
        if next_open == -1 and next_close == -1:
            break
        if next_open != -1 and (next_open < next_close or next_close == -1):
            tag_count += 1
            i = next_open + len(_DIV_OPEN)
        else:
            tag_count -= 1
            i = next_close + len(_DIV_CLOSE)
            # Only a closing tag can complete the element.
            if tag_count == 0:
                return start_idx, i
    return start_idx, -1


def _cut_div(html, div_start):
    """Cut the balanced div at *div_start* out of *html*.

    Returns ``(widget, remaining_html)``. When the div is never closed the
    input is returned untouched as ``("", html)``; slicing with the ``-1``
    sentinel would otherwise silently drop the last character and return
    garbage.
    """
    start, end = get_balanced_div(html, div_start)
    if end == -1:
        return "", html
    return html[start:end], html[:start] + html[end:]


def extract_widget(html, data_module_name):
    """Extract the first div whose ``data-module`` equals *data_module_name*.

    The name is matched literally (regex metacharacters are escaped).

    Returns:
        ``(widget, remaining_html)``, or ``("", html)`` when no such div
        exists or it is not balanced.
    """
    pattern = f'<div[^>]*data-module="{re.escape(data_module_name)}"[^>]*>'
    match = re.search(pattern, html)
    if not match:
        return "", html
    return _cut_div(html, match.start())


# Let's extract assignment card
# It does not have data-module, but we know it follows a marker comment.
def extract_by_comment(html, comment_str):
    """Extract the first div that follows the marker text *comment_str*.

    NOTE(review): everything after the ``html.find(`` call for the div was
    missing from the reviewed text; the remainder mirrors ``extract_widget``
    -- confirm against the real file.

    Returns:
        ``(widget, remaining_html)``, or ``("", html)`` when the marker is
        absent, no div follows it, or the div is not balanced.
    """
    c_start = html.find(comment_str)
    if c_start == -1:
        return "", html
    div_start = html.find(_DIV_OPEN, c_start)
    if div_start == -1:
        return "", html
    return _cut_div(html, div_start)


def extract_by_id(html, id_name):
    """Extract the first div whose ``id`` attribute equals *id_name*.

    NOTE(review): the ``def`` line of this function was missing from the
    reviewed text; the name and signature are inferred from the ``id_name``
    placeholder in its pattern -- confirm against the real file.

    Returns:
        ``(widget, remaining_html)``, or ``("", html)`` when no such div
        exists or it is not balanced.
    """
    pattern = f'<div[^>]*id="{re.escape(id_name)}"[^>]*>'
    match = re.search(pattern, html)
    if not match:
        return "", html
    return _cut_div(html, match.start())


def main():
    """Run the three test extractions against the template and print sizes."""
    with open(TEMPLATE_PATH) as f:
        html = f.read()

    # Test extractions
    # NOTE(review): the marker comment passed here was lost from the reviewed
    # text. With '' the search matches at index 0, so this picks up the first
    # div in the file -- restore the real marker before relying on the output.
    ass, _ = extract_by_comment(html, '')
    print(f"Assignment widget len: {len(ass)}")

    cust, _ = extract_widget(html, "customers")
    print(f"Customer widget len: {len(cust)}")

    rem, _ = extract_widget(html, "reminders")
    print(f"Reminders widget len: {len(rem)}")


if __name__ == "__main__":
    main()