""" Markdown Parsing Utilities - Domain Logic for Markdown Processing. This module provides pragmatic Markdown parsing utilities using the marko library. As a tolerated dependency, marko is acceptable within the domain layer for this specific parsing task. """ from typing import List import marko from marko.block import BlockElement, Document as MarkoDocument, Heading from marko.inline import InlineElement from marko.md_renderer import MarkdownRenderer from .models import DocumentSection def parse_markdown(text: str) -> List[DocumentSection]: """ Parse Markdown text into structured DocumentSection objects. This function walks the Markdown AST and groups content under headers. Text before the first header is placed in an "Introduction" section. Args: text: Raw Markdown text to parse Returns: List of DocumentSection objects in document order Example: >>> markdown = "# Title\\n\\nContent here\\n## Section\\nMore content" >>> sections = parse_markdown(markdown) >>> len(sections) 2 >>> sections[0].title 'Title' >>> sections[0].level 1 """ if not text or not text.strip(): return [] # Parse the Markdown into an AST doc: MarkoDocument = marko.parse(text) # Create markdown renderer to preserve markdown format md_renderer = MarkdownRenderer() sections: List[DocumentSection] = [] current_heading: str | None = None current_level: int = 0 current_content_parts: List[str] = [] def finalize_section() -> None: """Helper to finalize and append the current section.""" if current_heading is not None or current_content_parts: content = "".join(current_content_parts).strip() if content: # Only add sections with actual content title = current_heading sections.append( DocumentSection( title=title, level=current_level, content=content, ) ) # Walk through all children of the document for child in doc.children: if isinstance(child, Heading): # Finalize previous section before starting new one finalize_section() # Start new section current_heading = _extract_heading_text(child) current_level = child.level current_content_parts = [] else: # Render content back to markdown format instead of HTML rendered = md_renderer.render(child).strip() if rendered: current_content_parts.append(rendered + "\n\n") # Finalize the last section finalize_section() return sections def _extract_heading_text(heading: Heading) -> str: """ Extract plain text from a Heading node. Args: heading: Heading AST node Returns: Plain text content of the heading """ parts: List[str] = [] for child in heading.children: if isinstance(child, str): # Direct string content parts.append(child) elif hasattr(child, 'children'): # Recursively extract from nested elements parts.append(_extract_text_recursive(child)) else: # Raw text parts.append(str(child)) return "".join(parts).strip() def _extract_text_recursive(element) -> str: """ Recursively extract text from an AST element. Args: element: AST element to extract text from Returns: Concatenated text content """ parts: List[str] = [] if hasattr(element, 'children'): for child in element.children: if isinstance(child, str): parts.append(child) elif hasattr(child, 'children'): parts.append(_extract_text_recursive(child)) else: parts.append(str(child)) else: parts.append(str(element)) return "".join(parts).strip()