# text_processor/src/core/domain/parsers.py

"""
Markdown Parsing Utilities - Domain Logic for Markdown Processing.

This module provides pragmatic Markdown parsing utilities using the marko library.
As a tolerated dependency, marko is acceptable within the domain layer for this
specific parsing task.
"""
from typing import List

import marko
from marko.block import BlockElement, Document as MarkoDocument, Heading
from marko.inline import InlineElement
from marko.md_renderer import MarkdownRenderer

from .models import DocumentSection


def parse_markdown(text: str) -> List[DocumentSection]:
    """
    Parse Markdown text into structured DocumentSection objects.

    The top-level nodes of the Markdown AST are walked in order. Every heading
    opens a new section, and the nodes that follow it are rendered back to
    Markdown and become that section's content. Text before the first heading
    is emitted as a section with ``title=None`` and ``level=0``. Sections
    whose content is empty (for example a heading immediately followed by
    another heading) are left out of the result.

    Args:
        text: Raw Markdown text to parse

    Returns:
        List of DocumentSection objects in document order; an empty list when
        ``text`` is empty or contains only whitespace

    Example:
        >>> markdown = "# Title\\n\\nContent here\\n## Section\\nMore content"
        >>> sections = parse_markdown(markdown)
        >>> len(sections)
        2
        >>> sections[0].title
        'Title'
        >>> sections[0].level
        1
    """
    if not text or not text.strip():
        return []

    # Parse the Markdown into an AST
    doc: MarkoDocument = marko.parse(text)

    # Render nodes back to Markdown (not HTML) so section content keeps its
    # source format.
    md_renderer = MarkdownRenderer()
    # Anchor the renderer to the whole document before rendering nodes one at
    # a time. marko's Renderer.render() adopts the first node it receives as
    # root_node when none is set, which here would be a single block rather
    # than the Document that carries document-level data such as link
    # reference definitions.
    md_renderer.root_node = doc

    sections: List[DocumentSection] = []
    heading_text: str | None = None
    heading_level: int = 0
    blocks: List[str] = []

    def flush_section() -> None:
        """Append the section accumulated so far, unless it has no content."""
        # Every entry in blocks is already stripped and non-empty, so joining
        # with a blank line yields the final content with no further trimming.
        content = "\n\n".join(blocks)
        if content:
            # NOTE(review): the preamble keeps title None rather than a
            # placeholder such as "Introduction" - confirm this is what
            # consumers of DocumentSection expect.
            sections.append(
                DocumentSection(
                    title=heading_text,
                    level=heading_level,
                    content=content,
                )
            )

    # Walk through all top-level children of the document
    for node in doc.children:
        if isinstance(node, Heading):
            # Close the previous section before opening the new one
            flush_section()
            heading_text = _extract_heading_text(node)
            heading_level = node.level
            blocks = []
            continue

        rendered = md_renderer.render(node).strip()
        if rendered:
            blocks.append(rendered)

    # Close the section that was still open at end of document
    flush_section()

    return sections


def _extract_heading_text(heading: Heading) -> str:
    """
    Collect the textual content of a Heading node as one stripped string.

    Each direct child contributes one fragment: a plain string is used as-is,
    a node exposing ``children`` is handed to ``_extract_text_recursive``, and
    anything else is converted with ``str()``. Fragments are concatenated in
    order without a separator.

    Args:
        heading: Heading AST node whose ``children`` are iterated

    Returns:
        The concatenated fragments with leading/trailing whitespace removed
    """

    def fragment_of(node) -> str:
        """Return the text contributed by a single direct child."""
        if isinstance(node, str):
            return node
        if hasattr(node, 'children'):
            return _extract_text_recursive(node)
        return str(node)

    return "".join(map(fragment_of, heading.children)).strip()


def _extract_text_recursive(element) -> str:
    """
    Recursively extract text from an AST element, preserving whitespace.

    Text is gathered depth-first through ``children``. A ``children`` value
    that is itself a string is taken whole as the node's text. No whitespace
    is trimmed here, so spaces at the boundary between sibling nodes survive
    concatenation; a caller that needs a trimmed result strips the final
    string itself.

    Args:
        element: AST element (or plain string) to extract text from

    Returns:
        Concatenated text content in document order, unstripped
    """
    if isinstance(element, str):
        return element

    if not hasattr(element, 'children'):
        # Node without children: fall back to its string form
        return str(element)

    children = element.children
    if isinstance(children, str):
        # The node stores its literal text directly; use it in one piece
        # rather than walking it character by character.
        return children

    # Trimming each fragment would glue neighbouring words together
    # ("Hello " + "bold" + " world"), so fragments are joined untouched.
    return "".join(_extract_text_recursive(child) for child in children)