141 lines
4.0 KiB
Python
141 lines
4.0 KiB
Python
"""
Markdown Parsing Utilities - Domain Logic for Markdown Processing.

This module provides pragmatic Markdown parsing utilities built on the marko
library. marko is a tolerated third-party dependency: it is accepted within
the domain layer for this specific parsing task.
"""
|
|
from typing import List
|
|
|
|
import marko
|
|
from marko.block import BlockElement, Document as MarkoDocument, Heading
|
|
from marko.inline import InlineElement
|
|
from marko.md_renderer import MarkdownRenderer
|
|
|
|
from .models import DocumentSection
|
|
|
|
|
|
def parse_markdown(text: str) -> List[DocumentSection]:
    """
    Parse Markdown text into structured DocumentSection objects.

    The text is parsed with marko and the document's top-level nodes are
    walked in order. Every top-level heading starts a new section; each
    following non-heading node is rendered back to Markdown and collected as
    that section's content until the next heading.

    Content that appears before the first heading becomes a section with
    ``title=None`` and ``level=0``. Sections without any content (for example
    a heading immediately followed by another heading) are omitted.

    Args:
        text: Raw Markdown text to parse

    Returns:
        List of DocumentSection objects in document order; an empty list
        when ``text`` is empty or contains only whitespace

    Example:
        >>> markdown = "# Title\\n\\nContent here\\n## Section\\nMore content"
        >>> sections = parse_markdown(markdown)
        >>> len(sections)
        2
        >>> sections[0].title
        'Title'
        >>> sections[0].level
        1
    """
    if not text or not text.strip():
        return []

    # Parse the Markdown into an AST
    doc: MarkoDocument = marko.parse(text)

    # Render content back to Markdown (not HTML) so the original format is kept
    md_renderer = MarkdownRenderer()
    # NOTE(review): the nodes below are rendered one by one rather than through
    # the document. marko's Renderer.render() adopts the first element it is
    # given as ``root_node``, and MarkdownRenderer appears to consult
    # ``root_node.link_ref_defs`` when rendering links, so the root is seeded
    # with the real document here. Confirm against the pinned marko version.
    md_renderer.root_node = doc

    sections: List[DocumentSection] = []
    heading_title: str | None = None  # None until the first heading is seen
    heading_level: int = 0  # 0 marks content that precedes any heading
    body_parts: List[str] = []

    def flush_section() -> None:
        """Append the section collected so far, unless it has no content."""
        if not body_parts:
            return
        sections.append(
            DocumentSection(
                title=heading_title,
                level=heading_level,
                # Every part is already stripped and non-empty, so joining
                # with a blank line needs no further trimming.
                content="\n\n".join(body_parts),
            )
        )

    # Walk through all top-level children of the document
    for node in doc.children:
        if isinstance(node, Heading):
            # Close the previous section before starting the new one
            flush_section()
            heading_title = _extract_heading_text(node)
            heading_level = node.level
            body_parts = []
            continue

        rendered = md_renderer.render(node).strip()
        if rendered:  # skip nodes that render to nothing, e.g. blank lines
            body_parts.append(rendered)

    # Close the last section
    flush_section()

    return sections
|
|
|
|
|
|
def _extract_heading_text(heading: Heading) -> str:
    """
    Extract plain text from a Heading node.

    The text of every node below the heading is concatenated in order.
    Whitespace is stripped from the final result only, so spaces that
    separate adjacent inline nodes (e.g. ``Hello *world*``) are preserved.

    Args:
        heading: Heading AST node

    Returns:
        Plain text content of the heading
    """

    def collect(node) -> str:
        """Return the unstripped text of ``node`` and everything below it."""
        if isinstance(node, str):
            # Direct string content
            return node
        if not hasattr(node, 'children'):
            # Leaf object without children: fall back to its string form
            return str(node)
        children = node.children
        if isinstance(children, str):
            # Text-carrying node whose ``children`` is the text itself
            return children
        # Container node: descend into nested elements
        return "".join(collect(child) for child in children)

    return "".join(collect(child) for child in heading.children).strip()
|
|
|
|
|
|
def _extract_text_recursive(element) -> str:
    """
    Recursively extract text from an AST element.

    Strings are taken verbatim, elements with a ``children`` attribute are
    descended into (a string ``children`` is used as the text itself), and
    any other object contributes ``str(obj)``. Whitespace is stripped once,
    from the final result; nested results are left untouched so spaces
    between sibling nodes are not lost.

    Args:
        element: AST element to extract text from

    Returns:
        Concatenated text content with leading/trailing whitespace removed
    """

    def collect(node) -> str:
        """Return the unstripped text of ``node`` and everything below it."""
        if isinstance(node, str):
            return node
        if not hasattr(node, 'children'):
            return str(node)
        children = node.children
        if isinstance(children, str):
            # Use the string as-is instead of iterating it character by character
            return children
        return "".join(collect(child) for child in children)

    return collect(element).strip()
|