2026-01-24 20:25:34 +03:30

141 lines
4.0 KiB
Python

"""
Markdown Parsing Utilities - Domain Logic for Markdown Processing.
This module provides pragmatic Markdown parsing utilities using the marko library.
As a tolerated dependency, marko is acceptable within the domain layer for this
specific parsing task.
"""
from typing import List
import marko
from marko.block import BlockElement, Document as MarkoDocument, Heading
from marko.inline import InlineElement
from marko.md_renderer import MarkdownRenderer
from .models import DocumentSection
def parse_markdown(text: str) -> List[DocumentSection]:
    """
    Split Markdown text into DocumentSection objects, one per heading.

    The text is parsed with marko and only the top-level nodes of the
    resulting document are inspected. Each heading opens a new section;
    every non-heading node that follows is rendered back to Markdown
    (not HTML) and becomes part of that section's content, with rendered
    nodes separated by one blank line.

    Sections whose content is empty are dropped, so a heading that is
    immediately followed by another heading (or by the end of the text)
    produces no section.

    Content that appears before the first heading is emitted as a section
    with ``title=None`` and ``level=0``.

    Args:
        text: Raw Markdown text to parse

    Returns:
        List of DocumentSection objects in document order; an empty list
        when ``text`` is empty or contains only whitespace.

    Example:
        >>> markdown = "# Title\\n\\nContent here\\n## Section\\nMore content"
        >>> sections = parse_markdown(markdown)
        >>> len(sections)
        2
        >>> sections[0].title
        'Title'
        >>> sections[0].level
        1
    """
    if not (text and text.strip()):
        return []

    document: MarkoDocument = marko.parse(text)
    # Render nodes back to Markdown so the section content keeps its markup.
    renderer = MarkdownRenderer()

    collected: List[DocumentSection] = []

    # State of the section currently being accumulated.
    # NOTE(review): content before the first heading is stored with
    # title=None — confirm DocumentSection accepts None, or whether a
    # placeholder title is expected by callers.
    open_title: str | None = None
    open_level: int = 0
    open_chunks: List[str] = []

    # The trailing None is an end-of-document sentinel: it closes the last
    # section through the same code path that a heading uses.
    for node in [*document.children, None]:
        starts_section = isinstance(node, Heading)

        if node is not None and not starts_section:
            chunk = renderer.render(node).strip()
            if chunk:
                open_chunks.append(chunk)
            continue

        # A heading or the sentinel: emit the open section if it has content.
        if open_chunks:
            collected.append(
                DocumentSection(
                    title=open_title,
                    level=open_level,
                    content="\n\n".join(open_chunks),
                )
            )

        if starts_section:
            open_title = _extract_heading_text(node)
            open_level = node.level
            open_chunks = []

    return collected
def _extract_heading_text(heading: Heading) -> str:
    """
    Return the plain text of a Heading node.

    Each direct child of the heading contributes one fragment:
    string children are used verbatim, children that expose a
    ``children`` attribute are flattened with ``_extract_text_recursive``,
    and anything else falls back to ``str()``. The fragments are
    concatenated without a separator and surrounding whitespace is
    stripped from the result.

    Args:
        heading: Heading AST node

    Returns:
        Plain text content of the heading
    """

    def fragment_of(node) -> str:
        """Map one direct child of the heading to its text fragment."""
        if isinstance(node, str):
            return node
        if hasattr(node, "children"):
            return _extract_text_recursive(node)
        return str(node)

    return "".join(fragment_of(node) for node in heading.children).strip()
def _extract_text_recursive(element) -> str:
    """
    Recursively collect the text contained in an AST element.

    Whitespace is preserved exactly: the text of nested nodes is
    concatenated as-is, so the space that separates one inline run from
    the next (for example plain text followed by emphasised text) is
    kept. No stripping is performed at any level; callers that want
    trimmed output should strip the final result once.

    Args:
        element: AST element to extract text from. Handled shapes are a
            plain string, an object whose ``children`` attribute is a
            string, an object whose ``children`` attribute is an iterable
            of further elements, and any other object (converted with
            ``str()``).

    Returns:
        Concatenated text content, without leading/trailing trimming
    """
    if isinstance(element, str):
        return element

    if not hasattr(element, "children"):
        # NOTE(review): for AST nodes that carry no ``children`` this
        # yields whatever str() produces for the object — confirm whether
        # such nodes should contribute whitespace or nothing instead.
        return str(element)

    children = element.children
    if isinstance(children, str):
        # Leaf node holding its text directly; return it whole rather than
        # walking it character by character.
        return children

    # Join without stripping so boundary whitespace between siblings survives.
    return "".join(_extract_text_recursive(child) for child in children)