Fix output text: drop file source headers from merged ZIP content and render sections as Markdown instead of HTML

This commit is contained in:
m.dabbagh 2026-01-18 20:05:41 +03:30
parent 90c10c79fa
commit 32ca394d91
2 changed files with 13 additions and 15 deletions

View File

@@ -28,8 +28,7 @@ class ZipExtractor(IExtractor):
1. Opens ZIP archives and filters for .md files 1. Opens ZIP archives and filters for .md files
2. Sorts files alphabetically for deterministic order 2. Sorts files alphabetically for deterministic order
3. Merges all Markdown files into a single document 3. Merges all Markdown files into a single document
4. Inserts file source headers between merged content 4. Handles corrupted files gracefully
5. Handles corrupted files gracefully
""" """
def __init__(self) -> None: def __init__(self) -> None:
@@ -149,7 +148,7 @@ class ZipExtractor(IExtractor):
file_path: Path to ZIP file file_path: Path to ZIP file
Returns: Returns:
Merged Markdown content with file source headers Merged Markdown content
Raises: Raises:
ExtractionError: If ZIP extraction fails ExtractionError: If ZIP extraction fails
@@ -172,9 +171,6 @@ class ZipExtractor(IExtractor):
for md_file in md_files: for md_file in md_files:
content = self._extract_file_content(zip_file, md_file) content = self._extract_file_content(zip_file, md_file)
if content is not None: if content is not None:
# Add file source header
header = f"\n\n# File Source: {md_file}\n\n"
merged_parts.append(header)
merged_parts.append(content) merged_parts.append(content)
successful_extractions += 1 successful_extractions += 1
logger.debug(f"Successfully extracted: {md_file}") logger.debug(f"Successfully extracted: {md_file}")

View File

@@ -10,6 +10,7 @@ from typing import List
import marko import marko
from marko.block import BlockElement, Document as MarkoDocument, Heading from marko.block import BlockElement, Document as MarkoDocument, Heading
from marko.inline import InlineElement from marko.inline import InlineElement
from marko.md_renderer import MarkdownRenderer
from .models import DocumentSection from .models import DocumentSection
@@ -43,6 +44,9 @@ def parse_markdown(text: str) -> List[DocumentSection]:
# Parse the Markdown into an AST # Parse the Markdown into an AST
doc: MarkoDocument = marko.parse(text) doc: MarkoDocument = marko.parse(text)
# Create markdown renderer to preserve markdown format
md_renderer = MarkdownRenderer()
sections: List[DocumentSection] = [] sections: List[DocumentSection] = []
current_heading: str | None = None current_heading: str | None = None
current_level: int = 0 current_level: int = 0
@@ -73,8 +77,8 @@ def parse_markdown(text: str) -> List[DocumentSection]:
current_level = child.level current_level = child.level
current_content_parts = [] current_content_parts = []
else: else:
# Add content to current section # Render content back to markdown format instead of HTML
rendered = marko.render(child).strip() rendered = md_renderer.render(child).strip()
if rendered: if rendered:
current_content_parts.append(rendered + "\n\n") current_content_parts.append(rendered + "\n\n")
@@ -97,10 +101,9 @@ def _extract_heading_text(heading: Heading) -> str:
parts: List[str] = [] parts: List[str] = []
for child in heading.children: for child in heading.children:
if isinstance(child, InlineElement): if isinstance(child, str):
# Render the inline element to preserve formatting # Direct string content
rendered = marko.render(child).strip() parts.append(child)
parts.append(rendered)
elif hasattr(child, 'children'): elif hasattr(child, 'children'):
# Recursively extract from nested elements # Recursively extract from nested elements
parts.append(_extract_text_recursive(child)) parts.append(_extract_text_recursive(child))
@@ -125,9 +128,8 @@ def _extract_text_recursive(element) -> str:
if hasattr(element, 'children'): if hasattr(element, 'children'):
for child in element.children: for child in element.children:
if isinstance(child, (BlockElement, InlineElement)): if isinstance(child, str):
rendered = marko.render(child).strip() parts.append(child)
parts.append(rendered)
elif hasattr(child, 'children'): elif hasattr(child, 'children'):
parts.append(_extract_text_recursive(child)) parts.append(_extract_text_recursive(child))
else: else: