From 32ca394d914a5ae07bcdb4e7580de9fefb6e2683 Mon Sep 17 00:00:00 2001 From: "m.dabbagh" Date: Sun, 18 Jan 2026 20:05:41 +0330 Subject: [PATCH] some fixes on the output text --- .../outgoing/extractors/zip_extractor.py | 8 ++------ src/core/domain/parsers.py | 20 ++++++++++--------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/adapters/outgoing/extractors/zip_extractor.py b/src/adapters/outgoing/extractors/zip_extractor.py index 205c798..6bc9141 100644 --- a/src/adapters/outgoing/extractors/zip_extractor.py +++ b/src/adapters/outgoing/extractors/zip_extractor.py @@ -28,8 +28,7 @@ class ZipExtractor(IExtractor): 1. Opens ZIP archives and filters for .md files 2. Sorts files alphabetically for deterministic order 3. Merges all Markdown files into a single document - 4. Inserts file source headers between merged content - 5. Handles corrupted files gracefully + 4. Handles corrupted files gracefully """ def __init__(self) -> None: @@ -149,7 +148,7 @@ class ZipExtractor(IExtractor): file_path: Path to ZIP file Returns: - Merged Markdown content with file source headers + Merged Markdown content Raises: ExtractionError: If ZIP extraction fails @@ -172,9 +171,6 @@ class ZipExtractor(IExtractor): for md_file in md_files: content = self._extract_file_content(zip_file, md_file) if content is not None: - # Add file source header - header = f"\n\n# File Source: {md_file}\n\n" - merged_parts.append(header) merged_parts.append(content) successful_extractions += 1 logger.debug(f"Successfully extracted: {md_file}") diff --git a/src/core/domain/parsers.py b/src/core/domain/parsers.py index 2b52bfb..ac24cd8 100644 --- a/src/core/domain/parsers.py +++ b/src/core/domain/parsers.py @@ -10,6 +10,7 @@ from typing import List import marko from marko.block import BlockElement, Document as MarkoDocument, Heading from marko.inline import InlineElement +from marko.md_renderer import MarkdownRenderer from .models import DocumentSection @@ -43,6 +44,9 @@ def parse_markdown(text: str) -> List[DocumentSection]: # Parse the Markdown into an AST doc: MarkoDocument = marko.parse(text) + # Create markdown renderer to preserve markdown format + md_renderer = MarkdownRenderer() + sections: List[DocumentSection] = [] current_heading: str | None = None current_level: int = 0 @@ -73,8 +77,8 @@ def parse_markdown(text: str) -> List[DocumentSection]: current_level = child.level current_content_parts = [] else: - # Add content to current section - rendered = marko.render(child).strip() + # Render content back to markdown format instead of HTML + rendered = md_renderer.render(child).strip() if rendered: current_content_parts.append(rendered + "\n\n") @@ -97,10 +101,9 @@ def _extract_heading_text(heading: Heading) -> str: parts: List[str] = [] for child in heading.children: - if isinstance(child, InlineElement): - # Render the inline element to preserve formatting - rendered = marko.render(child).strip() - parts.append(rendered) + if isinstance(child, str): + # Direct string content + parts.append(child) elif hasattr(child, 'children'): # Recursively extract from nested elements parts.append(_extract_text_recursive(child)) @@ -125,9 +128,8 @@ def _extract_text_recursive(element) -> str: if hasattr(element, 'children'): for child in element.children: - if isinstance(child, (BlockElement, InlineElement)): - rendered = marko.render(child).strip() - parts.append(rendered) + if isinstance(child, str): + parts.append(child) elif hasattr(child, 'children'): parts.append(_extract_text_recursive(child)) else: