Fix output text: drop file source headers from merged ZIP output and render section content as Markdown instead of HTML

2026-01-18 20:05:41 +03:30 · 2026-01-18 20:05:41 +03:30 · 32ca394d91
commit 32ca394d91
parent 90c10c79fa
2 changed files with 13 additions and 15 deletions
--- a/src/adapters/outgoing/extractors/zip_extractor.py
+++ b/src/adapters/outgoing/extractors/zip_extractor.py
@@ -28,8 +28,7 @@ class ZipExtractor(IExtractor):
    1. Opens ZIP archives and filters for .md files
    2. Sorts files alphabetically for deterministic order
    3. Merges all Markdown files into a single document
-    4. Inserts file source headers between merged content
+    4. Handles corrupted files gracefully
-    5. Handles corrupted files gracefully
    """
    def __init__(self) -> None:
@@ -149,7 +148,7 @@ class ZipExtractor(IExtractor):
            file_path: Path to ZIP file
        Returns:
-            Merged Markdown content with file source headers
+            Merged Markdown content
        Raises:
            ExtractionError: If ZIP extraction fails
@@ -172,9 +171,6 @@ class ZipExtractor(IExtractor):
                for md_file in md_files:
                    content = self._extract_file_content(zip_file, md_file)
                    if content is not None:
-                        # Add file source header
-                        header = f"\n\n# File Source: {md_file}\n\n"
-                        merged_parts.append(header)
                        merged_parts.append(content)
                        successful_extractions += 1
                        logger.debug(f"Successfully extracted: {md_file}")
--- a/src/core/domain/parsers.py
+++ b/src/core/domain/parsers.py
@@ -10,6 +10,7 @@ from typing import List
 import marko
 from marko.block import BlockElement, Document as MarkoDocument, Heading
 from marko.inline import InlineElement
+from marko.md_renderer import MarkdownRenderer
 from .models import DocumentSection
@@ -43,6 +44,9 @@ def parse_markdown(text: str) -> List[DocumentSection]:
    # Parse the Markdown into an AST
    doc: MarkoDocument = marko.parse(text)
+    # Create markdown renderer to preserve markdown format
+    md_renderer = MarkdownRenderer()
    sections: List[DocumentSection] = []
    current_heading: str | None = None
    current_level: int = 0
@@ -73,8 +77,8 @@ def parse_markdown(text: str) -> List[DocumentSection]:
            current_level = child.level
            current_content_parts = []
        else:
-            # Add content to current section
+            # Render content back to markdown format instead of HTML
-            rendered = marko.render(child).strip()
+            rendered = md_renderer.render(child).strip()
            if rendered:
                current_content_parts.append(rendered + "\n\n")
@@ -97,10 +101,9 @@ def _extract_heading_text(heading: Heading) -> str:
    parts: List[str] = []
    for child in heading.children:
-        if isinstance(child, InlineElement):
+        if isinstance(child, str):
-            # Render the inline element to preserve formatting
+            # Direct string content
-            rendered = marko.render(child).strip()
+            parts.append(child)
-            parts.append(rendered)
        elif hasattr(child, 'children'):
            # Recursively extract from nested elements
            parts.append(_extract_text_recursive(child))
@@ -125,9 +128,8 @@ def _extract_text_recursive(element) -> str:
    if hasattr(element, 'children'):
        for child in element.children:
-            if isinstance(child, (BlockElement, InlineElement)):
+            if isinstance(child, str):
-                rendered = marko.render(child).strip()
+                parts.append(child)
-                parts.append(rendered)
            elif hasattr(child, 'children'):
                parts.append(_extract_text_recursive(child))
            else: