Some fixes to the output text

This commit is contained in:
m.dabbagh 2026-01-18 20:05:41 +03:30
parent 90c10c79fa
commit 32ca394d91
2 changed files with 13 additions and 15 deletions

View File

@@ -28,8 +28,7 @@ class ZipExtractor(IExtractor):
1. Opens ZIP archives and filters for .md files
2. Sorts files alphabetically for deterministic order
3. Merges all Markdown files into a single document
- 4. Inserts file source headers between merged content
- 5. Handles corrupted files gracefully
+ 4. Handles corrupted files gracefully
"""
def __init__(self) -> None:
@@ -149,7 +148,7 @@ class ZipExtractor(IExtractor):
file_path: Path to ZIP file
Returns:
- Merged Markdown content with file source headers
+ Merged Markdown content
Raises:
ExtractionError: If ZIP extraction fails
@@ -172,9 +171,6 @@ class ZipExtractor(IExtractor):
for md_file in md_files:
content = self._extract_file_content(zip_file, md_file)
if content is not None:
- # Add file source header
- header = f"\n\n# File Source: {md_file}\n\n"
- merged_parts.append(header)
merged_parts.append(content)
successful_extractions += 1
logger.debug(f"Successfully extracted: {md_file}")

View File

@@ -10,6 +10,7 @@ from typing import List
import marko
from marko.block import BlockElement, Document as MarkoDocument, Heading
from marko.inline import InlineElement
+ from marko.md_renderer import MarkdownRenderer
from .models import DocumentSection
@@ -43,6 +44,9 @@ def parse_markdown(text: str) -> List[DocumentSection]:
# Parse the Markdown into an AST
doc: MarkoDocument = marko.parse(text)
+ # Create markdown renderer to preserve markdown format
+ md_renderer = MarkdownRenderer()
sections: List[DocumentSection] = []
current_heading: str | None = None
current_level: int = 0
@@ -73,8 +77,8 @@ def parse_markdown(text: str) -> List[DocumentSection]:
current_level = child.level
current_content_parts = []
else:
- # Add content to current section
- rendered = marko.render(child).strip()
+ # Render content back to markdown format instead of HTML
+ rendered = md_renderer.render(child).strip()
if rendered:
current_content_parts.append(rendered + "\n\n")
@@ -97,10 +101,9 @@ def _extract_heading_text(heading: Heading) -> str:
parts: List[str] = []
for child in heading.children:
- if isinstance(child, InlineElement):
- # Render the inline element to preserve formatting
- rendered = marko.render(child).strip()
- parts.append(rendered)
+ if isinstance(child, str):
+ # Direct string content
+ parts.append(child)
elif hasattr(child, 'children'):
# Recursively extract from nested elements
parts.append(_extract_text_recursive(child))
@@ -125,9 +128,8 @@ def _extract_text_recursive(element) -> str:
if hasattr(element, 'children'):
for child in element.children:
- if isinstance(child, (BlockElement, InlineElement)):
- rendered = marko.render(child).strip()
- parts.append(rendered)
+ if isinstance(child, str):
+ parts.append(child)
elif hasattr(child, 'children'):
parts.append(_extract_text_recursive(child))
else: