Some fixes to the output text
This commit is contained in:
parent
90c10c79fa
commit
32ca394d91
@ -28,8 +28,7 @@ class ZipExtractor(IExtractor):
|
||||
1. Opens ZIP archives and filters for .md files
|
||||
2. Sorts files alphabetically for deterministic order
|
||||
3. Merges all Markdown files into a single document
|
||||
4. Inserts file source headers between merged content
|
||||
5. Handles corrupted files gracefully
|
||||
4. Handles corrupted files gracefully
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
@ -149,7 +148,7 @@ class ZipExtractor(IExtractor):
|
||||
file_path: Path to ZIP file
|
||||
|
||||
Returns:
|
||||
Merged Markdown content with file source headers
|
||||
Merged Markdown content
|
||||
|
||||
Raises:
|
||||
ExtractionError: If ZIP extraction fails
|
||||
@ -172,9 +171,6 @@ class ZipExtractor(IExtractor):
|
||||
for md_file in md_files:
|
||||
content = self._extract_file_content(zip_file, md_file)
|
||||
if content is not None:
|
||||
# Add file source header
|
||||
header = f"\n\n# File Source: {md_file}\n\n"
|
||||
merged_parts.append(header)
|
||||
merged_parts.append(content)
|
||||
successful_extractions += 1
|
||||
logger.debug(f"Successfully extracted: {md_file}")
|
||||
|
||||
@ -10,6 +10,7 @@ from typing import List
|
||||
import marko
|
||||
from marko.block import BlockElement, Document as MarkoDocument, Heading
|
||||
from marko.inline import InlineElement
|
||||
from marko.md_renderer import MarkdownRenderer
|
||||
|
||||
from .models import DocumentSection
|
||||
|
||||
@ -43,6 +44,9 @@ def parse_markdown(text: str) -> List[DocumentSection]:
|
||||
# Parse the Markdown into an AST
|
||||
doc: MarkoDocument = marko.parse(text)
|
||||
|
||||
# Create markdown renderer to preserve markdown format
|
||||
md_renderer = MarkdownRenderer()
|
||||
|
||||
sections: List[DocumentSection] = []
|
||||
current_heading: str | None = None
|
||||
current_level: int = 0
|
||||
@ -73,8 +77,8 @@ def parse_markdown(text: str) -> List[DocumentSection]:
|
||||
current_level = child.level
|
||||
current_content_parts = []
|
||||
else:
|
||||
# Add content to current section
|
||||
rendered = marko.render(child).strip()
|
||||
# Render content back to markdown format instead of HTML
|
||||
rendered = md_renderer.render(child).strip()
|
||||
if rendered:
|
||||
current_content_parts.append(rendered + "\n\n")
|
||||
|
||||
@ -97,10 +101,9 @@ def _extract_heading_text(heading: Heading) -> str:
|
||||
parts: List[str] = []
|
||||
|
||||
for child in heading.children:
|
||||
if isinstance(child, InlineElement):
|
||||
# Render the inline element to preserve formatting
|
||||
rendered = marko.render(child).strip()
|
||||
parts.append(rendered)
|
||||
if isinstance(child, str):
|
||||
# Direct string content
|
||||
parts.append(child)
|
||||
elif hasattr(child, 'children'):
|
||||
# Recursively extract from nested elements
|
||||
parts.append(_extract_text_recursive(child))
|
||||
@ -125,9 +128,8 @@ def _extract_text_recursive(element) -> str:
|
||||
|
||||
if hasattr(element, 'children'):
|
||||
for child in element.children:
|
||||
if isinstance(child, (BlockElement, InlineElement)):
|
||||
rendered = marko.render(child).strip()
|
||||
parts.append(rendered)
|
||||
if isinstance(child, str):
|
||||
parts.append(child)
|
||||
elif hasattr(child, 'children'):
|
||||
parts.append(_extract_text_recursive(child))
|
||||
else:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user