Some fixes to the output text
This commit is contained in:
parent
90c10c79fa
commit
32ca394d91
@ -28,8 +28,7 @@ class ZipExtractor(IExtractor):
|
|||||||
1. Opens ZIP archives and filters for .md files
|
1. Opens ZIP archives and filters for .md files
|
||||||
2. Sorts files alphabetically for deterministic order
|
2. Sorts files alphabetically for deterministic order
|
||||||
3. Merges all Markdown files into a single document
|
3. Merges all Markdown files into a single document
|
||||||
4. Inserts file source headers between merged content
|
4. Handles corrupted files gracefully
|
||||||
5. Handles corrupted files gracefully
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
@ -149,7 +148,7 @@ class ZipExtractor(IExtractor):
|
|||||||
file_path: Path to ZIP file
|
file_path: Path to ZIP file
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Merged Markdown content with file source headers
|
Merged Markdown content
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
ExtractionError: If ZIP extraction fails
|
ExtractionError: If ZIP extraction fails
|
||||||
@ -172,9 +171,6 @@ class ZipExtractor(IExtractor):
|
|||||||
for md_file in md_files:
|
for md_file in md_files:
|
||||||
content = self._extract_file_content(zip_file, md_file)
|
content = self._extract_file_content(zip_file, md_file)
|
||||||
if content is not None:
|
if content is not None:
|
||||||
# Add file source header
|
|
||||||
header = f"\n\n# File Source: {md_file}\n\n"
|
|
||||||
merged_parts.append(header)
|
|
||||||
merged_parts.append(content)
|
merged_parts.append(content)
|
||||||
successful_extractions += 1
|
successful_extractions += 1
|
||||||
logger.debug(f"Successfully extracted: {md_file}")
|
logger.debug(f"Successfully extracted: {md_file}")
|
||||||
|
|||||||
@ -10,6 +10,7 @@ from typing import List
|
|||||||
import marko
|
import marko
|
||||||
from marko.block import BlockElement, Document as MarkoDocument, Heading
|
from marko.block import BlockElement, Document as MarkoDocument, Heading
|
||||||
from marko.inline import InlineElement
|
from marko.inline import InlineElement
|
||||||
|
from marko.md_renderer import MarkdownRenderer
|
||||||
|
|
||||||
from .models import DocumentSection
|
from .models import DocumentSection
|
||||||
|
|
||||||
@ -43,6 +44,9 @@ def parse_markdown(text: str) -> List[DocumentSection]:
|
|||||||
# Parse the Markdown into an AST
|
# Parse the Markdown into an AST
|
||||||
doc: MarkoDocument = marko.parse(text)
|
doc: MarkoDocument = marko.parse(text)
|
||||||
|
|
||||||
|
# Create markdown renderer to preserve markdown format
|
||||||
|
md_renderer = MarkdownRenderer()
|
||||||
|
|
||||||
sections: List[DocumentSection] = []
|
sections: List[DocumentSection] = []
|
||||||
current_heading: str | None = None
|
current_heading: str | None = None
|
||||||
current_level: int = 0
|
current_level: int = 0
|
||||||
@ -73,8 +77,8 @@ def parse_markdown(text: str) -> List[DocumentSection]:
|
|||||||
current_level = child.level
|
current_level = child.level
|
||||||
current_content_parts = []
|
current_content_parts = []
|
||||||
else:
|
else:
|
||||||
# Add content to current section
|
# Render content back to markdown format instead of HTML
|
||||||
rendered = marko.render(child).strip()
|
rendered = md_renderer.render(child).strip()
|
||||||
if rendered:
|
if rendered:
|
||||||
current_content_parts.append(rendered + "\n\n")
|
current_content_parts.append(rendered + "\n\n")
|
||||||
|
|
||||||
@ -97,10 +101,9 @@ def _extract_heading_text(heading: Heading) -> str:
|
|||||||
parts: List[str] = []
|
parts: List[str] = []
|
||||||
|
|
||||||
for child in heading.children:
|
for child in heading.children:
|
||||||
if isinstance(child, InlineElement):
|
if isinstance(child, str):
|
||||||
# Render the inline element to preserve formatting
|
# Direct string content
|
||||||
rendered = marko.render(child).strip()
|
parts.append(child)
|
||||||
parts.append(rendered)
|
|
||||||
elif hasattr(child, 'children'):
|
elif hasattr(child, 'children'):
|
||||||
# Recursively extract from nested elements
|
# Recursively extract from nested elements
|
||||||
parts.append(_extract_text_recursive(child))
|
parts.append(_extract_text_recursive(child))
|
||||||
@ -125,9 +128,8 @@ def _extract_text_recursive(element) -> str:
|
|||||||
|
|
||||||
if hasattr(element, 'children'):
|
if hasattr(element, 'children'):
|
||||||
for child in element.children:
|
for child in element.children:
|
||||||
if isinstance(child, (BlockElement, InlineElement)):
|
if isinstance(child, str):
|
||||||
rendered = marko.render(child).strip()
|
parts.append(child)
|
||||||
parts.append(rendered)
|
|
||||||
elif hasattr(child, 'children'):
|
elif hasattr(child, 'children'):
|
||||||
parts.append(_extract_text_recursive(child))
|
parts.append(_extract_text_recursive(child))
|
||||||
else:
|
else:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user