fix sorting and merging in zip extractor
This commit is contained in:
parent
6072bb188c
commit
e2e1c86dd4
@ -5,6 +5,7 @@ This adapter implements the IExtractor port for ZIP files containing
|
||||
Markdown documents. It merges all .md files into a single document.
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
@ -184,7 +185,7 @@ class ZipExtractor(IExtractor):
|
||||
)
|
||||
|
||||
# Join all parts with proper spacing
|
||||
return "".join(merged_parts).strip()
|
||||
return "\n".join(merged_parts).strip()
|
||||
|
||||
except EmptyContentError:
|
||||
raise
|
||||
@ -234,11 +235,29 @@ class ZipExtractor(IExtractor):
|
||||
if filename.lower().endswith('.md'):
|
||||
md_files.append(filename)
|
||||
|
||||
# Sort alphabetically for deterministic order
|
||||
md_files.sort()
|
||||
# Sort using natural/numeric order (page_1, page_2, ..., page_10)
|
||||
md_files.sort(key=self._natural_sort_key)
|
||||
|
||||
return md_files
|
||||
|
||||
def _natural_sort_key(self, filename: str):
|
||||
"""
|
||||
Generate a natural sort key for proper numeric ordering.
|
||||
|
||||
Converts numeric parts to integers for correct sorting:
|
||||
- 'page_1.md' < 'page_2.md' < 'page_10.md'
|
||||
|
||||
Args:
|
||||
filename: Filename to generate sort key for
|
||||
|
||||
Returns:
|
||||
List of alternating strings and integers for natural sorting
|
||||
"""
|
||||
def convert(text):
|
||||
return int(text) if text.isdigit() else text.lower()
|
||||
|
||||
return [convert(c) for c in re.split(r'(\d+)', filename)]
|
||||
|
||||
def _extract_file_content(
|
||||
self,
|
||||
zip_file: zipfile.ZipFile,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user