fix sorting and merging in zip extractor
This commit is contained in:
parent
6072bb188c
commit
e2e1c86dd4
@ -5,6 +5,7 @@ This adapter implements the IExtractor port for ZIP files containing
|
|||||||
Markdown documents. It merges all .md files into a single document.
|
Markdown documents. It merges all .md files into a single document.
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
import zipfile
|
import zipfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
@ -184,7 +185,7 @@ class ZipExtractor(IExtractor):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Join all parts with proper spacing
|
# Join all parts with proper spacing
|
||||||
return "".join(merged_parts).strip()
|
return "\n".join(merged_parts).strip()
|
||||||
|
|
||||||
except EmptyContentError:
|
except EmptyContentError:
|
||||||
raise
|
raise
|
||||||
@ -234,11 +235,29 @@ class ZipExtractor(IExtractor):
|
|||||||
if filename.lower().endswith('.md'):
|
if filename.lower().endswith('.md'):
|
||||||
md_files.append(filename)
|
md_files.append(filename)
|
||||||
|
|
||||||
# Sort alphabetically for deterministic order
|
# Sort using natural/numeric order (page_1, page_2, ..., page_10)
|
||||||
md_files.sort()
|
md_files.sort(key=self._natural_sort_key)
|
||||||
|
|
||||||
return md_files
|
return md_files
|
||||||
|
|
||||||
|
def _natural_sort_key(self, filename: str):
|
||||||
|
"""
|
||||||
|
Generate a natural sort key for proper numeric ordering.
|
||||||
|
|
||||||
|
Converts numeric parts to integers for correct sorting:
|
||||||
|
- 'page_1.md' < 'page_2.md' < 'page_10.md'
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filename: Filename to generate sort key for
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of alternating strings and integers for natural sorting
|
||||||
|
"""
|
||||||
|
def convert(text):
|
||||||
|
return int(text) if text.isdigit() else text.lower()
|
||||||
|
|
||||||
|
return [convert(c) for c in re.split(r'(\d+)', filename)]
|
||||||
|
|
||||||
def _extract_file_content(
|
def _extract_file_content(
|
||||||
self,
|
self,
|
||||||
zip_file: zipfile.ZipFile,
|
zip_file: zipfile.ZipFile,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user