fix sorting and merging in zip extractor

This commit is contained in:
m.dabbagh 2026-01-19 14:00:17 +03:30
parent 6072bb188c
commit e2e1c86dd4

View File

@ -5,6 +5,7 @@ This adapter implements the IExtractor port for ZIP files containing
Markdown documents. It merges all .md files into a single document.
"""
import logging
import re
import zipfile
from pathlib import Path
from typing import List
@ -184,7 +185,7 @@ class ZipExtractor(IExtractor):
)
# Join all parts with proper spacing
return "".join(merged_parts).strip()
return "\n".join(merged_parts).strip()
except EmptyContentError:
raise
@ -234,11 +235,29 @@ class ZipExtractor(IExtractor):
if filename.lower().endswith('.md'):
md_files.append(filename)
# Sort alphabetically for deterministic order
md_files.sort()
# Sort using natural/numeric order (page_1, page_2, ..., page_10)
md_files.sort(key=self._natural_sort_key)
return md_files
def _natural_sort_key(self, filename: str):
    """
    Generate a natural sort key for proper numeric ordering.

    The filename is split into alternating text and digit runs. Digit
    runs are converted to integers and text runs are lowercased, so
    that comparison is numeric where it matters and case-insensitive
    elsewhere:
    - 'page_1.md' < 'page_2.md' < 'page_10.md'

    Args:
        filename: Filename to generate sort key for

    Returns:
        List of alternating strings and integers for natural sorting.
        The list always starts with a (possibly empty) string, so keys
        of different filenames compare element-wise without mixing
        str and int at the same position.
    """
    # With a capturing group, re.split places the captured digit runs at
    # the odd indices of the result and the text in between at the even
    # indices.
    chunks = re.split(r'(\d+)', filename)

    # Decide by position instead of str.isdigit(): isdigit() is also true
    # for characters such as superscript digits, which r'\d' does not
    # match and int() cannot parse (ValueError).
    return [
        int(chunk) if index % 2 else chunk.lower()
        for index, chunk in enumerate(chunks)
    ]
def _extract_file_content(
self,
zip_file: zipfile.ZipFile,