From e2e1c86dd4b0a5a7307a62aebc1b4e6abf1c37cd Mon Sep 17 00:00:00 2001
From: "m.dabbagh"
Date: Mon, 19 Jan 2026 14:00:17 +0330
Subject: [PATCH] fix sorting and merging in zip extractor

---
Review notes (ignored by `git am`):
- Line structure restored; indentation of the Python lines is reconstructed
  from the code structure -- verify against the target file before applying.
- _natural_sort_key uses str.isdecimal() instead of str.isdigit(), because
  isdigit() accepts characters that int() cannot parse.

 .../outgoing/extractors/zip_extractor.py | 25 ++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/src/adapters/outgoing/extractors/zip_extractor.py b/src/adapters/outgoing/extractors/zip_extractor.py
index 13e4253..c7c4048 100644
--- a/src/adapters/outgoing/extractors/zip_extractor.py
+++ b/src/adapters/outgoing/extractors/zip_extractor.py
@@ -5,6 +5,7 @@ This adapter implements the IExtractor port for ZIP files containing Markdown documents.
 It merges all .md files into a single document.
 """
 import logging
+import re
 import zipfile
 from pathlib import Path
 from typing import List
@@ -184,7 +185,7 @@ class ZipExtractor(IExtractor):
                 )
 
             # Join all parts with proper spacing
-            return "".join(merged_parts).strip()
+            return "\n".join(merged_parts).strip()
 
         except EmptyContentError:
             raise
@@ -234,11 +235,29 @@ class ZipExtractor(IExtractor):
             if filename.lower().endswith('.md'):
                 md_files.append(filename)
 
-        # Sort alphabetically for deterministic order
-        md_files.sort()
+        # Sort using natural/numeric order (page_1, page_2, ..., page_10)
+        md_files.sort(key=self._natural_sort_key)
 
         return md_files
 
+    def _natural_sort_key(self, filename: str):
+        """
+        Generate a natural sort key for proper numeric ordering.
+
+        Converts numeric parts to integers for correct sorting:
+        - 'page_1.md' < 'page_2.md' < 'page_10.md'
+
+        Args:
+            filename: Filename to generate sort key for
+
+        Returns:
+            List of alternating strings and integers for natural sorting
+        """
+        def convert(text):
+            # isdecimal(), not isdigit(): int() rejects e.g. superscript digits
+            return int(text) if text.isdecimal() else text.lower()
+        return [convert(c) for c in re.split(r'(\d+)', filename)]
+
     def _extract_file_content(
         self,
         zip_file: zipfile.ZipFile,