fix sorting and merging in zip extractor

2026-01-19 14:00:17 +03:30 · 2026-01-19 14:00:17 +03:30 · e2e1c86dd4
commit e2e1c86dd4
parent 6072bb188c
1 changed files with 22 additions and 3 deletions
--- a/src/adapters/outgoing/extractors/zip_extractor.py
+++ b/src/adapters/outgoing/extractors/zip_extractor.py
@ -5,6 +5,7 @@ This adapter implements the IExtractor port for ZIP files containing
 Markdown documents. It merges all .md files into a single document.
 """
 import logging
+import re
 import zipfile
 from pathlib import Path
 from typing import List
@ -184,7 +185,7 @@ class ZipExtractor(IExtractor):
                )

                # Join all parts with proper spacing
-                return "".join(merged_parts).strip()
+                return "\n".join(merged_parts).strip()

        except EmptyContentError:
            raise
@ -234,11 +235,29 @@ class ZipExtractor(IExtractor):
            if filename.lower().endswith('.md'):
                md_files.append(filename)

-        # Sort alphabetically for deterministic order
-        md_files.sort()
+        # Sort using natural/numeric order (page_1, page_2, ..., page_10)
+        md_files.sort(key=self._natural_sort_key)

        return md_files

+    def _natural_sort_key(self, filename: str):
+        """
+        Generate a natural sort key for proper numeric ordering.
+
+        Converts numeric parts to integers for correct sorting:
+        - 'page_1.md' < 'page_2.md' < 'page_10.md'
+
+        Args:
+            filename: Filename to generate sort key for
+
+        Returns:
+            List of alternating strings and integers for natural sorting
+        """
+        def convert(text):  # isdecimal() mirrors \d; isdigit() also accepts '²', which int() rejects
+            return int(text) if text.isdecimal() else text.lower()
+
+        return [convert(c) for c in re.split(r'(\d+)', filename)]
+
    def _extract_file_content(
        self,
        zip_file: zipfile.ZipFile,