add zip extractor adapter

2026-01-18 15:44:49 +03:30 · 2026-01-18 15:44:49 +03:30 · 13b887260f
commit 13b887260f
parent f06370e0b9
3 changed files with 300 additions and 1 deletions
--- a/src/adapters/incoming/api_routes.py
+++ b/src/adapters/incoming/api_routes.py
@ -423,7 +423,7 @@ async def health_check() -> HealthCheckResponse:
    return HealthCheckResponse(
        status="healthy",
        version="1.0.0",
-        supported_file_types=["pdf", "docx", "txt"],
+        supported_file_types=["pdf", "docx", "txt", "zip"],
        available_strategies=["fixed_size", "paragraph"],
    )

--- a/src/adapters/outgoing/extractors/zip_extractor.py
+++ b/src/adapters/outgoing/extractors/zip_extractor.py
@ -0,0 +1,297 @@
+"""
+ZIP Extractor - Concrete implementation for ZIP archive extraction.
+
+This adapter implements the IExtractor port for ZIP files containing
+Markdown documents. It merges all .md files into a single document.
+"""
+import logging
+import zipfile
+from pathlib import Path
+from typing import List
+
+from ....core.domain.exceptions import (
+    EmptyContentError,
+    ExtractionError,
+)
+from ....core.domain.models import Document, DocumentMetadata, SourceType
+from ....core.ports.outgoing.extractor import IExtractor
+
+
+logger = logging.getLogger(__name__)
+
+
+class ZipExtractor(IExtractor):
+    """
+    Concrete ZIP extractor for archives containing Markdown files.
+
+    This adapter:
+    1. Opens ZIP archives and filters for .md files
+    2. Sorts files alphabetically for deterministic order
+    3. Merges all Markdown files into a single document
+    4. Inserts file source headers between merged content
+    5. Skips members that cannot be read or decoded instead of failing
+       the whole archive
+    """
+
+    def __init__(self) -> None:
+        """Initialize ZIP extractor."""
+        self._supported_extensions = ['zip']
+        # Fallback chain tried in order for members without a UTF-16 BOM.
+        # 'utf-8-sig' decodes UTF-8 with or without a BOM (and drops the BOM);
+        # 'cp1252' must precede 'latin-1' because 'latin-1' accepts every
+        # byte sequence and would otherwise make later entries unreachable.
+        # UTF-16 is deliberately NOT in this list: without a BOM it "decodes"
+        # almost any even-length byte string into garbage. It is only tried
+        # when a BOM is present (see _extract_file_content).
+        self._encodings = ['utf-8-sig', 'cp1252', 'latin-1']
+        logger.debug("ZipExtractor initialized")
+
+    def extract(self, file_path: Path) -> Document:
+        """
+        Extract and merge Markdown files from ZIP archive.
+
+        Args:
+            file_path: Path to the ZIP file
+
+        Returns:
+            Document entity with merged content and metadata
+
+        Raises:
+            ExtractionError: If extraction fails
+            EmptyContentError: If no Markdown files could be extracted
+        """
+        try:
+            logger.info(f"Extracting Markdown files from ZIP: {file_path}")
+
+            # Validate file
+            self._validate_file(file_path)
+
+            # Extract and merge markdown files
+            merged_text = self._extract_and_merge_markdown(file_path)
+
+            # Validate content
+            if not merged_text or not merged_text.strip():
+                raise EmptyContentError(file_path=str(file_path))
+
+            # Create metadata
+            metadata = self._create_metadata(file_path)
+
+            # Build document with raw_markdown
+            document = Document(raw_markdown=merged_text, metadata=metadata)
+
+            logger.info(
+                f"Successfully extracted {len(merged_text)} characters from {file_path.name}"
+            )
+            return document
+
+        except (EmptyContentError, ExtractionError):
+            # Domain errors already carry their own context; pass them through
+            # untouched instead of re-wrapping them below.
+            raise
+        except Exception as e:
+            logger.error(f"ZIP extraction failed for {file_path}: {str(e)}")
+            # Chain the cause so the original traceback is preserved.
+            raise ExtractionError(
+                message=f"Failed to extract Markdown from {file_path.name}",
+                details=str(e),
+                file_path=str(file_path),
+            ) from e
+
+    def supports_file_type(self, file_extension: str) -> bool:
+        """
+        Check if this extractor supports ZIP files.
+
+        Args:
+            file_extension: File extension (e.g., 'zip')
+
+        Returns:
+            True if ZIP files are supported
+        """
+        return file_extension.lower() in self._supported_extensions
+
+    def get_supported_types(self) -> List[str]:
+        """
+        Get list of supported file extensions.
+
+        Returns:
+            List containing 'zip' (a copy, so callers cannot mutate state)
+        """
+        return self._supported_extensions.copy()
+
+    def _validate_file(self, file_path: Path) -> None:
+        """
+        Validate file exists and is a valid ZIP archive.
+
+        Args:
+            file_path: Path to validate
+
+        Raises:
+            ExtractionError: If the path is missing, is not a regular file,
+                or is not a valid ZIP archive
+            EmptyContentError: If the file is zero bytes long
+        """
+        if not file_path.exists():
+            raise ExtractionError(
+                message=f"File not found: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if not file_path.is_file():
+            raise ExtractionError(
+                message=f"Path is not a file: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if file_path.stat().st_size == 0:
+            raise EmptyContentError(file_path=str(file_path))
+
+        # Validate it's a valid ZIP file
+        if not zipfile.is_zipfile(file_path):
+            raise ExtractionError(
+                message=f"File is not a valid ZIP archive: {file_path}",
+                file_path=str(file_path),
+            )
+
+    def _extract_and_merge_markdown(self, file_path: Path) -> str:
+        """
+        Extract all Markdown files from ZIP and merge into single string.
+
+        Args:
+            file_path: Path to ZIP file
+
+        Returns:
+            Merged Markdown content with file source headers, with leading
+            and trailing whitespace stripped
+
+        Raises:
+            EmptyContentError: If the archive has no .md members or none of
+                them could be read and decoded
+            ExtractionError: If ZIP extraction fails
+        """
+        try:
+            with zipfile.ZipFile(file_path, 'r') as zip_file:
+                # Get all markdown files, sorted alphabetically
+                md_files = self._get_markdown_files(zip_file)
+
+                if not md_files:
+                    logger.warning(f"No .md files found in ZIP archive: {file_path}")
+                    raise EmptyContentError(file_path=str(file_path))
+
+                logger.info(f"Found {len(md_files)} Markdown files in ZIP")
+
+                # Merge all files
+                merged_parts = []
+                successful_extractions = 0
+
+                for md_file in md_files:
+                    content = self._extract_file_content(zip_file, md_file)
+                    if content is None:
+                        # Unreadable/undecodable member: already logged, skip.
+                        continue
+
+                    # Add file source header
+                    merged_parts.append(f"\n\n# File Source: {md_file}\n\n")
+                    merged_parts.append(content)
+                    successful_extractions += 1
+                    logger.debug(f"Successfully extracted: {md_file}")
+
+                if successful_extractions == 0:
+                    logger.warning(f"Failed to extract any valid Markdown files from ZIP: {file_path}")
+                    raise EmptyContentError(file_path=str(file_path))
+
+                logger.info(
+                    f"Successfully merged {successful_extractions}/{len(md_files)} files"
+                )
+
+                # Join all parts; strip() removes the blank lines that the
+                # first header would otherwise put at the top of the document.
+                return "".join(merged_parts).strip()
+
+        except EmptyContentError:
+            raise
+        except zipfile.BadZipFile as e:
+            raise ExtractionError(
+                message=f"Corrupted ZIP file: {file_path}",
+                details=str(e),
+                file_path=str(file_path),
+            ) from e
+        except Exception as e:
+            raise ExtractionError(
+                message=f"ZIP extraction failed: {str(e)}",
+                file_path=str(file_path),
+            ) from e
+
+    def _get_markdown_files(self, zip_file: zipfile.ZipFile) -> List[str]:
+        """
+        Get sorted list of Markdown files from ZIP, filtering hidden files.
+
+        A member is skipped when it is a directory entry, or when any
+        component of its path starts with '.' or '__' (e.g. '.git/...',
+        '__MACOSX/...').
+
+        Args:
+            zip_file: Open ZipFile object
+
+        Returns:
+            Sorted list of Markdown file paths
+        """
+        md_files = []
+
+        for file_info in zip_file.infolist():
+            filename = file_info.filename
+
+            # Skip directories
+            if filename.endswith('/'):
+                continue
+
+            # Skip hidden files and __MACOSX
+            path_parts = Path(filename).parts
+            if any(part.startswith(('.', '__')) for part in path_parts):
+                logger.debug(f"Skipping hidden/system file: {filename}")
+                continue
+
+            # Check for .md extension (case-insensitive)
+            if filename.lower().endswith('.md'):
+                md_files.append(filename)
+
+        # Sort alphabetically for deterministic order
+        md_files.sort()
+
+        return md_files
+
+    def _extract_file_content(
+        self,
+        zip_file: zipfile.ZipFile,
+        filename: str,
+    ) -> str | None:
+        """
+        Extract content from a single file in the ZIP with encoding detection.
+
+        Decoding strategy: if the bytes start with a UTF-16 byte-order mark,
+        'utf-16' is tried first; then each entry of ``self._encodings`` is
+        tried in order. The first encoding that decodes without error wins.
+
+        Args:
+            zip_file: Open ZipFile object
+            filename: Name of file to extract
+
+        Returns:
+            File content as string, or None if extraction fails
+        """
+        try:
+            # Read raw bytes
+            # NOTE(review): read() loads the whole decompressed member into
+            # memory; if archives come from untrusted uploads, consider
+            # capping ZipInfo.file_size before reading - confirm with callers.
+            raw_content = zip_file.read(filename)
+
+            # Only trust UTF-16 when a BOM says so; a BOM-less utf-16 decode
+            # succeeds on arbitrary 8-bit text and yields garbage.
+            candidates = list(self._encodings)
+            if raw_content.startswith((b'\xff\xfe', b'\xfe\xff')):
+                candidates.insert(0, 'utf-16')
+
+            # Try multiple encodings
+            for encoding in candidates:
+                try:
+                    text = raw_content.decode(encoding)
+                except UnicodeDecodeError:
+                    continue
+                logger.debug(f"Decoded {filename} with {encoding}")
+                return text
+
+            # If all encodings fail, log warning and skip
+            logger.warning(
+                f"Failed to decode {filename} with any supported encoding, skipping"
+            )
+            return None
+
+        except Exception as e:
+            # Deliberate best-effort: one bad member (corrupt, encrypted,
+            # unsupported compression, ...) must not abort the other files.
+            logger.warning(f"Error extracting {filename}: {str(e)}, skipping")
+            return None
+
+    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
+        """
+        Create source-neutral document metadata from ZIP file.
+
+        Args:
+            file_path: Path to the ZIP file
+
+        Returns:
+            DocumentMetadata entity describing the archive itself (absolute
+            path as source id, file name as display name, size on disk)
+        """
+        stat = file_path.stat()
+
+        return DocumentMetadata(
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            display_name=file_path.name,
+            size_bytes=stat.st_size,
+        )
--- a/src/bootstrap.py
+++ b/src/bootstrap.py
@ -17,6 +17,7 @@ from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
 from .adapters.outgoing.extractors.factory import ExtractorFactory
 from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
 from .adapters.outgoing.extractors.txt_extractor import TxtExtractor
+from .adapters.outgoing.extractors.zip_extractor import ZipExtractor
 from .adapters.outgoing.persistence.in_memory_repository import (
    InMemoryDocumentRepository,
 )
@ -99,6 +100,7 @@ class ApplicationContainer:
        factory.register_extractor(PDFExtractor())
        factory.register_extractor(DocxExtractor())
        factory.register_extractor(TxtExtractor())
+        factory.register_extractor(ZipExtractor())

        logger.info(
            f"Registered extractors for: {factory.get_supported_types()}"