feat: add pptx_extractor and html_extractor

2026-01-31 18:23:04 +03:30 · 2026-01-31 18:23:04 +03:30 · b57792eb41
commit b57792eb41
parent b53f8c47d3
4 changed files with 340 additions and 5 deletions
--- a/src/adapters/incoming/api_routes.py
+++ b/src/adapters/incoming/api_routes.py
@ -283,14 +283,14 @@ async def perform_chunking(
    description="Upload a file and extract text content with metadata",
 )
 async def extract_document(
-    file: UploadFile = File(..., description="Document file to extract (pdf, docx, txt, md, zip)"),
+    file: UploadFile = File(..., description="Document file to extract (pdf, docx, pptx, html, txt, md, xlsx, zip)"),
    service: ITextProcessor = Depends(get_service),
 ) -> DocumentResponse:
    """
    Extract text content from uploaded file.

    This endpoint handles file extraction only:
-    1. Accepts file upload (PDF, DOCX, TXT, MD, ZIP)
+    1. Accepts file upload (PDF, DOCX, PPTX, HTML, TXT, MD, XLSX, ZIP)
    2. Extracts raw text content using appropriate extractor
    3. Returns Document entity with metadata (no parsing)
    """
@ -312,7 +312,7 @@ async def extract_document(
    description="Upload a file, extract text, parse markdown, and return chunks",
 )
 async def process_file(
-    file: UploadFile = File(..., description="Document file to process (pdf, docx, txt, md, zip)"),
+    file: UploadFile = File(..., description="Document file to process (pdf, docx, pptx, html, txt, md, xlsx, zip)"),
    strategy: ChunkingStrategy = Depends(get_chunking_strategy),
    service: ITextProcessor = Depends(get_service),
 ) -> ChunkListResponse:
@ -320,7 +320,7 @@ async def process_file(
    Complete file processing pipeline: Upload → Extract → Parse → Chunk.

    This endpoint handles the full document processing workflow:
-    1. Accepts file upload (PDF, DOCX, TXT, MD, ZIP)
+    1. Accepts file upload (PDF, DOCX, PPTX, HTML, TXT, MD, XLSX, ZIP)
    2. Extracts text content using appropriate extractor
    3. Parses markdown structure into sections
    4. Chunks content according to strategy
@ -351,7 +351,7 @@ async def health_check() -> HealthCheckResponse:
    return HealthCheckResponse(
        status="healthy",
        version="1.0.0",
-        supported_file_types=["pdf", "docx", "txt", "md", "markdown", "zip"],
+        supported_file_types=["pdf", "docx", "pptx", "html", "htm", "txt", "md", "markdown", "zip", "xlsx"],
        available_strategies=["fixed_size", "paragraph"],
    )

--- a/src/adapters/outgoing/extractors/html_extractor.py
+++ b/src/adapters/outgoing/extractors/html_extractor.py
@ -0,0 +1,158 @@
+"""
+HTML Extractor - Concrete implementation for HTML text extraction.
+
+This adapter implements the IExtractor port using the Docling library.
+It maps Docling exceptions to domain exceptions.
+"""
+import logging
+from pathlib import Path
+from typing import List
+
+from docling.datamodel.base_models import InputFormat
+from docling.document_converter import DocumentConverter
+
+from ....core.domain.exceptions import (
+    EmptyContentError,
+    ExtractionError,
+)
+from ....core.domain.models import Document, DocumentMetadata, SourceType
+from ....core.ports.outgoing.extractor import IExtractor
+
+
+logger = logging.getLogger(__name__)
+
+
+class HTMLExtractor(IExtractor):
+    """
+    Concrete HTML extractor backed by Docling.
+
+    This adapter:
+    1. Converts HTML files with Docling's DocumentConverter
+    2. Exports the converted document as Markdown
+    3. Wraps unexpected failures in a chained ExtractionError
+    """
+
+    def __init__(self) -> None:
+        """Initialize HTML extractor with Docling converter."""
+        self._supported_extensions = ['html', 'htm']
+        self._converter = DocumentConverter()
+        logger.info("HTML Extractor initialized with Docling DocumentConverter")
+
+    def extract(self, file_path: Path) -> Document:
+        """
+        Extract text and metadata from HTML file using Docling.
+
+        Args:
+            file_path: Path to the HTML file
+
+        Returns:
+            Document titled with the file stem, holding the exported Markdown
+
+        Raises:
+            ExtractionError: If validation or conversion fails
+            EmptyContentError: If the file or the extracted text is empty
+        """
+        try:
+            logger.info("Extracting text from HTML: %s", file_path)
+
+            # Validate file
+            self._validate_file(file_path)
+
+            # Convert HTML to markdown using Docling
+            result = self._converter.convert(str(file_path))
+            markdown_text = result.document.export_to_markdown()
+
+            # Validate content
+            if not markdown_text or not markdown_text.strip():
+                raise EmptyContentError(file_path=str(file_path))
+
+            # Create metadata
+            metadata = self._create_metadata(file_path)
+
+            # Build document with raw_markdown
+            document = Document(
+                raw_markdown=markdown_text,
+                title=file_path.stem,
+                metadata=metadata
+            )
+
+            logger.info(
+                "Successfully extracted %d characters from %s", len(markdown_text), file_path.name
+            )
+            return document
+
+        except (EmptyContentError, ExtractionError):
+            # Already domain exceptions: propagate as-is instead of re-wrapping.
+            raise
+        except Exception as e:
+            logger.error("HTML extraction failed for %s: %s", file_path, e)
+            raise ExtractionError(
+                message=f"Failed to extract text from {file_path.name}",
+                details=str(e),
+                file_path=str(file_path),
+            ) from e
+
+    def supports_file_type(self, file_extension: str) -> bool:
+        """
+        Check if this extractor supports a given file type.
+
+        Args:
+            file_extension: File extension (e.g., 'html', 'htm')
+
+        Returns:
+            True if the lower-cased extension is 'html' or 'htm'
+        """
+        return file_extension.lower() in self._supported_extensions
+
+    def get_supported_types(self) -> List[str]:
+        """
+        Get list of supported file extensions.
+
+        Returns:
+            A new list containing 'html' and 'htm' (safe for callers to mutate)
+        """
+        return self._supported_extensions.copy()
+
+    def _validate_file(self, file_path: Path) -> None:
+        """
+        Validate that the path exists, is a regular file, and is non-empty.
+
+        Args:
+            file_path: Path to validate
+
+        Raises:
+            ExtractionError: If the path does not exist or is not a file
+            EmptyContentError: If the file is zero bytes long
+        """
+        if not file_path.exists():
+            raise ExtractionError(
+                message=f"File not found: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if not file_path.is_file():
+            raise ExtractionError(
+                message=f"Path is not a file: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if file_path.stat().st_size == 0:
+            raise EmptyContentError(file_path=str(file_path))
+
+    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
+        """
+        Build metadata (absolute path, FILE source type, byte size) for the file.
+
+        Args:
+            file_path: Path to the HTML file
+
+        Returns:
+            DocumentMetadata entity
+        """
+        stat = file_path.stat()
+
+        return DocumentMetadata(
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            size_bytes=stat.st_size,
+        )
--- a/src/adapters/outgoing/extractors/pptx_extractor.py
+++ b/src/adapters/outgoing/extractors/pptx_extractor.py
@ -0,0 +1,173 @@
+"""
+PPTX Extractor - Concrete implementation for PowerPoint text extraction.
+
+This adapter implements the IExtractor port using the Docling library.
+It maps Docling exceptions to domain exceptions.
+"""
+import logging
+from pathlib import Path
+from typing import List
+
+from docling.datamodel.base_models import InputFormat
+from docling.document_converter import DocumentConverter
+
+from ....core.domain.exceptions import (
+    EmptyContentError,
+    ExtractionError,
+)
+from ....core.domain.models import Document, DocumentMetadata, SourceType
+from ....core.ports.outgoing.extractor import IExtractor
+
+
+logger = logging.getLogger(__name__)
+
+
+class PPTXExtractor(IExtractor):
+    """
+    Concrete PPTX extractor backed by Docling.
+
+    This adapter:
+    1. Converts PowerPoint files with Docling's DocumentConverter
+    2. Exports the converted document as Markdown
+    3. Records the slide count (Docling page count) in extra_metadata
+    4. Wraps unexpected failures in a chained ExtractionError
+    """
+
+    def __init__(self) -> None:
+        """Initialize PPTX extractor with Docling converter."""
+        self._supported_extensions = ['pptx']
+        self._converter = DocumentConverter()
+        logger.info("PPTX Extractor initialized with Docling DocumentConverter")
+
+    def extract(self, file_path: Path) -> Document:
+        """
+        Extract text and metadata from PPTX file using Docling.
+
+        Args:
+            file_path: Path to the PPTX file
+
+        Returns:
+            Document titled with the file stem, holding the exported Markdown
+
+        Raises:
+            ExtractionError: If validation or conversion fails
+            EmptyContentError: If the file or the extracted text is empty
+        """
+        try:
+            logger.info("Extracting text from PPTX: %s", file_path)
+
+            # Validate file
+            self._validate_file(file_path)
+
+            # Convert PPTX to markdown using Docling
+            result = self._converter.convert(str(file_path))
+            markdown_text = result.document.export_to_markdown()
+
+            # Validate content
+            if not markdown_text or not markdown_text.strip():
+                raise EmptyContentError(file_path=str(file_path))
+
+            # Create metadata with slide count from Docling result
+            metadata = self._create_metadata(file_path, result)
+
+            # Build document with raw_markdown
+            document = Document(
+                raw_markdown=markdown_text,
+                title=file_path.stem,
+                metadata=metadata
+            )
+
+            logger.info(
+                "Successfully extracted %d characters from %s", len(markdown_text), file_path.name
+            )
+            return document
+
+        except (EmptyContentError, ExtractionError):
+            # Already domain exceptions: propagate as-is instead of re-wrapping.
+            raise
+        except Exception as e:
+            logger.error("PPTX extraction failed for %s: %s", file_path, e)
+            raise ExtractionError(
+                message=f"Failed to extract text from {file_path.name}",
+                details=str(e),
+                file_path=str(file_path),
+            ) from e
+
+    def supports_file_type(self, file_extension: str) -> bool:
+        """
+        Check if this extractor supports a given file type.
+
+        Args:
+            file_extension: File extension (e.g., 'pptx')
+
+        Returns:
+            True if the lower-cased extension is 'pptx'
+        """
+        return file_extension.lower() in self._supported_extensions
+
+    def get_supported_types(self) -> List[str]:
+        """
+        Get list of supported file extensions.
+
+        Returns:
+            A new list containing 'pptx' (safe for callers to mutate)
+        """
+        return self._supported_extensions.copy()
+
+    def _validate_file(self, file_path: Path) -> None:
+        """
+        Validate that the path exists, is a regular file, and is non-empty.
+
+        Args:
+            file_path: Path to validate
+
+        Raises:
+            ExtractionError: If the path does not exist or is not a file
+            EmptyContentError: If the file is zero bytes long
+        """
+        if not file_path.exists():
+            raise ExtractionError(
+                message=f"File not found: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if not file_path.is_file():
+            raise ExtractionError(
+                message=f"Path is not a file: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if file_path.stat().st_size == 0:
+            raise EmptyContentError(file_path=str(file_path))
+
+    def _create_metadata(self, file_path: Path, result) -> DocumentMetadata:
+        """
+        Create document metadata from PPTX file and Docling result.
+
+        Args:
+            file_path: Path to the PPTX file
+            result: Docling conversion result
+
+        Returns:
+            DocumentMetadata entity; extra_metadata has 'slide_count' when known
+        """
+        stat = file_path.stat()
+
+        # Best-effort: the Docling document's page count is used as slide count
+        slide_count = None
+        try:
+            if hasattr(result.document, 'pages'):
+                slide_count = len(result.document.pages)
+        except Exception as e:
+            logger.warning("Could not extract slide count: %s", e)
+
+        extra_metadata = {}
+        if slide_count is not None:
+            extra_metadata['slide_count'] = str(slide_count)
+
+        return DocumentMetadata(
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            size_bytes=stat.st_size,
+            extra_metadata=extra_metadata,
+        )
--- a/src/bootstrap.py
+++ b/src/bootstrap.py
@ -17,8 +17,10 @@ from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
 from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
 from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor
 from .adapters.outgoing.extractors.factory import ExtractorFactory
+from .adapters.outgoing.extractors.html_extractor import HTMLExtractor
 from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
 from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
+from .adapters.outgoing.extractors.pptx_extractor import PPTXExtractor
 from .adapters.outgoing.extractors.txt_extractor import TxtExtractor
 from .adapters.outgoing.extractors.zip_extractor import ZipExtractor
 from .adapters.outgoing.persistence.in_memory_repository import (
@ -123,6 +125,8 @@ class ApplicationContainer:
        factory.register_extractor(TxtExtractor())
        factory.register_extractor(MarkdownExtractor())
        factory.register_extractor(ZipExtractor())
+        factory.register_extractor(HTMLExtractor())
+        factory.register_extractor(PPTXExtractor())

        logger.info(
            f"Registered extractors for: {factory.get_supported_types()}"