feat: add pptx_extractor and html_extractor

2026-01-31 18:23:04 +03:30 · 2026-01-31 18:23:04 +03:30 · b57792eb41
commit b57792eb41
parent b53f8c47d3
4 changed files with 340 additions and 5 deletions
--- a/src/adapters/incoming/api_routes.py
+++ b/src/adapters/incoming/api_routes.py
@ -283,14 +283,14 @@ async def perform_chunking(
    description="Upload a file and extract text content with metadata",
 )
 async def extract_document(
-    file: UploadFile = File(..., description="Document file to extract (pdf, docx, txt, md, zip)"),
+    file: UploadFile = File(..., description="Document file to extract (pdf, docx, pptx, html, txt, md, xlsx, zip)"),
    service: ITextProcessor = Depends(get_service),
 ) -> DocumentResponse:
    """
    Extract text content from uploaded file.
    This endpoint handles file extraction only:
-    1. Accepts file upload (PDF, DOCX, TXT, MD, ZIP)
+    1. Accepts file upload (PDF, DOCX, PPTX, HTML, TXT, MD, XLSX, ZIP)
    2. Extracts raw text content using appropriate extractor
    3. Returns Document entity with metadata (no parsing)
    """
@ -312,7 +312,7 @@ async def extract_document(
    description="Upload a file, extract text, parse markdown, and return chunks",
 )
 async def process_file(
-    file: UploadFile = File(..., description="Document file to process (pdf, docx, txt, md, zip)"),
+    file: UploadFile = File(..., description="Document file to process (pdf, docx, pptx, html, txt, md, xlsx, zip)"),
    strategy: ChunkingStrategy = Depends(get_chunking_strategy),
    service: ITextProcessor = Depends(get_service),
 ) -> ChunkListResponse:
@ -320,7 +320,7 @@ async def process_file(
    Complete file processing pipeline: Upload → Extract → Parse → Chunk.
    This endpoint handles the full document processing workflow:
-    1. Accepts file upload (PDF, DOCX, TXT, MD, ZIP)
+    1. Accepts file upload (PDF, DOCX, PPTX, HTML, TXT, MD, XLSX, ZIP)
    2. Extracts text content using appropriate extractor
    3. Parses markdown structure into sections
    4. Chunks content according to strategy
@ -351,7 +351,7 @@ async def health_check() -> HealthCheckResponse:
    return HealthCheckResponse(
        status="healthy",
        version="1.0.0",
-        supported_file_types=["pdf", "docx", "txt", "md", "markdown", "zip"],
+        supported_file_types=["pdf", "docx", "pptx", "html", "htm", "txt", "md", "markdown", "zip", "xlsx"],
        available_strategies=["fixed_size", "paragraph"],
    )
--- a/src/adapters/outgoing/extractors/html_extractor.py
+++ b/src/adapters/outgoing/extractors/html_extractor.py
@ -0,0 +1,158 @@
 """
 HTML Extractor - Concrete implementation for HTML text extraction.
 This adapter implements the IExtractor port using Docling library.
 It maps Docling exceptions to domain exceptions.
 """
 import logging
 from pathlib import Path
 from typing import List
 from docling.datamodel.base_models import InputFormat
 from docling.document_converter import DocumentConverter
 from ....core.domain.exceptions import (
    EmptyContentError,
    ExtractionError,
 )
 from ....core.domain.models import Document, DocumentMetadata, SourceType
 from ....core.ports.outgoing.extractor import IExtractor
 logger = logging.getLogger(__name__)
 class HTMLExtractor(IExtractor):
    """
    Concrete HTML extractor using Docling.
    This adapter:
    1. Converts HTML files with Docling's DocumentConverter
    2. Exports the converted document as Markdown
    3. Wraps unexpected converter failures in the domain ExtractionError
    """
    def __init__(self) -> None:
        """Initialize HTML extractor with Docling converter."""
        self._supported_extensions = ['html', 'htm']
        self._converter = DocumentConverter()
        logger.info("HTML Extractor initialized with Docling DocumentConverter")
    def extract(self, file_path: Path) -> Document:
        """
        Extract text and metadata from HTML file using Docling.
        Args:
            file_path: Path to the HTML file
        Returns:
            Document entity whose raw_markdown holds the exported Markdown,
            titled with the file name stem
        Raises:
            ExtractionError: If the path is missing, is not a file, or the
                conversion fails for any other reason (original error chained)
            EmptyContentError: If the file is empty or yields no text
        """
        try:
            logger.info(f"Extracting text from HTML: {file_path}")
            # Fail fast on missing / non-file / zero-byte paths
            self._validate_file(file_path)
            # Convert HTML to markdown using Docling
            result = self._converter.convert(str(file_path))
            markdown_text = result.document.export_to_markdown()
            # Whitespace-only output counts as empty content
            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))
            metadata = self._create_metadata(file_path)
            document = Document(
                raw_markdown=markdown_text,
                title=file_path.stem,
                metadata=metadata
            )
            logger.info(
                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document
        except (EmptyContentError, ExtractionError):
            # Domain errors already carry their context; pass them through untouched
            raise
        except Exception as e:
            logger.error(f"HTML extraction failed for {file_path}: {str(e)}")
            # Chain explicitly so the underlying failure stays attached as __cause__
            raise ExtractionError(
                message=f"Failed to extract text from {file_path.name}",
                details=str(e),
                file_path=str(file_path),
            ) from e
    def supports_file_type(self, file_extension: str) -> bool:
        """
        Check if this extractor supports a given file type.
        Args:
            file_extension: File extension, case-insensitive, with or without
                a leading dot (e.g., 'html', 'HTM', '.html')
        Returns:
            True if the extension is one of the supported HTML extensions
        """
        # Path.suffix yields '.html', so tolerate the leading dot as well
        normalized = file_extension.strip().lower().lstrip('.')
        return normalized in self._supported_extensions
    def get_supported_types(self) -> List[str]:
        """
        Get list of supported file extensions.
        Returns:
            A copy of the supported extensions ('html' and 'htm'), so callers
            cannot mutate the extractor's own list
        """
        return self._supported_extensions.copy()
    def _validate_file(self, file_path: Path) -> None:
        """
        Validate that the path exists, is a regular file, and is not empty.
        Args:
            file_path: Path to validate
        Raises:
            ExtractionError: If the path does not exist or is not a file
            EmptyContentError: If the file has a size of zero bytes
        """
        if not file_path.exists():
            raise ExtractionError(
                message=f"File not found: {file_path}",
                file_path=str(file_path),
            )
        if not file_path.is_file():
            raise ExtractionError(
                message=f"Path is not a file: {file_path}",
                file_path=str(file_path),
            )
        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))
    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
        """
        Create document metadata from HTML file.
        Args:
            file_path: Path to the HTML file
        Returns:
            DocumentMetadata with the absolute path as source_id, a FILE
            source type, and the on-disk size in bytes
        """
        stat = file_path.stat()
        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            size_bytes=stat.st_size,
        )
--- a/src/adapters/outgoing/extractors/pptx_extractor.py
+++ b/src/adapters/outgoing/extractors/pptx_extractor.py
@ -0,0 +1,173 @@
 """
 PPTX Extractor - Concrete implementation for PowerPoint text extraction.
 This adapter implements the IExtractor port using Docling library.
 It maps Docling exceptions to domain exceptions.
 """
 import logging
 from pathlib import Path
 from typing import List
 from docling.datamodel.base_models import InputFormat
 from docling.document_converter import DocumentConverter
 from ....core.domain.exceptions import (
    EmptyContentError,
    ExtractionError,
 )
 from ....core.domain.models import Document, DocumentMetadata, SourceType
 from ....core.ports.outgoing.extractor import IExtractor
 logger = logging.getLogger(__name__)
 class PPTXExtractor(IExtractor):
    """
    Concrete PPTX extractor using Docling.
    This adapter:
    1. Converts PowerPoint files with Docling's DocumentConverter
    2. Exports the converted document as Markdown
    3. Records a best-effort slide count in the document metadata
    4. Wraps unexpected converter failures in the domain ExtractionError
    """
    def __init__(self) -> None:
        """Initialize PPTX extractor with Docling converter."""
        self._supported_extensions = ['pptx']
        self._converter = DocumentConverter()
        logger.info("PPTX Extractor initialized with Docling DocumentConverter")
    def extract(self, file_path: Path) -> Document:
        """
        Extract text and metadata from PPTX file using Docling.
        Args:
            file_path: Path to the PPTX file
        Returns:
            Document entity whose raw_markdown holds the exported Markdown,
            titled with the file name stem
        Raises:
            ExtractionError: If the path is missing, is not a file, or the
                conversion fails for any other reason (original error chained)
            EmptyContentError: If the file is empty or yields no text
        """
        try:
            logger.info(f"Extracting text from PPTX: {file_path}")
            # Fail fast on missing / non-file / zero-byte paths
            self._validate_file(file_path)
            # Convert PPTX to markdown using Docling
            result = self._converter.convert(str(file_path))
            markdown_text = result.document.export_to_markdown()
            # Whitespace-only output counts as empty content
            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))
            # Metadata needs the conversion result to derive the slide count
            metadata = self._create_metadata(file_path, result)
            document = Document(
                raw_markdown=markdown_text,
                title=file_path.stem,
                metadata=metadata
            )
            logger.info(
                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document
        except (EmptyContentError, ExtractionError):
            # Domain errors already carry their context; pass them through untouched
            raise
        except Exception as e:
            logger.error(f"PPTX extraction failed for {file_path}: {str(e)}")
            # Chain explicitly so the underlying failure stays attached as __cause__
            raise ExtractionError(
                message=f"Failed to extract text from {file_path.name}",
                details=str(e),
                file_path=str(file_path),
            ) from e
    def supports_file_type(self, file_extension: str) -> bool:
        """
        Check if this extractor supports a given file type.
        Args:
            file_extension: File extension, case-insensitive, with or without
                a leading dot (e.g., 'pptx', 'PPTX', '.pptx')
        Returns:
            True if the extension is a supported PowerPoint extension
        """
        # Path.suffix yields '.pptx', so tolerate the leading dot as well
        normalized = file_extension.strip().lower().lstrip('.')
        return normalized in self._supported_extensions
    def get_supported_types(self) -> List[str]:
        """
        Get list of supported file extensions.
        Returns:
            A copy of the supported extensions ('pptx'), so callers cannot
            mutate the extractor's own list
        """
        return self._supported_extensions.copy()
    def _validate_file(self, file_path: Path) -> None:
        """
        Validate that the path exists, is a regular file, and is not empty.
        Args:
            file_path: Path to validate
        Raises:
            ExtractionError: If the path does not exist or is not a file
            EmptyContentError: If the file has a size of zero bytes
        """
        if not file_path.exists():
            raise ExtractionError(
                message=f"File not found: {file_path}",
                file_path=str(file_path),
            )
        if not file_path.is_file():
            raise ExtractionError(
                message=f"Path is not a file: {file_path}",
                file_path=str(file_path),
            )
        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))
    def _create_metadata(self, file_path: Path, result) -> DocumentMetadata:
        """
        Create document metadata from PPTX file and Docling result.
        Args:
            file_path: Path to the PPTX file
            result: Docling conversion result; its ``document.pages``
                collection, when present, is counted as the slide count
        Returns:
            DocumentMetadata with the absolute path as source_id, a FILE
            source type, the on-disk size in bytes, and (when it could be
            determined) a string ``slide_count`` entry in extra_metadata
        """
        stat = file_path.stat()
        # Slide count is optional: never let it fail an otherwise good extraction
        slide_count = None
        try:
            pages = getattr(result.document, 'pages', None)
            if pages is not None:
                slide_count = len(pages)
        except Exception as e:
            logger.warning(f"Could not extract slide count: {str(e)}")
        extra_metadata = {}
        if slide_count is not None:
            # Stored as a string, matching the original metadata format
            extra_metadata['slide_count'] = str(slide_count)
        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            size_bytes=stat.st_size,
            extra_metadata=extra_metadata,
        )
--- a/src/bootstrap.py
+++ b/src/bootstrap.py
@ -17,8 +17,10 @@ from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
 from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
 from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor
 from .adapters.outgoing.extractors.factory import ExtractorFactory
 from .adapters.outgoing.extractors.html_extractor import HTMLExtractor
 from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
 from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
 from .adapters.outgoing.extractors.pptx_extractor import PPTXExtractor
 from .adapters.outgoing.extractors.txt_extractor import TxtExtractor
 from .adapters.outgoing.extractors.zip_extractor import ZipExtractor
 from .adapters.outgoing.persistence.in_memory_repository import (
@ -123,6 +125,8 @@ class ApplicationContainer:
        factory.register_extractor(TxtExtractor())
        factory.register_extractor(MarkdownExtractor())
        factory.register_extractor(ZipExtractor())
        factory.register_extractor(HTMLExtractor())
        factory.register_extractor(PPTXExtractor())
        logger.info(
            f"Registered extractors for: {factory.get_supported_types()}"