diff --git a/src/adapters/incoming/api_routes.py b/src/adapters/incoming/api_routes.py
index 6b79439..7df7bfd 100644
--- a/src/adapters/incoming/api_routes.py
+++ b/src/adapters/incoming/api_routes.py
@@ -283,14 +283,14 @@ async def perform_chunking(
     description="Upload a file and extract text content with metadata",
 )
 async def extract_document(
-    file: UploadFile = File(..., description="Document file to extract (pdf, docx, txt, md, zip)"),
+    file: UploadFile = File(..., description="Document file to extract (pdf, docx, pptx, html, txt, md, xlsx, zip)"),
     service: ITextProcessor = Depends(get_service),
 ) -> DocumentResponse:
     """
     Extract text content from uploaded file.
 
     This endpoint handles file extraction only:
-    1. Accepts file upload (PDF, DOCX, TXT, MD, ZIP)
+    1. Accepts file upload (PDF, DOCX, PPTX, HTML, TXT, MD, XLSX, ZIP)
     2. Extracts raw text content using appropriate extractor
     3. Returns Document entity with metadata (no parsing)
     """
@@ -312,7 +312,7 @@ async def extract_document(
     description="Upload a file, extract text, parse markdown, and return chunks",
 )
 async def process_file(
-    file: UploadFile = File(..., description="Document file to process (pdf, docx, txt, md, zip)"),
+    file: UploadFile = File(..., description="Document file to process (pdf, docx, pptx, html, txt, md, xlsx, zip)"),
     strategy: ChunkingStrategy = Depends(get_chunking_strategy),
     service: ITextProcessor = Depends(get_service),
 ) -> ChunkListResponse:
@@ -320,7 +320,7 @@ async def process_file(
     Complete file processing pipeline: Upload → Extract → Parse → Chunk.
 
     This endpoint handles the full document processing workflow:
-    1. Accepts file upload (PDF, DOCX, TXT, MD, ZIP)
+    1. Accepts file upload (PDF, DOCX, PPTX, HTML, TXT, MD, XLSX, ZIP)
     2. Extracts text content using appropriate extractor
     3. Parses markdown structure into sections
     4. Chunks content according to strategy
@@ -351,7 +351,7 @@ async def health_check() -> HealthCheckResponse:
     return HealthCheckResponse(
         status="healthy",
         version="1.0.0",
-        supported_file_types=["pdf", "docx", "txt", "md", "markdown", "zip"],
+        supported_file_types=["pdf", "docx", "pptx", "html", "htm", "txt", "md", "markdown", "zip", "xlsx"],
         available_strategies=["fixed_size", "paragraph"],
     )
 
diff --git a/src/adapters/outgoing/extractors/html_extractor.py b/src/adapters/outgoing/extractors/html_extractor.py
new file mode 100644
index 0000000..aa5aca6
--- /dev/null
+++ b/src/adapters/outgoing/extractors/html_extractor.py
@@ -0,0 +1,161 @@
+"""
+HTML Extractor - Concrete implementation for HTML text extraction.
+
+This adapter implements the IExtractor port using Docling library.
+It maps Docling exceptions to domain exceptions.
+"""
+import logging
+from pathlib import Path
+from typing import List
+
+from docling.datamodel.base_models import InputFormat
+from docling.document_converter import DocumentConverter
+
+from ....core.domain.exceptions import (
+    EmptyContentError,
+    ExtractionError,
+)
+from ....core.domain.models import Document, DocumentMetadata, SourceType
+from ....core.ports.outgoing.extractor import IExtractor
+
+
+logger = logging.getLogger(__name__)
+
+
+class HTMLExtractor(IExtractor):
+    """
+    Concrete HTML extractor using Docling.
+
+    This adapter:
+    1. Extracts text from HTML files using Docling's DocumentConverter
+    2. Converts HTML to Markdown format
+    3. Preserves document structure and formatting
+    """
+
+    def __init__(self) -> None:
+        """Initialize HTML extractor with Docling converter."""
+        self._supported_extensions = ['html', 'htm']
+        # Restrict the converter to HTML so a mislabelled upload is rejected
+        # instead of being routed through an unrelated Docling pipeline.
+        self._converter = DocumentConverter(allowed_formats=[InputFormat.HTML])
+        logger.info("HTML Extractor initialized with Docling DocumentConverter")
+
+    def extract(self, file_path: Path) -> Document:
+        """
+        Extract text and metadata from HTML file using Docling.
+
+        Args:
+            file_path: Path to the HTML file
+
+        Returns:
+            Document entity with extracted content and metadata
+
+        Raises:
+            ExtractionError: If extraction fails
+            EmptyContentError: If no text could be extracted
+        """
+        try:
+            logger.info(f"Extracting text from HTML: {file_path}")
+
+            # Validate file
+            self._validate_file(file_path)
+
+            # Convert HTML to markdown using Docling
+            result = self._converter.convert(str(file_path))
+            markdown_text = result.document.export_to_markdown()
+
+            # Validate content
+            if not markdown_text or not markdown_text.strip():
+                raise EmptyContentError(file_path=str(file_path))
+
+            # Create metadata
+            metadata = self._create_metadata(file_path)
+
+            # Build document with raw_markdown
+            document = Document(
+                raw_markdown=markdown_text,
+                title=file_path.stem,
+                metadata=metadata,
+            )
+
+            logger.info(
+                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
+            )
+            return document
+
+        except (EmptyContentError, ExtractionError):
+            # Domain errors are already well-formed; propagate them unchanged.
+            raise
+        except Exception as e:
+            # logger.exception records the traceback of the underlying failure.
+            logger.exception("HTML extraction failed for %s: %s", file_path, e)
+            raise ExtractionError(
+                message=f"Failed to extract text from {file_path.name}",
+                details=str(e),
+                file_path=str(file_path),
+            ) from e
+
+    def supports_file_type(self, file_extension: str) -> bool:
+        """
+        Check if this extractor supports a given file type.
+
+        Args:
+            file_extension: File extension (e.g., 'html', 'htm')
+
+        Returns:
+            True if the extension, compared case-insensitively, is 'html' or 'htm'
+        """
+        return file_extension.lower() in self._supported_extensions
+
+    def get_supported_types(self) -> List[str]:
+        """
+        Get list of supported file extensions.
+
+        Returns:
+            List containing 'html' and 'htm'
+        """
+        return self._supported_extensions.copy()
+
+    def _validate_file(self, file_path: Path) -> None:
+        """
+        Validate that the path exists, is a regular file, and is non-empty.
+
+        Args:
+            file_path: Path to validate
+
+        Raises:
+            ExtractionError: If the path does not exist or is not a file
+            EmptyContentError: If the file is zero bytes long
+        """
+        if not file_path.exists():
+            raise ExtractionError(
+                message=f"File not found: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if not file_path.is_file():
+            raise ExtractionError(
+                message=f"Path is not a file: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if file_path.stat().st_size == 0:
+            raise EmptyContentError(file_path=str(file_path))
+
+    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
+        """
+        Create document metadata from HTML file.
+
+        Args:
+            file_path: Path to the HTML file
+
+        Returns:
+            DocumentMetadata entity
+        """
+        stat = file_path.stat()
+
+        return DocumentMetadata(
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            size_bytes=stat.st_size,
+        )
diff --git a/src/adapters/outgoing/extractors/pptx_extractor.py b/src/adapters/outgoing/extractors/pptx_extractor.py
new file mode 100644
index 0000000..6b8d5b9
--- /dev/null
+++ b/src/adapters/outgoing/extractors/pptx_extractor.py
@@ -0,0 +1,176 @@
+"""
+PPTX Extractor - Concrete implementation for PowerPoint text extraction.
+
+This adapter implements the IExtractor port using Docling library.
+It maps Docling exceptions to domain exceptions.
+"""
+import logging
+from pathlib import Path
+from typing import List
+
+from docling.datamodel.base_models import InputFormat
+from docling.document_converter import DocumentConverter
+
+from ....core.domain.exceptions import (
+    EmptyContentError,
+    ExtractionError,
+)
+from ....core.domain.models import Document, DocumentMetadata, SourceType
+from ....core.ports.outgoing.extractor import IExtractor
+
+
+logger = logging.getLogger(__name__)
+
+
+class PPTXExtractor(IExtractor):
+    """
+    Concrete PPTX extractor using Docling.
+
+    This adapter:
+    1. Extracts text from PowerPoint files using Docling's DocumentConverter
+    2. Converts slides to Markdown format
+    3. Preserves slide structure and formatting
+    4. Extracts slide count metadata
+    """
+
+    def __init__(self) -> None:
+        """Initialize PPTX extractor with Docling converter."""
+        self._supported_extensions = ['pptx']
+        # Restrict the converter to PPTX so a mislabelled upload is rejected
+        # instead of being routed through an unrelated Docling pipeline.
+        self._converter = DocumentConverter(allowed_formats=[InputFormat.PPTX])
+        logger.info("PPTX Extractor initialized with Docling DocumentConverter")
+
+    def extract(self, file_path: Path) -> Document:
+        """
+        Extract text and metadata from PPTX file using Docling.
+
+        Args:
+            file_path: Path to the PPTX file
+
+        Returns:
+            Document entity with extracted content and metadata
+
+        Raises:
+            ExtractionError: If extraction fails
+            EmptyContentError: If no text could be extracted
+        """
+        try:
+            logger.info(f"Extracting text from PPTX: {file_path}")
+
+            # Validate file
+            self._validate_file(file_path)
+
+            # Convert PPTX to markdown using Docling
+            result = self._converter.convert(str(file_path))
+            markdown_text = result.document.export_to_markdown()
+
+            # Validate content
+            if not markdown_text or not markdown_text.strip():
+                raise EmptyContentError(file_path=str(file_path))
+
+            # Create metadata with slide count from Docling result
+            metadata = self._create_metadata(file_path, result)
+
+            # Build document with raw_markdown
+            document = Document(
+                raw_markdown=markdown_text,
+                title=file_path.stem,
+                metadata=metadata,
+            )
+
+            logger.info(
+                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
+            )
+            return document
+
+        except (EmptyContentError, ExtractionError):
+            # Domain errors are already well-formed; propagate them unchanged.
+            raise
+        except Exception as e:
+            # logger.exception records the traceback of the underlying failure.
+            logger.exception("PPTX extraction failed for %s: %s", file_path, e)
+            raise ExtractionError(
+                message=f"Failed to extract text from {file_path.name}",
+                details=str(e),
+                file_path=str(file_path),
+            ) from e
+
+    def supports_file_type(self, file_extension: str) -> bool:
+        """
+        Check if this extractor supports a given file type.
+
+        Args:
+            file_extension: File extension (e.g., 'pptx')
+
+        Returns:
+            True if the extension, compared case-insensitively, is 'pptx'
+        """
+        return file_extension.lower() in self._supported_extensions
+
+    def get_supported_types(self) -> List[str]:
+        """
+        Get list of supported file extensions.
+
+        Returns:
+            List containing 'pptx'
+        """
+        return self._supported_extensions.copy()
+
+    def _validate_file(self, file_path: Path) -> None:
+        """
+        Validate that the path exists, is a regular file, and is non-empty.
+
+        Args:
+            file_path: Path to validate
+
+        Raises:
+            ExtractionError: If the path does not exist or is not a file
+            EmptyContentError: If the file is zero bytes long
+        """
+        if not file_path.exists():
+            raise ExtractionError(
+                message=f"File not found: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if not file_path.is_file():
+            raise ExtractionError(
+                message=f"Path is not a file: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if file_path.stat().st_size == 0:
+            raise EmptyContentError(file_path=str(file_path))
+
+    def _create_metadata(self, file_path: Path, result) -> DocumentMetadata:
+        """
+        Create document metadata from PPTX file and Docling result.
+
+        Args:
+            file_path: Path to the PPTX file
+            result: Docling conversion result
+
+        Returns:
+            DocumentMetadata entity
+        """
+        stat = file_path.stat()
+
+        # Extract slide count from Docling result
+        slide_count = None
+        try:
+            if hasattr(result.document, 'pages'):
+                slide_count = len(result.document.pages)
+        except Exception as e:
+            logger.warning(f"Could not extract slide count: {str(e)}")
+
+        extra_metadata = {}
+        if slide_count is not None:
+            extra_metadata['slide_count'] = str(slide_count)
+
+        return DocumentMetadata(
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            size_bytes=stat.st_size,
+            extra_metadata=extra_metadata,
+        )
diff --git a/src/bootstrap.py b/src/bootstrap.py
index e752bb9..c7713b8 100644
--- a/src/bootstrap.py
+++ b/src/bootstrap.py
@@ -17,8 +17,10 @@ from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
 from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
 from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor
 from .adapters.outgoing.extractors.factory import ExtractorFactory
+from .adapters.outgoing.extractors.html_extractor import HTMLExtractor
 from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
 from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
+from .adapters.outgoing.extractors.pptx_extractor import PPTXExtractor
 from .adapters.outgoing.extractors.txt_extractor import TxtExtractor
 from .adapters.outgoing.extractors.zip_extractor import ZipExtractor
 from .adapters.outgoing.persistence.in_memory_repository import (
@@ -123,6 +125,8 @@ class ApplicationContainer:
         factory.register_extractor(TxtExtractor())
         factory.register_extractor(MarkdownExtractor())
         factory.register_extractor(ZipExtractor())
+        factory.register_extractor(HTMLExtractor())
+        factory.register_extractor(PPTXExtractor())
 
         logger.info(
             f"Registered extractors for: {factory.get_supported_types()}"