feat: add pptx_extractor and html_extractor

This commit is contained in:
m.dabbagh 2026-01-31 18:23:04 +03:30
parent b53f8c47d3
commit b57792eb41
4 changed files with 340 additions and 5 deletions

View File

@ -283,14 +283,14 @@ async def perform_chunking(
description="Upload a file and extract text content with metadata",
)
async def extract_document(
file: UploadFile = File(..., description="Document file to extract (pdf, docx, txt, md, zip)"),
file: UploadFile = File(..., description="Document file to extract (pdf, docx, pptx, html, txt, md, xlsx, zip)"),
service: ITextProcessor = Depends(get_service),
) -> DocumentResponse:
"""
Extract text content from uploaded file.
This endpoint handles file extraction only:
1. Accepts file upload (PDF, DOCX, TXT, MD, ZIP)
1. Accepts file upload (PDF, DOCX, PPTX, HTML, TXT, MD, XLSX, ZIP)
2. Extracts raw text content using appropriate extractor
3. Returns Document entity with metadata (no parsing)
"""
@ -312,7 +312,7 @@ async def extract_document(
description="Upload a file, extract text, parse markdown, and return chunks",
)
async def process_file(
file: UploadFile = File(..., description="Document file to process (pdf, docx, txt, md, zip)"),
file: UploadFile = File(..., description="Document file to process (pdf, docx, pptx, html, txt, md, xlsx, zip)"),
strategy: ChunkingStrategy = Depends(get_chunking_strategy),
service: ITextProcessor = Depends(get_service),
) -> ChunkListResponse:
@ -320,7 +320,7 @@ async def process_file(
Complete file processing pipeline: Upload Extract Parse Chunk.
This endpoint handles the full document processing workflow:
1. Accepts file upload (PDF, DOCX, TXT, MD, ZIP)
1. Accepts file upload (PDF, DOCX, PPTX, HTML, TXT, MD, XLSX, ZIP)
2. Extracts text content using appropriate extractor
3. Parses markdown structure into sections
4. Chunks content according to strategy
@ -351,7 +351,7 @@ async def health_check() -> HealthCheckResponse:
return HealthCheckResponse(
status="healthy",
version="1.0.0",
supported_file_types=["pdf", "docx", "txt", "md", "markdown", "zip"],
supported_file_types=["pdf", "docx", "pptx", "html", "htm", "txt", "md", "markdown", "zip", "xlsx"],
available_strategies=["fixed_size", "paragraph"],
)

View File

@ -0,0 +1,158 @@
"""
HTML Extractor - Concrete implementation for HTML text extraction.
This adapter implements the IExtractor port using Docling library.
It maps Docling exceptions to domain exceptions.
"""
import logging
from pathlib import Path
from typing import List
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter
from ....core.domain.exceptions import (
EmptyContentError,
ExtractionError,
)
from ....core.domain.models import Document, DocumentMetadata, SourceType
from ....core.ports.outgoing.extractor import IExtractor
logger = logging.getLogger(__name__)
class HTMLExtractor(IExtractor):
    """
    Concrete HTML extractor using Docling.

    This adapter:
    1. Validates the input path (exists, is a file, is non-empty)
    2. Converts the file with Docling's DocumentConverter
    3. Exports the converted document to Markdown
    4. Maps unexpected failures to the domain ``ExtractionError``
    """

    def __init__(self) -> None:
        """Initialize HTML extractor with Docling converter."""
        self._supported_extensions = ['html', 'htm']
        self._converter = DocumentConverter()
        logger.info("HTML Extractor initialized with Docling DocumentConverter")

    def extract(self, file_path: Path) -> Document:
        """
        Extract text and metadata from HTML file using Docling.

        Args:
            file_path: Path to the HTML file

        Returns:
            Document entity whose ``raw_markdown`` holds the Markdown export,
            titled with the file's stem.

        Raises:
            ExtractionError: If the path is invalid or conversion fails
            EmptyContentError: If the file is empty or yields no text
        """
        try:
            logger.info(f"Extracting text from HTML: {file_path}")

            # Validate file
            self._validate_file(file_path)

            # Convert HTML to markdown using Docling
            result = self._converter.convert(str(file_path))
            markdown_text = result.document.export_to_markdown()

            # Validate content: whitespace-only output counts as empty
            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))

            # Create metadata
            metadata = self._create_metadata(file_path)

            # Build document with raw_markdown
            document = Document(
                raw_markdown=markdown_text,
                title=file_path.stem,
                metadata=metadata,
            )

            logger.info(
                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document
        except (EmptyContentError, ExtractionError):
            # Domain exceptions already carry the right context; pass through.
            raise
        except Exception as e:
            # logger.exception records the traceback of the underlying failure.
            logger.exception(f"HTML extraction failed for {file_path}: {e}")
            # Chain explicitly so the original error is kept as __cause__.
            raise ExtractionError(
                message=f"Failed to extract text from {file_path.name}",
                details=str(e),
                file_path=str(file_path),
            ) from e

    def supports_file_type(self, file_extension: str) -> bool:
        """
        Check if this extractor supports a given file type.

        The comparison is case-insensitive.

        Args:
            file_extension: File extension (e.g., 'html', 'htm')

        Returns:
            True if the extension is one of the supported HTML extensions
        """
        return file_extension.lower() in self._supported_extensions

    def get_supported_types(self) -> List[str]:
        """
        Get list of supported file extensions.

        Returns:
            A copy of the list containing 'html' and 'htm', so callers
            cannot mutate the extractor's internal state.
        """
        return self._supported_extensions.copy()

    def _validate_file(self, file_path: Path) -> None:
        """
        Validate that the path exists, is a file, and is non-empty.

        Args:
            file_path: Path to validate

        Raises:
            ExtractionError: If the path does not exist or is not a file
            EmptyContentError: If the file is zero bytes long
        """
        if not file_path.exists():
            raise ExtractionError(
                message=f"File not found: {file_path}",
                file_path=str(file_path),
            )

        if not file_path.is_file():
            raise ExtractionError(
                message=f"Path is not a file: {file_path}",
                file_path=str(file_path),
            )

        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))

    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
        """
        Create document metadata from HTML file.

        Args:
            file_path: Path to the HTML file

        Returns:
            DocumentMetadata with the absolute path as ``source_id``,
            ``SourceType.FILE`` as the source type, and the on-disk size.
        """
        stat = file_path.stat()
        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            size_bytes=stat.st_size,
        )

View File

@ -0,0 +1,173 @@
"""
PPTX Extractor - Concrete implementation for PowerPoint text extraction.
This adapter implements the IExtractor port using Docling library.
It maps Docling exceptions to domain exceptions.
"""
import logging
from pathlib import Path
from typing import List
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter
from ....core.domain.exceptions import (
EmptyContentError,
ExtractionError,
)
from ....core.domain.models import Document, DocumentMetadata, SourceType
from ....core.ports.outgoing.extractor import IExtractor
logger = logging.getLogger(__name__)
class PPTXExtractor(IExtractor):
    """
    Concrete PPTX extractor using Docling.

    This adapter:
    1. Validates the input path (exists, is a file, is non-empty)
    2. Converts the file with Docling's DocumentConverter
    3. Exports the converted document to Markdown
    4. Records a best-effort slide count in the metadata
    5. Maps unexpected failures to the domain ``ExtractionError``
    """

    def __init__(self) -> None:
        """Initialize PPTX extractor with Docling converter."""
        self._supported_extensions = ['pptx']
        self._converter = DocumentConverter()
        logger.info("PPTX Extractor initialized with Docling DocumentConverter")

    def extract(self, file_path: Path) -> Document:
        """
        Extract text and metadata from PPTX file using Docling.

        Args:
            file_path: Path to the PPTX file

        Returns:
            Document entity whose ``raw_markdown`` holds the Markdown export,
            titled with the file's stem.

        Raises:
            ExtractionError: If the path is invalid or conversion fails
            EmptyContentError: If the file is empty or yields no text
        """
        try:
            logger.info(f"Extracting text from PPTX: {file_path}")

            # Validate file
            self._validate_file(file_path)

            # Convert PPTX to markdown using Docling
            result = self._converter.convert(str(file_path))
            markdown_text = result.document.export_to_markdown()

            # Validate content: whitespace-only output counts as empty
            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))

            # Create metadata with slide count from Docling result
            metadata = self._create_metadata(file_path, result)

            # Build document with raw_markdown
            document = Document(
                raw_markdown=markdown_text,
                title=file_path.stem,
                metadata=metadata,
            )

            logger.info(
                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document
        except (EmptyContentError, ExtractionError):
            # Domain exceptions already carry the right context; pass through.
            raise
        except Exception as e:
            # logger.exception records the traceback of the underlying failure.
            logger.exception(f"PPTX extraction failed for {file_path}: {e}")
            # Chain explicitly so the original error is kept as __cause__.
            raise ExtractionError(
                message=f"Failed to extract text from {file_path.name}",
                details=str(e),
                file_path=str(file_path),
            ) from e

    def supports_file_type(self, file_extension: str) -> bool:
        """
        Check if this extractor supports a given file type.

        The comparison is case-insensitive.

        Args:
            file_extension: File extension (e.g., 'pptx')

        Returns:
            True if the extension is 'pptx'
        """
        return file_extension.lower() in self._supported_extensions

    def get_supported_types(self) -> List[str]:
        """
        Get list of supported file extensions.

        Returns:
            A copy of the list containing 'pptx', so callers cannot mutate
            the extractor's internal state.
        """
        return self._supported_extensions.copy()

    def _validate_file(self, file_path: Path) -> None:
        """
        Validate that the path exists, is a file, and is non-empty.

        Args:
            file_path: Path to validate

        Raises:
            ExtractionError: If the path does not exist or is not a file
            EmptyContentError: If the file is zero bytes long
        """
        if not file_path.exists():
            raise ExtractionError(
                message=f"File not found: {file_path}",
                file_path=str(file_path),
            )

        if not file_path.is_file():
            raise ExtractionError(
                message=f"Path is not a file: {file_path}",
                file_path=str(file_path),
            )

        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))

    def _create_metadata(self, file_path: Path, result) -> DocumentMetadata:
        """
        Create document metadata from PPTX file and Docling result.

        The slide count is best-effort: if it cannot be read, a warning is
        logged and the metadata is returned without it.

        Args:
            file_path: Path to the PPTX file
            result: Docling conversion result (must expose ``.document``)

        Returns:
            DocumentMetadata with the absolute path as ``source_id``,
            ``SourceType.FILE`` as the source type, the on-disk size, and
            ``extra_metadata['slide_count']`` (as a string) when available.
        """
        stat = file_path.stat()

        # Extract slide count from Docling result.
        # NOTE(review): assumes Docling exposes one page entry per slide —
        # confirm against the Docling PPTX backend.
        slide_count = None
        try:
            if hasattr(result.document, 'pages'):
                slide_count = len(result.document.pages)
        except Exception as e:
            # Deliberately non-fatal: a missing count must not fail extraction.
            logger.warning(f"Could not extract slide count: {str(e)}")

        extra_metadata = {}
        if slide_count is not None:
            # Stored as a string, matching the original behavior.
            extra_metadata['slide_count'] = str(slide_count)

        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            size_bytes=stat.st_size,
            extra_metadata=extra_metadata,
        )

View File

@ -17,8 +17,10 @@ from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor
from .adapters.outgoing.extractors.factory import ExtractorFactory
from .adapters.outgoing.extractors.html_extractor import HTMLExtractor
from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
from .adapters.outgoing.extractors.pptx_extractor import PPTXExtractor
from .adapters.outgoing.extractors.txt_extractor import TxtExtractor
from .adapters.outgoing.extractors.zip_extractor import ZipExtractor
from .adapters.outgoing.persistence.in_memory_repository import (
@ -123,6 +125,8 @@ class ApplicationContainer:
factory.register_extractor(TxtExtractor())
factory.register_extractor(MarkdownExtractor())
factory.register_extractor(ZipExtractor())
factory.register_extractor(HTMLExtractor())
factory.register_extractor(PPTXExtractor())
logger.info(
f"Registered extractors for: {factory.get_supported_types()}"