feat: add pptx_extractor and html_extractor
This commit is contained in:
parent
b53f8c47d3
commit
b57792eb41
@ -283,14 +283,14 @@ async def perform_chunking(
|
||||
description="Upload a file and extract text content with metadata",
|
||||
)
|
||||
async def extract_document(
|
||||
file: UploadFile = File(..., description="Document file to extract (pdf, docx, txt, md, zip)"),
|
||||
file: UploadFile = File(..., description="Document file to extract (pdf, docx, pptx, html, txt, md, xlsx, zip)"),
|
||||
service: ITextProcessor = Depends(get_service),
|
||||
) -> DocumentResponse:
|
||||
"""
|
||||
Extract text content from uploaded file.
|
||||
|
||||
This endpoint handles file extraction only:
|
||||
1. Accepts file upload (PDF, DOCX, TXT, MD, ZIP)
|
||||
1. Accepts file upload (PDF, DOCX, PPTX, HTML, TXT, MD, XLSX, ZIP)
|
||||
2. Extracts raw text content using appropriate extractor
|
||||
3. Returns Document entity with metadata (no parsing)
|
||||
"""
|
||||
@ -312,7 +312,7 @@ async def extract_document(
|
||||
description="Upload a file, extract text, parse markdown, and return chunks",
|
||||
)
|
||||
async def process_file(
|
||||
file: UploadFile = File(..., description="Document file to process (pdf, docx, txt, md, zip)"),
|
||||
file: UploadFile = File(..., description="Document file to process (pdf, docx, pptx, html, txt, md, xlsx, zip)"),
|
||||
strategy: ChunkingStrategy = Depends(get_chunking_strategy),
|
||||
service: ITextProcessor = Depends(get_service),
|
||||
) -> ChunkListResponse:
|
||||
@ -320,7 +320,7 @@ async def process_file(
|
||||
Complete file processing pipeline: Upload → Extract → Parse → Chunk.
|
||||
|
||||
This endpoint handles the full document processing workflow:
|
||||
1. Accepts file upload (PDF, DOCX, TXT, MD, ZIP)
|
||||
1. Accepts file upload (PDF, DOCX, PPTX, HTML, TXT, MD, XLSX, ZIP)
|
||||
2. Extracts text content using appropriate extractor
|
||||
3. Parses markdown structure into sections
|
||||
4. Chunks content according to strategy
|
||||
@ -351,7 +351,7 @@ async def health_check() -> HealthCheckResponse:
|
||||
return HealthCheckResponse(
|
||||
status="healthy",
|
||||
version="1.0.0",
|
||||
supported_file_types=["pdf", "docx", "txt", "md", "markdown", "zip"],
|
||||
supported_file_types=["pdf", "docx", "pptx", "html", "htm", "txt", "md", "markdown", "zip", "xlsx"],
|
||||
available_strategies=["fixed_size", "paragraph"],
|
||||
)
|
||||
|
||||
|
||||
158
src/adapters/outgoing/extractors/html_extractor.py
Normal file
158
src/adapters/outgoing/extractors/html_extractor.py
Normal file
@ -0,0 +1,158 @@
|
||||
"""
|
||||
HTML Extractor - Concrete implementation for HTML text extraction.
|
||||
|
||||
This adapter implements the IExtractor port using Docling library.
|
||||
It maps Docling exceptions to domain exceptions.
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from ....core.domain.exceptions import (
|
||||
EmptyContentError,
|
||||
ExtractionError,
|
||||
)
|
||||
from ....core.domain.models import Document, DocumentMetadata, SourceType
|
||||
from ....core.ports.outgoing.extractor import IExtractor
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HTMLExtractor(IExtractor):
    """
    Concrete HTML extractor using Docling.

    This adapter:
    1. Converts HTML files with Docling's DocumentConverter
    2. Exports the converted document to Markdown
    3. Maps unexpected failures to the domain ExtractionError
    """

    def __init__(self) -> None:
        """Initialize HTML extractor with Docling converter."""
        self._supported_extensions = ['html', 'htm']
        self._converter = DocumentConverter()
        logger.info("HTML Extractor initialized with Docling DocumentConverter")

    def extract(self, file_path: Path) -> Document:
        """
        Extract text and metadata from HTML file using Docling.

        Args:
            file_path: Path to the HTML file

        Returns:
            Document entity whose raw_markdown is the Markdown export of the
            file, titled with the file name stem

        Raises:
            ExtractionError: If the file is missing, is not a regular file,
                or conversion fails for any other reason
            EmptyContentError: If the file is empty or no text could be extracted
        """
        try:
            logger.info(f"Extracting text from HTML: {file_path}")

            # Fail fast on missing / non-file / zero-byte inputs
            self._validate_file(file_path)

            # Convert HTML to markdown using Docling
            result = self._converter.convert(str(file_path))
            markdown_text = result.document.export_to_markdown()

            # Whitespace-only output counts as "no content"
            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))

            metadata = self._create_metadata(file_path)

            document = Document(
                raw_markdown=markdown_text,
                title=file_path.stem,
                metadata=metadata,
            )

            logger.info(
                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document

        except (EmptyContentError, ExtractionError):
            # Domain errors are already in their final form; pass them through
            raise
        except Exception as e:
            # logger.exception records the traceback alongside the message
            logger.exception(f"HTML extraction failed for {file_path}: {e}")
            # Chain the original error so the root cause stays attached
            raise ExtractionError(
                message=f"Failed to extract text from {file_path.name}",
                details=str(e),
                file_path=str(file_path),
            ) from e

    def supports_file_type(self, file_extension: str) -> bool:
        """
        Check if this extractor supports a given file type.

        Matching is case-insensitive and tolerates a leading dot, so both
        'html' and '.HTML' (the form produced by Path.suffix) are accepted.

        Args:
            file_extension: File extension (e.g., 'html', 'htm', '.html')

        Returns:
            True if the extension is one this extractor handles
        """
        normalized = file_extension.lower().lstrip('.')
        return normalized in self._supported_extensions

    def get_supported_types(self) -> List[str]:
        """
        Get list of supported file extensions.

        Returns:
            A copy of the supported extensions ('html' and 'htm'), so callers
            cannot mutate the extractor's own list
        """
        return self._supported_extensions.copy()

    def _validate_file(self, file_path: Path) -> None:
        """
        Validate that the path exists, is a file, and is not empty.

        Args:
            file_path: Path to validate

        Raises:
            ExtractionError: If the path does not exist or is not a file
            EmptyContentError: If the file has a size of zero bytes
        """
        if not file_path.exists():
            raise ExtractionError(
                message=f"File not found: {file_path}",
                file_path=str(file_path),
            )

        if not file_path.is_file():
            raise ExtractionError(
                message=f"Path is not a file: {file_path}",
                file_path=str(file_path),
            )

        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))

    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
        """
        Create document metadata from HTML file.

        Args:
            file_path: Path to the HTML file

        Returns:
            DocumentMetadata with the absolute path as source_id, a FILE
            source type, and the on-disk size in bytes
        """
        stat = file_path.stat()

        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            size_bytes=stat.st_size,
        )
|
||||
173
src/adapters/outgoing/extractors/pptx_extractor.py
Normal file
173
src/adapters/outgoing/extractors/pptx_extractor.py
Normal file
@ -0,0 +1,173 @@
|
||||
"""
|
||||
PPTX Extractor - Concrete implementation for PowerPoint text extraction.
|
||||
|
||||
This adapter implements the IExtractor port using Docling library.
|
||||
It maps Docling exceptions to domain exceptions.
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from ....core.domain.exceptions import (
|
||||
EmptyContentError,
|
||||
ExtractionError,
|
||||
)
|
||||
from ....core.domain.models import Document, DocumentMetadata, SourceType
|
||||
from ....core.ports.outgoing.extractor import IExtractor
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PPTXExtractor(IExtractor):
    """
    Concrete PPTX extractor using Docling.

    This adapter:
    1. Converts PowerPoint files with Docling's DocumentConverter
    2. Exports the converted document to Markdown
    3. Records a slide count in the metadata when the result exposes pages
    4. Maps unexpected failures to the domain ExtractionError
    """

    def __init__(self) -> None:
        """Initialize PPTX extractor with Docling converter."""
        self._supported_extensions = ['pptx']
        self._converter = DocumentConverter()
        logger.info("PPTX Extractor initialized with Docling DocumentConverter")

    def extract(self, file_path: Path) -> Document:
        """
        Extract text and metadata from PPTX file using Docling.

        Args:
            file_path: Path to the PPTX file

        Returns:
            Document entity whose raw_markdown is the Markdown export of the
            file, titled with the file name stem

        Raises:
            ExtractionError: If the file is missing, is not a regular file,
                or conversion fails for any other reason
            EmptyContentError: If the file is empty or no text could be extracted
        """
        try:
            logger.info(f"Extracting text from PPTX: {file_path}")

            # Fail fast on missing / non-file / zero-byte inputs
            self._validate_file(file_path)

            # Convert PPTX to markdown using Docling
            result = self._converter.convert(str(file_path))
            markdown_text = result.document.export_to_markdown()

            # Whitespace-only output counts as "no content"
            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))

            # The conversion result is passed along so the slide count can be read
            metadata = self._create_metadata(file_path, result)

            document = Document(
                raw_markdown=markdown_text,
                title=file_path.stem,
                metadata=metadata,
            )

            logger.info(
                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document

        except (EmptyContentError, ExtractionError):
            # Domain errors are already in their final form; pass them through
            raise
        except Exception as e:
            # logger.exception records the traceback alongside the message
            logger.exception(f"PPTX extraction failed for {file_path}: {e}")
            # Chain the original error so the root cause stays attached
            raise ExtractionError(
                message=f"Failed to extract text from {file_path.name}",
                details=str(e),
                file_path=str(file_path),
            ) from e

    def supports_file_type(self, file_extension: str) -> bool:
        """
        Check if this extractor supports a given file type.

        Matching is case-insensitive and tolerates a leading dot, so both
        'pptx' and '.PPTX' (the form produced by Path.suffix) are accepted.

        Args:
            file_extension: File extension (e.g., 'pptx', '.pptx')

        Returns:
            True if the extension is one this extractor handles
        """
        normalized = file_extension.lower().lstrip('.')
        return normalized in self._supported_extensions

    def get_supported_types(self) -> List[str]:
        """
        Get list of supported file extensions.

        Returns:
            A copy of the supported extensions ('pptx'), so callers cannot
            mutate the extractor's own list
        """
        return self._supported_extensions.copy()

    def _validate_file(self, file_path: Path) -> None:
        """
        Validate that the path exists, is a file, and is not empty.

        Args:
            file_path: Path to validate

        Raises:
            ExtractionError: If the path does not exist or is not a file
            EmptyContentError: If the file has a size of zero bytes
        """
        if not file_path.exists():
            raise ExtractionError(
                message=f"File not found: {file_path}",
                file_path=str(file_path),
            )

        if not file_path.is_file():
            raise ExtractionError(
                message=f"Path is not a file: {file_path}",
                file_path=str(file_path),
            )

        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))

    def _create_metadata(self, file_path: Path, result) -> DocumentMetadata:
        """
        Create document metadata from PPTX file and Docling result.

        Args:
            file_path: Path to the PPTX file
            result: Docling conversion result; only result.document.pages is read

        Returns:
            DocumentMetadata with the absolute path as source_id, a FILE
            source type, the on-disk size in bytes, and, when available,
            extra_metadata['slide_count'] as a string
        """
        stat = file_path.stat()

        # Slide count is best-effort: a failure here must not fail extraction.
        # NOTE(review): assumes Docling exposes one `pages` entry per slide
        # for PPTX input - confirm against the Docling version in use.
        slide_count = None
        try:
            if hasattr(result.document, 'pages'):
                slide_count = len(result.document.pages)
        except Exception as e:
            logger.warning(f"Could not extract slide count: {e}")

        extra_metadata = {}
        if slide_count is not None:
            # Stored as a string, matching the other values put in extra_metadata here
            extra_metadata['slide_count'] = str(slide_count)

        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            size_bytes=stat.st_size,
            extra_metadata=extra_metadata,
        )
|
||||
@ -17,8 +17,10 @@ from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
|
||||
from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
|
||||
from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor
|
||||
from .adapters.outgoing.extractors.factory import ExtractorFactory
|
||||
from .adapters.outgoing.extractors.html_extractor import HTMLExtractor
|
||||
from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
|
||||
from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
|
||||
from .adapters.outgoing.extractors.pptx_extractor import PPTXExtractor
|
||||
from .adapters.outgoing.extractors.txt_extractor import TxtExtractor
|
||||
from .adapters.outgoing.extractors.zip_extractor import ZipExtractor
|
||||
from .adapters.outgoing.persistence.in_memory_repository import (
|
||||
@ -123,6 +125,8 @@ class ApplicationContainer:
|
||||
factory.register_extractor(TxtExtractor())
|
||||
factory.register_extractor(MarkdownExtractor())
|
||||
factory.register_extractor(ZipExtractor())
|
||||
factory.register_extractor(HTMLExtractor())
|
||||
factory.register_extractor(PPTXExtractor())
|
||||
|
||||
logger.info(
|
||||
f"Registered extractors for: {factory.get_supported_types()}"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user