feat: add pptx_extractor and html_extractor
This commit is contained in:
parent
b53f8c47d3
commit
b57792eb41
@ -283,14 +283,14 @@ async def perform_chunking(
|
|||||||
description="Upload a file and extract text content with metadata",
|
description="Upload a file and extract text content with metadata",
|
||||||
)
|
)
|
||||||
async def extract_document(
|
async def extract_document(
|
||||||
file: UploadFile = File(..., description="Document file to extract (pdf, docx, txt, md, zip)"),
|
file: UploadFile = File(..., description="Document file to extract (pdf, docx, pptx, html, txt, md, xlsx, zip)"),
|
||||||
service: ITextProcessor = Depends(get_service),
|
service: ITextProcessor = Depends(get_service),
|
||||||
) -> DocumentResponse:
|
) -> DocumentResponse:
|
||||||
"""
|
"""
|
||||||
Extract text content from uploaded file.
|
Extract text content from uploaded file.
|
||||||
|
|
||||||
This endpoint handles file extraction only:
|
This endpoint handles file extraction only:
|
||||||
1. Accepts file upload (PDF, DOCX, TXT, MD, ZIP)
|
1. Accepts file upload (PDF, DOCX, PPTX, HTML, TXT, MD, XLSX, ZIP)
|
||||||
2. Extracts raw text content using appropriate extractor
|
2. Extracts raw text content using appropriate extractor
|
||||||
3. Returns Document entity with metadata (no parsing)
|
3. Returns Document entity with metadata (no parsing)
|
||||||
"""
|
"""
|
||||||
@ -312,7 +312,7 @@ async def extract_document(
|
|||||||
description="Upload a file, extract text, parse markdown, and return chunks",
|
description="Upload a file, extract text, parse markdown, and return chunks",
|
||||||
)
|
)
|
||||||
async def process_file(
|
async def process_file(
|
||||||
file: UploadFile = File(..., description="Document file to process (pdf, docx, txt, md, zip)"),
|
file: UploadFile = File(..., description="Document file to process (pdf, docx, pptx, html, txt, md, xlsx, zip)"),
|
||||||
strategy: ChunkingStrategy = Depends(get_chunking_strategy),
|
strategy: ChunkingStrategy = Depends(get_chunking_strategy),
|
||||||
service: ITextProcessor = Depends(get_service),
|
service: ITextProcessor = Depends(get_service),
|
||||||
) -> ChunkListResponse:
|
) -> ChunkListResponse:
|
||||||
@ -320,7 +320,7 @@ async def process_file(
|
|||||||
Complete file processing pipeline: Upload → Extract → Parse → Chunk.
|
Complete file processing pipeline: Upload → Extract → Parse → Chunk.
|
||||||
|
|
||||||
This endpoint handles the full document processing workflow:
|
This endpoint handles the full document processing workflow:
|
||||||
1. Accepts file upload (PDF, DOCX, TXT, MD, ZIP)
|
1. Accepts file upload (PDF, DOCX, PPTX, HTML, TXT, MD, XLSX, ZIP)
|
||||||
2. Extracts text content using appropriate extractor
|
2. Extracts text content using appropriate extractor
|
||||||
3. Parses markdown structure into sections
|
3. Parses markdown structure into sections
|
||||||
4. Chunks content according to strategy
|
4. Chunks content according to strategy
|
||||||
@ -351,7 +351,7 @@ async def health_check() -> HealthCheckResponse:
|
|||||||
return HealthCheckResponse(
|
return HealthCheckResponse(
|
||||||
status="healthy",
|
status="healthy",
|
||||||
version="1.0.0",
|
version="1.0.0",
|
||||||
supported_file_types=["pdf", "docx", "txt", "md", "markdown", "zip"],
|
supported_file_types=["pdf", "docx", "pptx", "html", "htm", "txt", "md", "markdown", "zip", "xlsx"],
|
||||||
available_strategies=["fixed_size", "paragraph"],
|
available_strategies=["fixed_size", "paragraph"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
158
src/adapters/outgoing/extractors/html_extractor.py
Normal file
158
src/adapters/outgoing/extractors/html_extractor.py
Normal file
@ -0,0 +1,158 @@
|
|||||||
|
"""
|
||||||
|
HTML Extractor - Concrete implementation for HTML text extraction.
|
||||||
|
|
||||||
|
This adapter implements the IExtractor port using Docling library.
|
||||||
|
It maps Docling exceptions to domain exceptions.
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
|
from ....core.domain.exceptions import (
|
||||||
|
EmptyContentError,
|
||||||
|
ExtractionError,
|
||||||
|
)
|
||||||
|
from ....core.domain.models import Document, DocumentMetadata, SourceType
|
||||||
|
from ....core.ports.outgoing.extractor import IExtractor
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLExtractor(IExtractor):
    """
    Concrete HTML extractor using Docling.

    This adapter:
    1. Converts HTML files with Docling's DocumentConverter
    2. Exports the converted document to Markdown
    3. Wraps the Markdown in a Document entity, titled with the file stem

    Any unexpected failure during extraction is re-raised as the domain
    ExtractionError, with the original exception chained as its cause.
    """

    def __init__(self) -> None:
        """Initialize HTML extractor with Docling converter."""
        self._supported_extensions = ['html', 'htm']
        self._converter = DocumentConverter()
        logger.info("HTML Extractor initialized with Docling DocumentConverter")

    def extract(self, file_path: Path) -> Document:
        """
        Extract text and metadata from HTML file using Docling.

        Args:
            file_path: Path to the HTML file (a str or os.PathLike is
                also accepted and converted to a Path)

        Returns:
            Document entity with extracted content and metadata

        Raises:
            ExtractionError: If the file is missing, is not a regular file,
                or conversion fails for any other reason
            EmptyContentError: If the file is empty or no text could be
                extracted
        """
        # Normalise up front: the error handler below reads `file_path.name`,
        # so a plain string argument would otherwise make the handler itself
        # fail with AttributeError instead of raising ExtractionError.
        file_path = Path(file_path)

        try:
            logger.info(f"Extracting text from HTML: {file_path}")

            # Validate file
            self._validate_file(file_path)

            # Convert HTML to markdown using Docling
            result = self._converter.convert(str(file_path))
            markdown_text = result.document.export_to_markdown()

            # Validate content
            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))

            # Create metadata
            metadata = self._create_metadata(file_path)

            # Build document with raw_markdown
            document = Document(
                raw_markdown=markdown_text,
                title=file_path.stem,
                metadata=metadata,
            )

            logger.info(
                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document

        except (EmptyContentError, ExtractionError):
            # Domain exceptions already carry the right context; pass through.
            raise
        except Exception as e:
            logger.error(f"HTML extraction failed for {file_path}: {str(e)}")
            # Chain the original exception so its traceback is kept as the cause.
            raise ExtractionError(
                message=f"Failed to extract text from {file_path.name}",
                details=str(e),
                file_path=str(file_path),
            ) from e

    def supports_file_type(self, file_extension: str) -> bool:
        """
        Check if this extractor supports a given file type.

        The comparison is case-insensitive; the extension is expected
        without a leading dot.

        Args:
            file_extension: File extension (e.g., 'html', 'htm')

        Returns:
            True if the extension is 'html' or 'htm'
        """
        return file_extension.lower() in self._supported_extensions

    def get_supported_types(self) -> List[str]:
        """
        Get list of supported file extensions.

        Returns:
            A copy of the supported extensions ('html' and 'htm'), so callers
            cannot mutate the extractor's own list
        """
        return self._supported_extensions.copy()

    def _validate_file(self, file_path: Path) -> None:
        """
        Validate that the path exists, is a regular file, and is not empty.

        Read permission is not checked here; an unreadable file surfaces
        later, when the converter tries to open it.

        Args:
            file_path: Path to validate

        Raises:
            ExtractionError: If the path does not exist or is not a file
            EmptyContentError: If the file has a size of zero bytes
        """
        if not file_path.exists():
            raise ExtractionError(
                message=f"File not found: {file_path}",
                file_path=str(file_path),
            )

        if not file_path.is_file():
            raise ExtractionError(
                message=f"Path is not a file: {file_path}",
                file_path=str(file_path),
            )

        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))

    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
        """
        Create document metadata from HTML file.

        Args:
            file_path: Path to the HTML file

        Returns:
            DocumentMetadata with the absolute path as source_id, a FILE
            source type, and the file size in bytes
        """
        stat = file_path.stat()

        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            size_bytes=stat.st_size,
        )
|
||||||
173
src/adapters/outgoing/extractors/pptx_extractor.py
Normal file
173
src/adapters/outgoing/extractors/pptx_extractor.py
Normal file
@ -0,0 +1,173 @@
|
|||||||
|
"""
|
||||||
|
PPTX Extractor - Concrete implementation for PowerPoint text extraction.
|
||||||
|
|
||||||
|
This adapter implements the IExtractor port using Docling library.
|
||||||
|
It maps Docling exceptions to domain exceptions.
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
|
from ....core.domain.exceptions import (
|
||||||
|
EmptyContentError,
|
||||||
|
ExtractionError,
|
||||||
|
)
|
||||||
|
from ....core.domain.models import Document, DocumentMetadata, SourceType
|
||||||
|
from ....core.ports.outgoing.extractor import IExtractor
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PPTXExtractor(IExtractor):
    """
    Concrete PPTX extractor using Docling.

    This adapter:
    1. Converts PowerPoint files with Docling's DocumentConverter
    2. Exports the converted document to Markdown
    3. Wraps the Markdown in a Document entity, titled with the file stem
    4. Records a best-effort slide count in the metadata's extra_metadata

    Any unexpected failure during extraction is re-raised as the domain
    ExtractionError, with the original exception chained as its cause.
    """

    def __init__(self) -> None:
        """Initialize PPTX extractor with Docling converter."""
        self._supported_extensions = ['pptx']
        self._converter = DocumentConverter()
        logger.info("PPTX Extractor initialized with Docling DocumentConverter")

    def extract(self, file_path: Path) -> Document:
        """
        Extract text and metadata from PPTX file using Docling.

        Args:
            file_path: Path to the PPTX file (a str or os.PathLike is
                also accepted and converted to a Path)

        Returns:
            Document entity with extracted content and metadata

        Raises:
            ExtractionError: If the file is missing, is not a regular file,
                or conversion fails for any other reason
            EmptyContentError: If the file is empty or no text could be
                extracted
        """
        # Normalise up front: the error handler below reads `file_path.name`,
        # so a plain string argument would otherwise make the handler itself
        # fail with AttributeError instead of raising ExtractionError.
        file_path = Path(file_path)

        try:
            logger.info(f"Extracting text from PPTX: {file_path}")

            # Validate file
            self._validate_file(file_path)

            # Convert PPTX to markdown using Docling
            result = self._converter.convert(str(file_path))
            markdown_text = result.document.export_to_markdown()

            # Validate content
            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))

            # Create metadata with slide count from Docling result
            metadata = self._create_metadata(file_path, result)

            # Build document with raw_markdown
            document = Document(
                raw_markdown=markdown_text,
                title=file_path.stem,
                metadata=metadata,
            )

            logger.info(
                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document

        except (EmptyContentError, ExtractionError):
            # Domain exceptions already carry the right context; pass through.
            raise
        except Exception as e:
            logger.error(f"PPTX extraction failed for {file_path}: {str(e)}")
            # Chain the original exception so its traceback is kept as the cause.
            raise ExtractionError(
                message=f"Failed to extract text from {file_path.name}",
                details=str(e),
                file_path=str(file_path),
            ) from e

    def supports_file_type(self, file_extension: str) -> bool:
        """
        Check if this extractor supports a given file type.

        The comparison is case-insensitive; the extension is expected
        without a leading dot.

        Args:
            file_extension: File extension (e.g., 'pptx')

        Returns:
            True if the extension is 'pptx'
        """
        return file_extension.lower() in self._supported_extensions

    def get_supported_types(self) -> List[str]:
        """
        Get list of supported file extensions.

        Returns:
            A copy of the supported extensions ('pptx'), so callers cannot
            mutate the extractor's own list
        """
        return self._supported_extensions.copy()

    def _validate_file(self, file_path: Path) -> None:
        """
        Validate that the path exists, is a regular file, and is not empty.

        Read permission is not checked here; an unreadable file surfaces
        later, when the converter tries to open it.

        Args:
            file_path: Path to validate

        Raises:
            ExtractionError: If the path does not exist or is not a file
            EmptyContentError: If the file has a size of zero bytes
        """
        if not file_path.exists():
            raise ExtractionError(
                message=f"File not found: {file_path}",
                file_path=str(file_path),
            )

        if not file_path.is_file():
            raise ExtractionError(
                message=f"Path is not a file: {file_path}",
                file_path=str(file_path),
            )

        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))

    def _create_metadata(self, file_path: Path, result) -> DocumentMetadata:
        """
        Create document metadata from PPTX file and Docling result.

        The slide count is best-effort: if it cannot be read from the
        result, a warning is logged and the metadata is built without it.

        Args:
            file_path: Path to the PPTX file
            result: Docling conversion result; only `result.document.pages`
                is read

        Returns:
            DocumentMetadata with the absolute path as source_id, a FILE
            source type, the file size in bytes, and, when available,
            extra_metadata['slide_count'] as a string
        """
        stat = file_path.stat()

        # Extract slide count from Docling result.
        # NOTE(review): assumes Docling exposes one entry in `pages` per
        # slide for PPTX input - confirm against the Docling version in use.
        slide_count = None
        try:
            if hasattr(result.document, 'pages'):
                slide_count = len(result.document.pages)
        except Exception as e:
            # Deliberately broad: a missing slide count must not fail extraction.
            logger.warning(f"Could not extract slide count: {str(e)}")

        extra_metadata = {}
        if slide_count is not None:
            # Stored as a string, matching the original metadata format.
            extra_metadata['slide_count'] = str(slide_count)

        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            size_bytes=stat.st_size,
            extra_metadata=extra_metadata,
        )
|
||||||
@ -17,8 +17,10 @@ from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
|
|||||||
from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
|
from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
|
||||||
from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor
|
from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor
|
||||||
from .adapters.outgoing.extractors.factory import ExtractorFactory
|
from .adapters.outgoing.extractors.factory import ExtractorFactory
|
||||||
|
from .adapters.outgoing.extractors.html_extractor import HTMLExtractor
|
||||||
from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
|
from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
|
||||||
from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
|
from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
|
||||||
|
from .adapters.outgoing.extractors.pptx_extractor import PPTXExtractor
|
||||||
from .adapters.outgoing.extractors.txt_extractor import TxtExtractor
|
from .adapters.outgoing.extractors.txt_extractor import TxtExtractor
|
||||||
from .adapters.outgoing.extractors.zip_extractor import ZipExtractor
|
from .adapters.outgoing.extractors.zip_extractor import ZipExtractor
|
||||||
from .adapters.outgoing.persistence.in_memory_repository import (
|
from .adapters.outgoing.persistence.in_memory_repository import (
|
||||||
@ -123,6 +125,8 @@ class ApplicationContainer:
|
|||||||
factory.register_extractor(TxtExtractor())
|
factory.register_extractor(TxtExtractor())
|
||||||
factory.register_extractor(MarkdownExtractor())
|
factory.register_extractor(MarkdownExtractor())
|
||||||
factory.register_extractor(ZipExtractor())
|
factory.register_extractor(ZipExtractor())
|
||||||
|
factory.register_extractor(HTMLExtractor())
|
||||||
|
factory.register_extractor(PPTXExtractor())
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Registered extractors for: {factory.get_supported_types()}"
|
f"Registered extractors for: {factory.get_supported_types()}"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user