use docling in extractors

This commit is contained in:
m.dabbagh 2026-01-24 13:43:07 +03:30
parent ad163eb665
commit 2ccb38179d
5 changed files with 212 additions and 180 deletions

View File

@ -11,8 +11,7 @@ uvicorn[standard]==0.34.0
python-multipart==0.0.20
# Document Processing - Extractors
PyPDF2==3.0.1 # PDF extraction
python-docx==1.1.2 # DOCX extraction
docling # Unified document extraction (PDF, DOCX, Excel)
# Cloud Storage
boto3==1.35.94 # AWS S3 integration

View File

@ -1,13 +1,15 @@
"""
DOCX Extractor - Concrete implementation for Word document extraction.
This adapter implements the IExtractor port using python-docx library.
It maps python-docx exceptions to domain exceptions.
This adapter implements the IExtractor port using Docling library.
It maps Docling exceptions to domain exceptions.
"""
import logging
from pathlib import Path
from typing import List
from docling.document_converter import DocumentConverter
from ....core.domain.exceptions import (
EmptyContentError,
ExtractionError,
@ -21,22 +23,23 @@ logger = logging.getLogger(__name__)
class DocxExtractor(IExtractor):
"""
Concrete DOCX extractor using python-docx.
Concrete DOCX extractor using Docling.
This adapter:
1. Extracts text from DOCX files using python-docx
2. Handles paragraphs and tables
3. Maps exceptions to domain exceptions
1. Extracts text from DOCX files using Docling's DocumentConverter
2. Converts DOCX to Markdown format
3. Extracts metadata from document
"""
def __init__(self) -> None:
"""Initialize DOCX extractor."""
"""Initialize DOCX extractor with Docling converter."""
self._supported_extensions = ['docx']
logger.debug("DocxExtractor initialized")
self._converter = DocumentConverter()
logger.debug("DocxExtractor initialized with Docling")
def extract(self, file_path: Path) -> Document:
"""
Extract text and metadata from DOCX file.
Extract text and metadata from DOCX file using Docling.
Args:
file_path: Path to the DOCX file
@ -54,21 +57,22 @@ class DocxExtractor(IExtractor):
# Validate file
self._validate_file(file_path)
# Extract text
text = self._extract_text_from_docx(file_path)
# Convert DOCX to markdown using Docling
result = self._converter.convert(str(file_path))
markdown_text = result.document.export_to_markdown()
# Validate content
if not text or not text.strip():
if not markdown_text or not markdown_text.strip():
raise EmptyContentError(file_path=str(file_path))
# Create metadata
metadata = self._create_metadata(file_path)
# Build document with raw_markdown
document = Document(raw_markdown=text, metadata=metadata)
document = Document(raw_markdown=markdown_text, metadata=metadata)
logger.info(
f"Successfully extracted {len(text)} characters from {file_path.name}"
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
)
return document
@ -130,83 +134,6 @@ class DocxExtractor(IExtractor):
if file_path.stat().st_size == 0:
raise EmptyContentError(file_path=str(file_path))
def _extract_text_from_docx(self, file_path: Path) -> str:
"""
Extract text from DOCX using python-docx.
Args:
file_path: Path to DOCX file
Returns:
Extracted text content
Raises:
ExtractionError: If DOCX extraction fails
"""
try:
import docx
logger.debug(f"Reading DOCX: {file_path}")
document = docx.Document(file_path)
# Extract paragraphs
text_parts = self._extract_paragraphs(document)
# Extract tables
table_text = self._extract_tables(document)
if table_text:
text_parts.extend(table_text)
return "\n".join(text_parts)
except ImportError:
raise ExtractionError(
message="python-docx library not installed",
details="Install with: pip install python-docx",
file_path=str(file_path),
)
except Exception as e:
raise ExtractionError(
message=f"DOCX extraction failed: {str(e)}",
file_path=str(file_path),
)
def _extract_paragraphs(self, document) -> List[str]:
"""
Extract text from all paragraphs.
Args:
document: python-docx Document object
Returns:
List of paragraph texts
"""
paragraphs = []
for paragraph in document.paragraphs:
text = paragraph.text.strip()
if text:
paragraphs.append(text)
return paragraphs
def _extract_tables(self, document) -> List[str]:
"""
Extract text from all tables.
Args:
document: python-docx Document object
Returns:
List of table cell texts
"""
table_texts = []
for table in document.tables:
for row in table.rows:
for cell in row.cells:
text = cell.text.strip()
if text:
table_texts.append(text)
return table_texts
def _create_metadata(self, file_path: Path) -> DocumentMetadata:
"""
Create source-neutral document metadata from file.

View File

@ -0,0 +1,154 @@
"""
Excel Extractor - Concrete implementation for Excel file extraction.
This adapter implements the IExtractor port using Docling library.
It maps Docling exceptions to domain exceptions.
"""
import logging
from pathlib import Path
from typing import List
from docling.document_converter import DocumentConverter
from ....core.domain.exceptions import (
EmptyContentError,
ExtractionError,
)
from ....core.domain.models import Document, DocumentMetadata, SourceType
from ....core.ports.outgoing.extractor import IExtractor
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class ExcelExtractor(IExtractor):
    """
    Concrete Excel extractor using Docling.

    This adapter:
    1. Converts Excel files to Markdown using Docling's DocumentConverter
    2. Wraps the Markdown in a Document entity
    3. Attaches file-level metadata (absolute path, file name, size in bytes)
    """

    def __init__(self) -> None:
        """Initialize Excel extractor with Docling converter."""
        # NOTE(review): 'xls' (legacy binary format) is advertised here, but
        # Docling's Excel support may only cover OOXML workbooks (.xlsx) --
        # confirm against Docling's supported-formats list before relying on it.
        self._supported_extensions = ['xlsx', 'xls']
        self._converter = DocumentConverter()
        logger.debug("ExcelExtractor initialized with Docling")

    def extract(self, file_path: Path) -> Document:
        """
        Extract text and metadata from Excel file using Docling.

        Args:
            file_path: Path to the Excel file

        Returns:
            Document entity with extracted content and metadata

        Raises:
            ExtractionError: If the file is invalid or extraction fails
            EmptyContentError: If the file is empty or no text could be extracted
        """
        try:
            logger.info(f"Extracting text from Excel: {file_path}")

            # Validate file
            self._validate_file(file_path)

            # Convert Excel to markdown using Docling
            result = self._converter.convert(str(file_path))
            markdown_text = result.document.export_to_markdown()

            # Validate content
            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))

            # Create metadata
            metadata = self._create_metadata(file_path)

            # Build document with raw_markdown
            document = Document(raw_markdown=markdown_text, metadata=metadata)

            logger.info(
                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document

        except (EmptyContentError, ExtractionError):
            # Domain exceptions are already in their final form; pass them through.
            raise
        except Exception as e:
            # Anything else (Docling / I/O failures) is mapped to the domain error.
            logger.error("Excel extraction failed for %s: %s", file_path, e)
            # Chain explicitly so the original failure stays attached as __cause__.
            raise ExtractionError(
                message=f"Failed to extract text from {file_path.name}",
                details=str(e),
                file_path=str(file_path),
            ) from e

    def supports_file_type(self, file_extension: str) -> bool:
        """
        Check if this extractor supports the given file extension.

        Args:
            file_extension: File extension, with or without a leading dot
                (e.g., 'xlsx', '.xlsx', 'XLS')

        Returns:
            True if the extension is one of the supported Excel extensions
        """
        # Accept Path.suffix-style input ('.xlsx') as well as bare extensions.
        normalized = file_extension.lower().lstrip('.')
        return normalized in self._supported_extensions

    def get_supported_types(self) -> List[str]:
        """
        Get list of supported file extensions.

        Returns:
            List containing 'xlsx' and 'xls'
        """
        # Return a copy so callers cannot mutate the extractor's own list.
        return self._supported_extensions.copy()

    def _validate_file(self, file_path: Path) -> None:
        """
        Validate that the path exists, is a regular file, and is not empty.

        Args:
            file_path: Path to validate

        Raises:
            ExtractionError: If the path does not exist or is not a file
            EmptyContentError: If the file has a size of zero bytes
        """
        if not file_path.exists():
            raise ExtractionError(
                message=f"File not found: {file_path}",
                file_path=str(file_path),
            )

        if not file_path.is_file():
            raise ExtractionError(
                message=f"Path is not a file: {file_path}",
                file_path=str(file_path),
            )

        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))

    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
        """
        Create document metadata from the Excel file's path and size.

        Args:
            file_path: Path to the Excel file

        Returns:
            DocumentMetadata entity
        """
        stat = file_path.stat()

        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            display_name=file_path.name,
            size_bytes=stat.st_size,
        )

View File

@ -1,13 +1,15 @@
"""
PDF Extractor - Concrete implementation for PDF text extraction.
This adapter implements the IExtractor port using PyPDF2 library.
It maps PyPDF2 exceptions to domain exceptions.
This adapter implements the IExtractor port using Docling library.
It maps Docling exceptions to domain exceptions.
"""
import logging
from pathlib import Path
from typing import List
from docling.document_converter import DocumentConverter
from ....core.domain.exceptions import (
EmptyContentError,
ExtractionError,
@ -21,22 +23,23 @@ logger = logging.getLogger(__name__)
class PDFExtractor(IExtractor):
"""
Concrete PDF extractor using PyPDF2.
Concrete PDF extractor using Docling.
This adapter:
1. Extracts text from PDF files using PyPDF2
2. Maps PyPDF2 exceptions to domain exceptions
3. Creates Document entities with metadata
1. Extracts text from PDF files using Docling's DocumentConverter
2. Converts PDF to Markdown format
3. Extracts metadata including page count
"""
def __init__(self) -> None:
"""Initialize PDF extractor."""
"""Initialize PDF extractor with Docling converter."""
self._supported_extensions = ['pdf']
logger.debug("PDFExtractor initialized")
self._converter = DocumentConverter()
logger.debug("PDFExtractor initialized with Docling")
def extract(self, file_path: Path) -> Document:
"""
Extract text and metadata from PDF file.
Extract text and metadata from PDF file using Docling.
Args:
file_path: Path to the PDF file
@ -54,21 +57,22 @@ class PDFExtractor(IExtractor):
# Validate file
self._validate_file(file_path)
# Extract text
text = self._extract_text_from_pdf(file_path)
# Convert PDF to markdown using Docling
result = self._converter.convert(str(file_path))
markdown_text = result.document.export_to_markdown()
# Validate content
if not text or not text.strip():
if not markdown_text or not markdown_text.strip():
raise EmptyContentError(file_path=str(file_path))
# Create metadata
metadata = self._create_metadata(file_path)
# Create metadata with page count from Docling result
metadata = self._create_metadata(file_path, result)
# Build document with raw_markdown
document = Document(raw_markdown=text, metadata=metadata)
document = Document(raw_markdown=markdown_text, metadata=metadata)
logger.info(
f"Successfully extracted {len(text)} characters from {file_path.name}"
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
)
return document
@ -130,89 +134,35 @@ class PDFExtractor(IExtractor):
if file_path.stat().st_size == 0:
raise EmptyContentError(file_path=str(file_path))
def _extract_text_from_pdf(self, file_path: Path) -> str:
def _create_metadata(self, file_path: Path, result) -> DocumentMetadata:
"""
Extract text from PDF using PyPDF2.
Create document metadata from PDF file and Docling result.
Args:
file_path: Path to PDF file
Returns:
Extracted text content
Raises:
ExtractionError: If PDF extraction fails
"""
try:
import PyPDF2
logger.debug(f"Reading PDF: {file_path}")
text_parts = []
with open(file_path, 'rb') as pdf_file:
pdf_reader = PyPDF2.PdfReader(pdf_file)
num_pages = len(pdf_reader.pages)
logger.debug(f"PDF has {num_pages} pages")
for page_num, page in enumerate(pdf_reader.pages, start=1):
page_text = self._extract_page_text(page, page_num)
if page_text:
text_parts.append(page_text)
return "\n\n".join(text_parts)
except ImportError:
raise ExtractionError(
message="PyPDF2 library not installed",
details="Install with: pip install PyPDF2",
file_path=str(file_path),
)
except Exception as e:
raise ExtractionError(
message=f"PDF extraction failed: {str(e)}",
file_path=str(file_path),
)
def _extract_page_text(self, page, page_num: int) -> str:
"""
Extract text from a single page.
Args:
page: PyPDF2 page object
page_num: Page number for logging
Returns:
Extracted page text
"""
try:
import PyPDF2
text = page.extract_text()
logger.debug(f"Extracted page {page_num}")
return text
except PyPDF2.errors.PdfReadError as e:
logger.warning(f"Failed to extract page {page_num}: {str(e)}")
return ""
except Exception as e:
logger.warning(f"Error on page {page_num}: {str(e)}")
return ""
def _create_metadata(self, file_path: Path) -> DocumentMetadata:
"""
Create source-neutral document metadata from file.
Args:
file_path: Path to the file
file_path: Path to the PDF file
result: Docling conversion result
Returns:
DocumentMetadata entity
"""
stat = file_path.stat()
# Extract page count from Docling result
page_count = None
try:
if hasattr(result.document, 'pages'):
page_count = len(result.document.pages)
except Exception as e:
logger.warning(f"Could not extract page count: {str(e)}")
extra_metadata = {}
if page_count is not None:
extra_metadata['page_count'] = str(page_count)
return DocumentMetadata(
source_id=str(file_path.absolute()),
source_type=SourceType.FILE,
display_name=file_path.name,
size_bytes=stat.st_size,
extra_metadata=extra_metadata,
)

View File

@ -15,6 +15,7 @@ from .adapters.outgoing.chunkers.context import ChunkingContext
from .adapters.outgoing.chunkers.fixed_size_chunker import FixedSizeChunker
from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor
from .adapters.outgoing.extractors.factory import ExtractorFactory
from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
@ -118,6 +119,7 @@ class ApplicationContainer:
# Register all extractors
factory.register_extractor(PDFExtractor())
factory.register_extractor(DocxExtractor())
factory.register_extractor(ExcelExtractor())
factory.register_extractor(TxtExtractor())
factory.register_extractor(MarkdownExtractor())
factory.register_extractor(ZipExtractor())