use docling in extractors
This commit is contained in:
parent
ad163eb665
commit
2ccb38179d
@ -11,8 +11,7 @@ uvicorn[standard]==0.34.0
|
||||
python-multipart==0.0.20
|
||||
|
||||
# Document Processing - Extractors
|
||||
PyPDF2==3.0.1 # PDF extraction
|
||||
python-docx==1.1.2 # DOCX extraction
|
||||
docling # Unified document extraction (PDF, DOCX, Excel)
|
||||
|
||||
# Cloud Storage
|
||||
boto3==1.35.94 # AWS S3 integration
|
||||
|
||||
@ -1,13 +1,15 @@
|
||||
"""
|
||||
DOCX Extractor - Concrete implementation for Word document extraction.
|
||||
|
||||
This adapter implements the IExtractor port using python-docx library.
|
||||
It maps python-docx exceptions to domain exceptions.
|
||||
This adapter implements the IExtractor port using Docling library.
|
||||
It maps Docling exceptions to domain exceptions.
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from ....core.domain.exceptions import (
|
||||
EmptyContentError,
|
||||
ExtractionError,
|
||||
@ -21,22 +23,23 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class DocxExtractor(IExtractor):
|
||||
"""
|
||||
Concrete DOCX extractor using python-docx.
|
||||
Concrete DOCX extractor using Docling.
|
||||
|
||||
This adapter:
|
||||
1. Extracts text from DOCX files using python-docx
|
||||
2. Handles paragraphs and tables
|
||||
3. Maps exceptions to domain exceptions
|
||||
1. Extracts text from DOCX files using Docling's DocumentConverter
|
||||
2. Converts DOCX to Markdown format
|
||||
3. Extracts metadata from document
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize DOCX extractor."""
|
||||
"""Initialize DOCX extractor with Docling converter."""
|
||||
self._supported_extensions = ['docx']
|
||||
logger.debug("DocxExtractor initialized")
|
||||
self._converter = DocumentConverter()
|
||||
logger.debug("DocxExtractor initialized with Docling")
|
||||
|
||||
def extract(self, file_path: Path) -> Document:
|
||||
"""
|
||||
Extract text and metadata from DOCX file.
|
||||
Extract text and metadata from DOCX file using Docling.
|
||||
|
||||
Args:
|
||||
file_path: Path to the DOCX file
|
||||
@ -54,21 +57,22 @@ class DocxExtractor(IExtractor):
|
||||
# Validate file
|
||||
self._validate_file(file_path)
|
||||
|
||||
# Extract text
|
||||
text = self._extract_text_from_docx(file_path)
|
||||
# Convert DOCX to markdown using Docling
|
||||
result = self._converter.convert(str(file_path))
|
||||
markdown_text = result.document.export_to_markdown()
|
||||
|
||||
# Validate content
|
||||
if not text or not text.strip():
|
||||
if not markdown_text or not markdown_text.strip():
|
||||
raise EmptyContentError(file_path=str(file_path))
|
||||
|
||||
# Create metadata
|
||||
metadata = self._create_metadata(file_path)
|
||||
|
||||
# Build document with raw_markdown
|
||||
document = Document(raw_markdown=text, metadata=metadata)
|
||||
document = Document(raw_markdown=markdown_text, metadata=metadata)
|
||||
|
||||
logger.info(
|
||||
f"Successfully extracted {len(text)} characters from {file_path.name}"
|
||||
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
|
||||
)
|
||||
return document
|
||||
|
||||
@ -130,83 +134,6 @@ class DocxExtractor(IExtractor):
|
||||
if file_path.stat().st_size == 0:
|
||||
raise EmptyContentError(file_path=str(file_path))
|
||||
|
||||
def _extract_text_from_docx(self, file_path: Path) -> str:
|
||||
"""
|
||||
Extract text from DOCX using python-docx.
|
||||
|
||||
Args:
|
||||
file_path: Path to DOCX file
|
||||
|
||||
Returns:
|
||||
Extracted text content
|
||||
|
||||
Raises:
|
||||
ExtractionError: If DOCX extraction fails
|
||||
"""
|
||||
try:
|
||||
import docx
|
||||
|
||||
logger.debug(f"Reading DOCX: {file_path}")
|
||||
document = docx.Document(file_path)
|
||||
|
||||
# Extract paragraphs
|
||||
text_parts = self._extract_paragraphs(document)
|
||||
|
||||
# Extract tables
|
||||
table_text = self._extract_tables(document)
|
||||
if table_text:
|
||||
text_parts.extend(table_text)
|
||||
|
||||
return "\n".join(text_parts)
|
||||
|
||||
except ImportError:
|
||||
raise ExtractionError(
|
||||
message="python-docx library not installed",
|
||||
details="Install with: pip install python-docx",
|
||||
file_path=str(file_path),
|
||||
)
|
||||
except Exception as e:
|
||||
raise ExtractionError(
|
||||
message=f"DOCX extraction failed: {str(e)}",
|
||||
file_path=str(file_path),
|
||||
)
|
||||
|
||||
def _extract_paragraphs(self, document) -> List[str]:
|
||||
"""
|
||||
Extract text from all paragraphs.
|
||||
|
||||
Args:
|
||||
document: python-docx Document object
|
||||
|
||||
Returns:
|
||||
List of paragraph texts
|
||||
"""
|
||||
paragraphs = []
|
||||
for paragraph in document.paragraphs:
|
||||
text = paragraph.text.strip()
|
||||
if text:
|
||||
paragraphs.append(text)
|
||||
return paragraphs
|
||||
|
||||
def _extract_tables(self, document) -> List[str]:
|
||||
"""
|
||||
Extract text from all tables.
|
||||
|
||||
Args:
|
||||
document: python-docx Document object
|
||||
|
||||
Returns:
|
||||
List of table cell texts
|
||||
"""
|
||||
table_texts = []
|
||||
for table in document.tables:
|
||||
for row in table.rows:
|
||||
for cell in row.cells:
|
||||
text = cell.text.strip()
|
||||
if text:
|
||||
table_texts.append(text)
|
||||
return table_texts
|
||||
|
||||
def _create_metadata(self, file_path: Path) -> DocumentMetadata:
|
||||
"""
|
||||
Create source-neutral document metadata from file.
|
||||
|
||||
154
src/adapters/outgoing/extractors/excel_extractor.py
Normal file
154
src/adapters/outgoing/extractors/excel_extractor.py
Normal file
@ -0,0 +1,154 @@
|
||||
"""
|
||||
Excel Extractor - Concrete implementation for Excel file extraction.
|
||||
|
||||
This adapter implements the IExtractor port using Docling library.
|
||||
It maps Docling exceptions to domain exceptions.
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from ....core.domain.exceptions import (
|
||||
EmptyContentError,
|
||||
ExtractionError,
|
||||
)
|
||||
from ....core.domain.models import Document, DocumentMetadata, SourceType
|
||||
from ....core.ports.outgoing.extractor import IExtractor
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ExcelExtractor(IExtractor):
    """
    Concrete Excel extractor using Docling.

    This adapter:
    1. Converts Excel files to Markdown using Docling's DocumentConverter
    2. Rejects missing, non-file, empty or text-less inputs with domain exceptions
    3. Builds file-level metadata (absolute path, file name, size in bytes)
    """

    def __init__(self) -> None:
        """Initialize Excel extractor with Docling converter."""
        # NOTE(review): 'xls' is advertised here, but it is not verified in this
        # file that Docling can convert legacy .xls workbooks - confirm against
        # Docling's supported formats. If it cannot, convert() will raise and the
        # failure surfaces as ExtractionError from extract().
        self._supported_extensions = ['xlsx', 'xls']
        self._converter = DocumentConverter()
        logger.debug("ExcelExtractor initialized with Docling")

    def extract(self, file_path: Path) -> Document:
        """
        Extract text and metadata from Excel file using Docling.

        Args:
            file_path: Path to the Excel file

        Returns:
            Document entity with the Markdown export as raw_markdown and
            file-level metadata

        Raises:
            ExtractionError: If the file is missing, is not a regular file,
                or conversion fails for any other reason
            EmptyContentError: If the file is zero bytes or the Markdown
                export is empty or whitespace-only
        """
        try:
            logger.info("Extracting text from Excel: %s", file_path)

            # Fail fast on missing / non-file / zero-byte paths before
            # handing the file to Docling.
            self._validate_file(file_path)

            # Convert Excel to markdown using Docling
            result = self._converter.convert(str(file_path))
            markdown_text = result.document.export_to_markdown()

            # A successful conversion can still yield no usable text.
            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))

            metadata = self._create_metadata(file_path)
            document = Document(raw_markdown=markdown_text, metadata=metadata)

            logger.info(
                "Successfully extracted %d characters from %s",
                len(markdown_text),
                file_path.name,
            )
            return document

        except (EmptyContentError, ExtractionError):
            # Domain exceptions are already in the shape callers expect.
            raise
        except Exception as e:
            # Boundary of the adapter: translate anything else (Docling or
            # I/O failures) into the domain exception, keeping the original
            # exception attached as the explicit cause.
            logger.error("Excel extraction failed for %s: %s", file_path, e)
            raise ExtractionError(
                message=f"Failed to extract text from {file_path.name}",
                details=str(e),
                file_path=str(file_path),
            ) from e

    def supports_file_type(self, file_extension: str) -> bool:
        """
        Check if this extractor supports the given file extension.

        The comparison is case-insensitive; the extension is expected
        without a leading dot.

        Args:
            file_extension: File extension (e.g., 'xlsx', 'xls')

        Returns:
            True if the extension is one of the supported Excel extensions
        """
        return file_extension.lower() in self._supported_extensions

    def get_supported_types(self) -> List[str]:
        """
        Get list of supported file extensions.

        Returns:
            A new list containing 'xlsx' and 'xls'; mutating it does not
            affect the extractor
        """
        return self._supported_extensions.copy()

    def _validate_file(self, file_path: Path) -> None:
        """
        Validate that the path exists, is a regular file and is not empty.

        Args:
            file_path: Path to validate

        Raises:
            ExtractionError: If the path does not exist or is not a file
            EmptyContentError: If the file size is zero bytes
        """
        if not file_path.exists():
            raise ExtractionError(
                message=f"File not found: {file_path}",
                file_path=str(file_path),
            )

        if not file_path.is_file():
            raise ExtractionError(
                message=f"Path is not a file: {file_path}",
                file_path=str(file_path),
            )

        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))

    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
        """
        Create document metadata from the Excel file on disk.

        Only file-system information is used; nothing is read from the
        workbook contents.

        Args:
            file_path: Path to the Excel file

        Returns:
            DocumentMetadata with the absolute path as source_id, the file
            name as display_name and the on-disk size in bytes
        """
        stat = file_path.stat()

        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            display_name=file_path.name,
            size_bytes=stat.st_size,
        )
|
||||
@ -1,13 +1,15 @@
|
||||
"""
|
||||
PDF Extractor - Concrete implementation for PDF text extraction.
|
||||
|
||||
This adapter implements the IExtractor port using PyPDF2 library.
|
||||
It maps PyPDF2 exceptions to domain exceptions.
|
||||
This adapter implements the IExtractor port using Docling library.
|
||||
It maps Docling exceptions to domain exceptions.
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from ....core.domain.exceptions import (
|
||||
EmptyContentError,
|
||||
ExtractionError,
|
||||
@ -21,22 +23,23 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class PDFExtractor(IExtractor):
|
||||
"""
|
||||
Concrete PDF extractor using PyPDF2.
|
||||
Concrete PDF extractor using Docling.
|
||||
|
||||
This adapter:
|
||||
1. Extracts text from PDF files using PyPDF2
|
||||
2. Maps PyPDF2 exceptions to domain exceptions
|
||||
3. Creates Document entities with metadata
|
||||
1. Extracts text from PDF files using Docling's DocumentConverter
|
||||
2. Converts PDF to Markdown format
|
||||
3. Extracts metadata including page count
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize PDF extractor."""
|
||||
"""Initialize PDF extractor with Docling converter."""
|
||||
self._supported_extensions = ['pdf']
|
||||
logger.debug("PDFExtractor initialized")
|
||||
self._converter = DocumentConverter()
|
||||
logger.debug("PDFExtractor initialized with Docling")
|
||||
|
||||
def extract(self, file_path: Path) -> Document:
|
||||
"""
|
||||
Extract text and metadata from PDF file.
|
||||
Extract text and metadata from PDF file using Docling.
|
||||
|
||||
Args:
|
||||
file_path: Path to the PDF file
|
||||
@ -54,21 +57,22 @@ class PDFExtractor(IExtractor):
|
||||
# Validate file
|
||||
self._validate_file(file_path)
|
||||
|
||||
# Extract text
|
||||
text = self._extract_text_from_pdf(file_path)
|
||||
# Convert PDF to markdown using Docling
|
||||
result = self._converter.convert(str(file_path))
|
||||
markdown_text = result.document.export_to_markdown()
|
||||
|
||||
# Validate content
|
||||
if not text or not text.strip():
|
||||
if not markdown_text or not markdown_text.strip():
|
||||
raise EmptyContentError(file_path=str(file_path))
|
||||
|
||||
# Create metadata
|
||||
metadata = self._create_metadata(file_path)
|
||||
# Create metadata with page count from Docling result
|
||||
metadata = self._create_metadata(file_path, result)
|
||||
|
||||
# Build document with raw_markdown
|
||||
document = Document(raw_markdown=text, metadata=metadata)
|
||||
document = Document(raw_markdown=markdown_text, metadata=metadata)
|
||||
|
||||
logger.info(
|
||||
f"Successfully extracted {len(text)} characters from {file_path.name}"
|
||||
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
|
||||
)
|
||||
return document
|
||||
|
||||
@ -130,89 +134,35 @@ class PDFExtractor(IExtractor):
|
||||
if file_path.stat().st_size == 0:
|
||||
raise EmptyContentError(file_path=str(file_path))
|
||||
|
||||
def _extract_text_from_pdf(self, file_path: Path) -> str:
|
||||
def _create_metadata(self, file_path: Path, result) -> DocumentMetadata:
|
||||
"""
|
||||
Extract text from PDF using PyPDF2.
|
||||
Create document metadata from PDF file and Docling result.
|
||||
|
||||
Args:
|
||||
file_path: Path to PDF file
|
||||
|
||||
Returns:
|
||||
Extracted text content
|
||||
|
||||
Raises:
|
||||
ExtractionError: If PDF extraction fails
|
||||
"""
|
||||
try:
|
||||
import PyPDF2
|
||||
|
||||
logger.debug(f"Reading PDF: {file_path}")
|
||||
text_parts = []
|
||||
|
||||
with open(file_path, 'rb') as pdf_file:
|
||||
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
||||
num_pages = len(pdf_reader.pages)
|
||||
logger.debug(f"PDF has {num_pages} pages")
|
||||
|
||||
for page_num, page in enumerate(pdf_reader.pages, start=1):
|
||||
page_text = self._extract_page_text(page, page_num)
|
||||
if page_text:
|
||||
text_parts.append(page_text)
|
||||
|
||||
return "\n\n".join(text_parts)
|
||||
|
||||
except ImportError:
|
||||
raise ExtractionError(
|
||||
message="PyPDF2 library not installed",
|
||||
details="Install with: pip install PyPDF2",
|
||||
file_path=str(file_path),
|
||||
)
|
||||
except Exception as e:
|
||||
raise ExtractionError(
|
||||
message=f"PDF extraction failed: {str(e)}",
|
||||
file_path=str(file_path),
|
||||
)
|
||||
|
||||
def _extract_page_text(self, page, page_num: int) -> str:
|
||||
"""
|
||||
Extract text from a single page.
|
||||
|
||||
Args:
|
||||
page: PyPDF2 page object
|
||||
page_num: Page number for logging
|
||||
|
||||
Returns:
|
||||
Extracted page text
|
||||
"""
|
||||
try:
|
||||
import PyPDF2
|
||||
|
||||
text = page.extract_text()
|
||||
logger.debug(f"Extracted page {page_num}")
|
||||
return text
|
||||
|
||||
except PyPDF2.errors.PdfReadError as e:
|
||||
logger.warning(f"Failed to extract page {page_num}: {str(e)}")
|
||||
return ""
|
||||
except Exception as e:
|
||||
logger.warning(f"Error on page {page_num}: {str(e)}")
|
||||
return ""
|
||||
|
||||
def _create_metadata(self, file_path: Path) -> DocumentMetadata:
|
||||
"""
|
||||
Create source-neutral document metadata from file.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file
|
||||
file_path: Path to the PDF file
|
||||
result: Docling conversion result
|
||||
|
||||
Returns:
|
||||
DocumentMetadata entity
|
||||
"""
|
||||
stat = file_path.stat()
|
||||
|
||||
# Extract page count from Docling result
|
||||
page_count = None
|
||||
try:
|
||||
if hasattr(result.document, 'pages'):
|
||||
page_count = len(result.document.pages)
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not extract page count: {str(e)}")
|
||||
|
||||
extra_metadata = {}
|
||||
if page_count is not None:
|
||||
extra_metadata['page_count'] = str(page_count)
|
||||
|
||||
return DocumentMetadata(
|
||||
source_id=str(file_path.absolute()),
|
||||
source_type=SourceType.FILE,
|
||||
display_name=file_path.name,
|
||||
size_bytes=stat.st_size,
|
||||
extra_metadata=extra_metadata,
|
||||
)
|
||||
|
||||
@ -15,6 +15,7 @@ from .adapters.outgoing.chunkers.context import ChunkingContext
|
||||
from .adapters.outgoing.chunkers.fixed_size_chunker import FixedSizeChunker
|
||||
from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
|
||||
from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
|
||||
from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor
|
||||
from .adapters.outgoing.extractors.factory import ExtractorFactory
|
||||
from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
|
||||
from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
|
||||
@ -118,6 +119,7 @@ class ApplicationContainer:
|
||||
# Register all extractors
|
||||
factory.register_extractor(PDFExtractor())
|
||||
factory.register_extractor(DocxExtractor())
|
||||
factory.register_extractor(ExcelExtractor())
|
||||
factory.register_extractor(TxtExtractor())
|
||||
factory.register_extractor(MarkdownExtractor())
|
||||
factory.register_extractor(ZipExtractor())
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user