From 2ccb38179d2f7828ae5e9782cdb2070ecc3ca935 Mon Sep 17 00:00:00 2001 From: "m.dabbagh" Date: Sat, 24 Jan 2026 13:43:07 +0330 Subject: [PATCH] use docling in extractors --- requirements.txt | 3 +- .../outgoing/extractors/docx_extractor.py | 109 ++----------- .../outgoing/extractors/excel_extractor.py | 154 ++++++++++++++++++ .../outgoing/extractors/pdf_extractor.py | 124 +++++--------- src/bootstrap.py | 2 + 5 files changed, 212 insertions(+), 180 deletions(-) create mode 100644 src/adapters/outgoing/extractors/excel_extractor.py diff --git a/requirements.txt b/requirements.txt index 8645bb6..0b64d7f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,8 +11,7 @@ uvicorn[standard]==0.34.0 python-multipart==0.0.20 # Document Processing - Extractors -PyPDF2==3.0.1 # PDF extraction -python-docx==1.1.2 # DOCX extraction +docling # Unified document extraction (PDF, DOCX, Excel) # Cloud Storage boto3==1.35.94 # AWS S3 integration diff --git a/src/adapters/outgoing/extractors/docx_extractor.py b/src/adapters/outgoing/extractors/docx_extractor.py index fe36772..5efe5c6 100644 --- a/src/adapters/outgoing/extractors/docx_extractor.py +++ b/src/adapters/outgoing/extractors/docx_extractor.py @@ -1,13 +1,15 @@ """ DOCX Extractor - Concrete implementation for Word document extraction. -This adapter implements the IExtractor port using python-docx library. -It maps python-docx exceptions to domain exceptions. +This adapter implements the IExtractor port using Docling library. +It maps Docling exceptions to domain exceptions. """ import logging from pathlib import Path from typing import List +from docling.document_converter import DocumentConverter + from ....core.domain.exceptions import ( EmptyContentError, ExtractionError, @@ -21,22 +23,23 @@ logger = logging.getLogger(__name__) class DocxExtractor(IExtractor): """ - Concrete DOCX extractor using python-docx. + Concrete DOCX extractor using Docling. This adapter: - 1. 
Extracts text from DOCX files using python-docx - 2. Handles paragraphs and tables - 3. Maps exceptions to domain exceptions + 1. Extracts text from DOCX files using Docling's DocumentConverter + 2. Converts DOCX to Markdown format + 3. Creates source-neutral metadata from the file """ def __init__(self) -> None: - """Initialize DOCX extractor.""" + """Initialize DOCX extractor with Docling converter.""" self._supported_extensions = ['docx'] - logger.debug("DocxExtractor initialized") + self._converter = DocumentConverter() + logger.debug("DocxExtractor initialized with Docling") def extract(self, file_path: Path) -> Document: """ - Extract text and metadata from DOCX file. + Extract text and metadata from DOCX file using Docling. Args: file_path: Path to the DOCX file @@ -54,21 +57,22 @@ class DocxExtractor(IExtractor): # Validate file self._validate_file(file_path) - # Extract text - text = self._extract_text_from_docx(file_path) + # Convert DOCX to markdown using Docling + result = self._converter.convert(str(file_path)) + markdown_text = result.document.export_to_markdown() # Validate content - if not text or not text.strip(): + if not markdown_text or not markdown_text.strip(): raise EmptyContentError(file_path=str(file_path)) # Create metadata metadata = self._create_metadata(file_path) # Build document with raw_markdown - document = Document(raw_markdown=text, metadata=metadata) + document = Document(raw_markdown=markdown_text, metadata=metadata) logger.info( - f"Successfully extracted {len(text)} characters from {file_path.name}" + f"Successfully extracted {len(markdown_text)} characters from {file_path.name}" ) return document @@ -130,83 +134,6 @@ class DocxExtractor(IExtractor): if file_path.stat().st_size == 0: raise EmptyContentError(file_path=str(file_path)) - def _extract_text_from_docx(self, file_path: Path) -> str: - """ - Extract text from DOCX using python-docx. 
- - Args: - file_path: Path to DOCX file - - Returns: - Extracted text content - - Raises: - ExtractionError: If DOCX extraction fails - """ - try: - import docx - - logger.debug(f"Reading DOCX: {file_path}") - document = docx.Document(file_path) - - # Extract paragraphs - text_parts = self._extract_paragraphs(document) - - # Extract tables - table_text = self._extract_tables(document) - if table_text: - text_parts.extend(table_text) - - return "\n".join(text_parts) - - except ImportError: - raise ExtractionError( - message="python-docx library not installed", - details="Install with: pip install python-docx", - file_path=str(file_path), - ) - except Exception as e: - raise ExtractionError( - message=f"DOCX extraction failed: {str(e)}", - file_path=str(file_path), - ) - - def _extract_paragraphs(self, document) -> List[str]: - """ - Extract text from all paragraphs. - - Args: - document: python-docx Document object - - Returns: - List of paragraph texts - """ - paragraphs = [] - for paragraph in document.paragraphs: - text = paragraph.text.strip() - if text: - paragraphs.append(text) - return paragraphs - - def _extract_tables(self, document) -> List[str]: - """ - Extract text from all tables. - - Args: - document: python-docx Document object - - Returns: - List of table cell texts - """ - table_texts = [] - for table in document.tables: - for row in table.rows: - for cell in row.cells: - text = cell.text.strip() - if text: - table_texts.append(text) - return table_texts - def _create_metadata(self, file_path: Path) -> DocumentMetadata: """ Create source-neutral document metadata from file. diff --git a/src/adapters/outgoing/extractors/excel_extractor.py b/src/adapters/outgoing/extractors/excel_extractor.py new file mode 100644 index 0000000..7faa92e --- /dev/null +++ b/src/adapters/outgoing/extractors/excel_extractor.py @@ -0,0 +1,154 @@ +""" +Excel Extractor - Concrete implementation for Excel file extraction. 
+ +This adapter implements the IExtractor port using Docling library. +It maps Docling exceptions to domain exceptions. +""" +import logging +from pathlib import Path +from typing import List + +from docling.document_converter import DocumentConverter + +from ....core.domain.exceptions import ( + EmptyContentError, + ExtractionError, +) +from ....core.domain.models import Document, DocumentMetadata, SourceType +from ....core.ports.outgoing.extractor import IExtractor + + +logger = logging.getLogger(__name__) + + +class ExcelExtractor(IExtractor): + """ + Concrete Excel extractor using Docling. + + This adapter: + 1. Extracts text from Excel files (.xlsx, .xls) using Docling's DocumentConverter + 2. Converts Excel to Markdown format + 3. Builds file-level metadata (path, name, size) + """ + + def __init__(self) -> None: + """Initialize Excel extractor with Docling converter.""" + self._supported_extensions = ['xlsx', 'xls'] + self._converter = DocumentConverter() + logger.debug("ExcelExtractor initialized with Docling") + + def extract(self, file_path: Path) -> Document: + """ + Extract text and metadata from Excel file using Docling. 
+ + Args: + file_path: Path to the Excel file + + Returns: + Document entity with extracted content and metadata + + Raises: + ExtractionError: If extraction fails + EmptyContentError: If no text could be extracted + """ + try: + logger.info(f"Extracting text from Excel: {file_path}") + + # Validate file + self._validate_file(file_path) + + # Convert Excel to markdown using Docling + result = self._converter.convert(str(file_path)) + markdown_text = result.document.export_to_markdown() + + # Validate content + if not markdown_text or not markdown_text.strip(): + raise EmptyContentError(file_path=str(file_path)) + + # Create metadata + metadata = self._create_metadata(file_path) + + # Build document with raw_markdown + document = Document(raw_markdown=markdown_text, metadata=metadata) + + logger.info( + f"Successfully extracted {len(markdown_text)} characters from {file_path.name}" + ) + return document + + except EmptyContentError: + raise + except ExtractionError: + raise + except Exception as e: + logger.error(f"Excel extraction failed for {file_path}: {str(e)}") + raise ExtractionError( + message=f"Failed to extract text from {file_path.name}", + details=str(e), + file_path=str(file_path), + ) + + def supports_file_type(self, file_extension: str) -> bool: + """ + Check if this extractor supports Excel files. + + Args: + file_extension: File extension (e.g., 'xlsx', 'xls') + + Returns: + True if Excel files are supported + """ + return file_extension.lower() in self._supported_extensions + + def get_supported_types(self) -> List[str]: + """ + Get list of supported file extensions. + + Returns: + List containing 'xlsx' and 'xls' + """ + return self._supported_extensions.copy() + + def _validate_file(self, file_path: Path) -> None: + """ + Validate file exists and is readable. 
+ + Args: + file_path: Path to validate + + Raises: + ExtractionError: If the path is missing or not a file (an empty file raises EmptyContentError) + """ + if not file_path.exists(): + raise ExtractionError( + message=f"File not found: {file_path}", + file_path=str(file_path), + ) + + if not file_path.is_file(): + raise ExtractionError( + message=f"Path is not a file: {file_path}", + file_path=str(file_path), + ) + + if file_path.stat().st_size == 0: + raise EmptyContentError(file_path=str(file_path)) + + def _create_metadata(self, file_path: Path) -> DocumentMetadata: + """ + Create document metadata from Excel file. + + Args: + file_path: Path to the Excel file + + Returns: + DocumentMetadata entity + """ + stat = file_path.stat() + + return DocumentMetadata( + source_id=str(file_path.absolute()), + source_type=SourceType.FILE, + display_name=file_path.name, + size_bytes=stat.st_size, + ) diff --git a/src/adapters/outgoing/extractors/pdf_extractor.py b/src/adapters/outgoing/extractors/pdf_extractor.py index 8ee0090..72a5bc1 100644 --- a/src/adapters/outgoing/extractors/pdf_extractor.py +++ b/src/adapters/outgoing/extractors/pdf_extractor.py @@ -1,13 +1,15 @@ """ PDF Extractor - Concrete implementation for PDF text extraction. -This adapter implements the IExtractor port using PyPDF2 library. -It maps PyPDF2 exceptions to domain exceptions. +This adapter implements the IExtractor port using Docling library. +It maps Docling exceptions to domain exceptions. """ import logging from pathlib import Path from typing import List +from docling.document_converter import DocumentConverter + from ....core.domain.exceptions import ( EmptyContentError, ExtractionError, @@ -21,22 +23,23 @@ logger = logging.getLogger(__name__) class PDFExtractor(IExtractor): """ - Concrete PDF extractor using PyPDF2. + Concrete PDF extractor using Docling. This adapter: - 1. Extracts text from PDF files using PyPDF2 - 2. Maps PyPDF2 exceptions to domain exceptions - 3. Creates Document entities with metadata + 1. 
Extracts text from PDF files using Docling's DocumentConverter + 2. Converts PDF to Markdown format + 3. Extracts metadata including page count """ def __init__(self) -> None: - """Initialize PDF extractor.""" + """Initialize PDF extractor with Docling converter.""" self._supported_extensions = ['pdf'] - logger.debug("PDFExtractor initialized") + self._converter = DocumentConverter() + logger.debug("PDFExtractor initialized with Docling") def extract(self, file_path: Path) -> Document: """ - Extract text and metadata from PDF file. + Extract text and metadata from PDF file using Docling. Args: file_path: Path to the PDF file @@ -54,21 +57,22 @@ class PDFExtractor(IExtractor): # Validate file self._validate_file(file_path) - # Extract text - text = self._extract_text_from_pdf(file_path) + # Convert PDF to markdown using Docling + result = self._converter.convert(str(file_path)) + markdown_text = result.document.export_to_markdown() # Validate content - if not text or not text.strip(): + if not markdown_text or not markdown_text.strip(): raise EmptyContentError(file_path=str(file_path)) - # Create metadata - metadata = self._create_metadata(file_path) + # Create metadata with page count from Docling result + metadata = self._create_metadata(file_path, result) # Build document with raw_markdown - document = Document(raw_markdown=text, metadata=metadata) + document = Document(raw_markdown=markdown_text, metadata=metadata) logger.info( - f"Successfully extracted {len(text)} characters from {file_path.name}" + f"Successfully extracted {len(markdown_text)} characters from {file_path.name}" ) return document @@ -130,89 +134,35 @@ class PDFExtractor(IExtractor): if file_path.stat().st_size == 0: raise EmptyContentError(file_path=str(file_path)) - def _extract_text_from_pdf(self, file_path: Path) -> str: + def _create_metadata(self, file_path: Path, result) -> DocumentMetadata: """ - Extract text from PDF using PyPDF2. 
+ Create document metadata from PDF file and Docling result. Args: - file_path: Path to PDF file - - Returns: - Extracted text content - - Raises: - ExtractionError: If PDF extraction fails - """ - try: - import PyPDF2 - - logger.debug(f"Reading PDF: {file_path}") - text_parts = [] - - with open(file_path, 'rb') as pdf_file: - pdf_reader = PyPDF2.PdfReader(pdf_file) - num_pages = len(pdf_reader.pages) - logger.debug(f"PDF has {num_pages} pages") - - for page_num, page in enumerate(pdf_reader.pages, start=1): - page_text = self._extract_page_text(page, page_num) - if page_text: - text_parts.append(page_text) - - return "\n\n".join(text_parts) - - except ImportError: - raise ExtractionError( - message="PyPDF2 library not installed", - details="Install with: pip install PyPDF2", - file_path=str(file_path), - ) - except Exception as e: - raise ExtractionError( - message=f"PDF extraction failed: {str(e)}", - file_path=str(file_path), - ) - - def _extract_page_text(self, page, page_num: int) -> str: - """ - Extract text from a single page. - - Args: - page: PyPDF2 page object - page_num: Page number for logging - - Returns: - Extracted page text - """ - try: - import PyPDF2 - - text = page.extract_text() - logger.debug(f"Extracted page {page_num}") - return text - - except PyPDF2.errors.PdfReadError as e: - logger.warning(f"Failed to extract page {page_num}: {str(e)}") - return "" - except Exception as e: - logger.warning(f"Error on page {page_num}: {str(e)}") - return "" - - def _create_metadata(self, file_path: Path) -> DocumentMetadata: - """ - Create source-neutral document metadata from file. 
- - Args: - file_path: Path to the file + file_path: Path to the PDF file + result: Docling conversion result Returns: DocumentMetadata entity """ stat = file_path.stat() + # Extract page count from Docling result + page_count = None + try: + if hasattr(result.document, 'pages'): + page_count = len(result.document.pages) + except Exception as e: + logger.warning(f"Could not extract page count: {str(e)}") + + extra_metadata = {} + if page_count is not None: + extra_metadata['page_count'] = str(page_count) + return DocumentMetadata( source_id=str(file_path.absolute()), source_type=SourceType.FILE, display_name=file_path.name, size_bytes=stat.st_size, + extra_metadata=extra_metadata, ) diff --git a/src/bootstrap.py b/src/bootstrap.py index b87c886..e752bb9 100644 --- a/src/bootstrap.py +++ b/src/bootstrap.py @@ -15,6 +15,7 @@ from .adapters.outgoing.chunkers.context import ChunkingContext from .adapters.outgoing.chunkers.fixed_size_chunker import FixedSizeChunker from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker from .adapters.outgoing.extractors.docx_extractor import DocxExtractor +from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor from .adapters.outgoing.extractors.factory import ExtractorFactory from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor @@ -118,6 +119,7 @@ class ApplicationContainer: # Register all extractors factory.register_extractor(PDFExtractor()) factory.register_extractor(DocxExtractor()) + factory.register_extractor(ExcelExtractor()) factory.register_extractor(TxtExtractor()) factory.register_extractor(MarkdownExtractor()) factory.register_extractor(ZipExtractor())