use docling in extractors

2026-01-24 13:43:07 +03:30 · 2026-01-24 13:43:07 +03:30 · 2ccb38179d
commit 2ccb38179d
parent ad163eb665
5 changed files with 212 additions and 180 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -11,8 +11,7 @@ uvicorn[standard]==0.34.0
 python-multipart==0.0.20
 # Document Processing - Extractors
-PyPDF2==3.0.1          # PDF extraction
+docling                # Unified document extraction (PDF, DOCX, Excel)
 python-docx==1.1.2     # DOCX extraction
 # Cloud Storage
 boto3==1.35.94         # AWS S3 integration
--- a/src/adapters/outgoing/extractors/docx_extractor.py
+++ b/src/adapters/outgoing/extractors/docx_extractor.py
@ -1,13 +1,15 @@
 """
 DOCX Extractor - Concrete implementation for Word document extraction.
-This adapter implements the IExtractor port using python-docx library.
+This adapter implements the IExtractor port using Docling library.
-It maps python-docx exceptions to domain exceptions.
+It maps Docling exceptions to domain exceptions.
 """
 import logging
 from pathlib import Path
 from typing import List
 from docling.document_converter import DocumentConverter
 from ....core.domain.exceptions import (
    EmptyContentError,
    ExtractionError,
@ -21,22 +23,23 @@ logger = logging.getLogger(__name__)
 class DocxExtractor(IExtractor):
    """
-    Concrete DOCX extractor using python-docx.
+    Concrete DOCX extractor using Docling.
    This adapter:
-    1. Extracts text from DOCX files using python-docx
+    1. Extracts text from DOCX files using Docling's DocumentConverter
-    2. Handles paragraphs and tables
+    2. Converts DOCX to Markdown format
-    3. Maps exceptions to domain exceptions
+    3. Extracts metadata from document
    """
    def __init__(self) -> None:
-        """Initialize DOCX extractor."""
+        """Initialize DOCX extractor with Docling converter."""
        self._supported_extensions = ['docx']
-        logger.debug("DocxExtractor initialized")
+        self._converter = DocumentConverter()
        logger.debug("DocxExtractor initialized with Docling")
    def extract(self, file_path: Path) -> Document:
        """
-        Extract text and metadata from DOCX file.
+        Extract text and metadata from DOCX file using Docling.
        Args:
            file_path: Path to the DOCX file
@ -54,21 +57,22 @@ class DocxExtractor(IExtractor):
            # Validate file
            self._validate_file(file_path)
-            # Extract text
+            # Convert DOCX to markdown using Docling
-            text = self._extract_text_from_docx(file_path)
+            result = self._converter.convert(str(file_path))
            markdown_text = result.document.export_to_markdown()
            # Validate content
-            if not text or not text.strip():
+            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))
            # Create metadata
            metadata = self._create_metadata(file_path)
            # Build document with raw_markdown
-            document = Document(raw_markdown=text, metadata=metadata)
+            document = Document(raw_markdown=markdown_text, metadata=metadata)
            logger.info(
-                f"Successfully extracted {len(text)} characters from {file_path.name}"
+                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document
@ -130,83 +134,6 @@ class DocxExtractor(IExtractor):
        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))
    def _extract_text_from_docx(self, file_path: Path) -> str:
        """
        Extract text from DOCX using python-docx.
        Args:
            file_path: Path to DOCX file
        Returns:
            Extracted text content
        Raises:
            ExtractionError: If DOCX extraction fails
        """
        try:
            import docx
            logger.debug(f"Reading DOCX: {file_path}")
            document = docx.Document(file_path)
            # Extract paragraphs
            text_parts = self._extract_paragraphs(document)
            # Extract tables
            table_text = self._extract_tables(document)
            if table_text:
                text_parts.extend(table_text)
            return "\n".join(text_parts)
        except ImportError:
            raise ExtractionError(
                message="python-docx library not installed",
                details="Install with: pip install python-docx",
                file_path=str(file_path),
            )
        except Exception as e:
            raise ExtractionError(
                message=f"DOCX extraction failed: {str(e)}",
                file_path=str(file_path),
            )
    def _extract_paragraphs(self, document) -> List[str]:
        """
        Extract text from all paragraphs.
        Args:
            document: python-docx Document object
        Returns:
            List of paragraph texts
        """
        paragraphs = []
        for paragraph in document.paragraphs:
            text = paragraph.text.strip()
            if text:
                paragraphs.append(text)
        return paragraphs
    def _extract_tables(self, document) -> List[str]:
        """
        Extract text from all tables.
        Args:
            document: python-docx Document object
        Returns:
            List of table cell texts
        """
        table_texts = []
        for table in document.tables:
            for row in table.rows:
                for cell in row.cells:
                    text = cell.text.strip()
                    if text:
                        table_texts.append(text)
        return table_texts
    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
        """
        Create source-neutral document metadata from file.
--- a/src/adapters/outgoing/extractors/excel_extractor.py
+++ b/src/adapters/outgoing/extractors/excel_extractor.py
@ -0,0 +1,154 @@
 """
 Excel Extractor - Concrete implementation for Excel file extraction.
 This adapter implements the IExtractor port using Docling library.
 It maps Docling exceptions to domain exceptions.
 """
 import logging
 from pathlib import Path
 from typing import List
 from docling.document_converter import DocumentConverter
 from ....core.domain.exceptions import (
    EmptyContentError,
    ExtractionError,
 )
 from ....core.domain.models import Document, DocumentMetadata, SourceType
 from ....core.ports.outgoing.extractor import IExtractor
 logger = logging.getLogger(__name__)
 class ExcelExtractor(IExtractor):
    """
    Concrete Excel extractor using Docling.

    This adapter:
    1. Converts Excel files to Markdown with Docling's DocumentConverter
    2. Wraps the Markdown in a Document entity with file-based metadata
    3. Maps unexpected failures to the domain's ExtractionError
    """

    def __init__(self) -> None:
        """Initialize Excel extractor with Docling converter."""
        # NOTE(review): Docling documents Excel support for the Office Open
        # XML format (.xlsx); confirm that legacy .xls files actually convert.
        # If they do not, extract() reports them as an ExtractionError.
        self._supported_extensions = ['xlsx', 'xls']
        self._converter = DocumentConverter()
        logger.debug("ExcelExtractor initialized with Docling")

    def extract(self, file_path: Path) -> Document:
        """
        Extract text and metadata from Excel file using Docling.

        Args:
            file_path: Path to the Excel file

        Returns:
            Document entity with extracted content and metadata

        Raises:
            ExtractionError: If the file is invalid or conversion fails
            EmptyContentError: If the file is empty or yields no text
        """
        try:
            logger.info(f"Extracting text from Excel: {file_path}")

            # Fail fast on missing, non-file, or zero-byte paths
            self._validate_file(file_path)

            # Convert Excel to markdown using Docling
            result = self._converter.convert(str(file_path))
            markdown_text = result.document.export_to_markdown()

            # Whitespace-only output counts as empty content
            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))

            metadata = self._create_metadata(file_path)
            document = Document(raw_markdown=markdown_text, metadata=metadata)

            logger.info(
                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document
        except (EmptyContentError, ExtractionError):
            # Domain exceptions already carry the right context; pass through
            raise
        except Exception as e:
            logger.error(f"Excel extraction failed for {file_path}: {str(e)}")
            # Chain the original error so its traceback is preserved as the
            # explicit cause of the domain exception
            raise ExtractionError(
                message=f"Failed to extract text from {file_path.name}",
                details=str(e),
                file_path=str(file_path),
            ) from e

    def supports_file_type(self, file_extension: str) -> bool:
        """
        Check if this extractor supports the given file extension.

        The comparison is case-insensitive.

        Args:
            file_extension: File extension without a dot (e.g., 'xlsx', 'xls')

        Returns:
            True if the extension is one of the supported Excel extensions
        """
        return file_extension.lower() in self._supported_extensions

    def get_supported_types(self) -> List[str]:
        """
        Get list of supported file extensions.

        Returns:
            A copy of the supported extensions list ('xlsx' and 'xls'), so
            callers cannot mutate the extractor's internal state
        """
        return self._supported_extensions.copy()

    def _validate_file(self, file_path: Path) -> None:
        """
        Validate that the path exists, is a regular file, and is not empty.

        Args:
            file_path: Path to validate

        Raises:
            ExtractionError: If the path does not exist or is not a file
            EmptyContentError: If the file is zero bytes long
        """
        if not file_path.exists():
            raise ExtractionError(
                message=f"File not found: {file_path}",
                file_path=str(file_path),
            )
        if not file_path.is_file():
            raise ExtractionError(
                message=f"Path is not a file: {file_path}",
                file_path=str(file_path),
            )
        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))

    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
        """
        Create document metadata from Excel file.

        Only file-system facts are recorded: absolute path, file name and
        size in bytes.

        Args:
            file_path: Path to the Excel file

        Returns:
            DocumentMetadata entity
        """
        stat = file_path.stat()
        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            display_name=file_path.name,
            size_bytes=stat.st_size,
        )
--- a/src/adapters/outgoing/extractors/pdf_extractor.py
+++ b/src/adapters/outgoing/extractors/pdf_extractor.py
@ -1,13 +1,15 @@
 """
 PDF Extractor - Concrete implementation for PDF text extraction.
-This adapter implements the IExtractor port using PyPDF2 library.
+This adapter implements the IExtractor port using Docling library.
-It maps PyPDF2 exceptions to domain exceptions.
+It maps Docling exceptions to domain exceptions.
 """
 import logging
 from pathlib import Path
 from typing import List
 from docling.document_converter import DocumentConverter
 from ....core.domain.exceptions import (
    EmptyContentError,
    ExtractionError,
@ -21,22 +23,23 @@ logger = logging.getLogger(__name__)
 class PDFExtractor(IExtractor):
    """
-    Concrete PDF extractor using PyPDF2.
+    Concrete PDF extractor using Docling.
    This adapter:
-    1. Extracts text from PDF files using PyPDF2
+    1. Extracts text from PDF files using Docling's DocumentConverter
-    2. Maps PyPDF2 exceptions to domain exceptions
+    2. Converts PDF to Markdown format
-    3. Creates Document entities with metadata
+    3. Extracts metadata including page count
    """
    def __init__(self) -> None:
-        """Initialize PDF extractor."""
+        """Initialize PDF extractor with Docling converter."""
        self._supported_extensions = ['pdf']
-        logger.debug("PDFExtractor initialized")
+        self._converter = DocumentConverter()
        logger.debug("PDFExtractor initialized with Docling")
    def extract(self, file_path: Path) -> Document:
        """
-        Extract text and metadata from PDF file.
+        Extract text and metadata from PDF file using Docling.
        Args:
            file_path: Path to the PDF file
@ -54,21 +57,22 @@ class PDFExtractor(IExtractor):
            # Validate file
            self._validate_file(file_path)
-            # Extract text
+            # Convert PDF to markdown using Docling
-            text = self._extract_text_from_pdf(file_path)
+            result = self._converter.convert(str(file_path))
            markdown_text = result.document.export_to_markdown()
            # Validate content
-            if not text or not text.strip():
+            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))
-            # Create metadata
+            # Create metadata with page count from Docling result
-            metadata = self._create_metadata(file_path)
+            metadata = self._create_metadata(file_path, result)
            # Build document with raw_markdown
-            document = Document(raw_markdown=text, metadata=metadata)
+            document = Document(raw_markdown=markdown_text, metadata=metadata)
            logger.info(
-                f"Successfully extracted {len(text)} characters from {file_path.name}"
+                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document
@ -130,89 +134,35 @@ class PDFExtractor(IExtractor):
        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))
-    def _extract_text_from_pdf(self, file_path: Path) -> str:
+    def _create_metadata(self, file_path: Path, result) -> DocumentMetadata:
        """
-        Extract text from PDF using PyPDF2.
+        Create document metadata from PDF file and Docling result.
        Args:
-            file_path: Path to PDF file
+            file_path: Path to the PDF file
-
+            result: Docling conversion result
        Returns:
            Extracted text content
        Raises:
            ExtractionError: If PDF extraction fails
        """
        try:
            import PyPDF2
            logger.debug(f"Reading PDF: {file_path}")
            text_parts = []
            with open(file_path, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                num_pages = len(pdf_reader.pages)
                logger.debug(f"PDF has {num_pages} pages")
                for page_num, page in enumerate(pdf_reader.pages, start=1):
                    page_text = self._extract_page_text(page, page_num)
                    if page_text:
                        text_parts.append(page_text)
            return "\n\n".join(text_parts)
        except ImportError:
            raise ExtractionError(
                message="PyPDF2 library not installed",
                details="Install with: pip install PyPDF2",
                file_path=str(file_path),
            )
        except Exception as e:
            raise ExtractionError(
                message=f"PDF extraction failed: {str(e)}",
                file_path=str(file_path),
            )
    def _extract_page_text(self, page, page_num: int) -> str:
        """
        Extract text from a single page.
        Args:
            page: PyPDF2 page object
            page_num: Page number for logging
        Returns:
            Extracted page text
        """
        try:
            import PyPDF2
            text = page.extract_text()
            logger.debug(f"Extracted page {page_num}")
            return text
        except PyPDF2.errors.PdfReadError as e:
            logger.warning(f"Failed to extract page {page_num}: {str(e)}")
            return ""
        except Exception as e:
            logger.warning(f"Error on page {page_num}: {str(e)}")
            return ""
    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
        """
        Create source-neutral document metadata from file.
        Args:
            file_path: Path to the file
        Returns:
            DocumentMetadata entity
        """
        stat = file_path.stat()
        # Extract page count from Docling result
        page_count = None
        try:
            if hasattr(result.document, 'pages'):
                page_count = len(result.document.pages)
        except Exception as e:
            logger.warning(f"Could not extract page count: {str(e)}")
        extra_metadata = {}
        if page_count is not None:
            extra_metadata['page_count'] = str(page_count)
        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            display_name=file_path.name,
            size_bytes=stat.st_size,
            extra_metadata=extra_metadata,
        )
--- a/src/bootstrap.py
+++ b/src/bootstrap.py
@ -15,6 +15,7 @@ from .adapters.outgoing.chunkers.context import ChunkingContext
 from .adapters.outgoing.chunkers.fixed_size_chunker import FixedSizeChunker
 from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
 from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
 from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor
 from .adapters.outgoing.extractors.factory import ExtractorFactory
 from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
 from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
@ -118,6 +119,7 @@ class ApplicationContainer:
        # Register all extractors
        factory.register_extractor(PDFExtractor())
        factory.register_extractor(DocxExtractor())
        factory.register_extractor(ExcelExtractor())
        factory.register_extractor(TxtExtractor())
        factory.register_extractor(MarkdownExtractor())
        factory.register_extractor(ZipExtractor())