use docling in extractors

2026-01-24 13:43:07 +03:30 · 2026-01-24 13:43:07 +03:30 · 2ccb38179d
commit 2ccb38179d
parent ad163eb665
5 changed files with 212 additions and 180 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -11,8 +11,7 @@ uvicorn[standard]==0.34.0
 python-multipart==0.0.20

 # Document Processing - Extractors
-PyPDF2==3.0.1          # PDF extraction
-python-docx==1.1.2     # DOCX extraction
+docling                # Unified document extraction (PDF, DOCX, Excel)

 # Cloud Storage
 boto3==1.35.94         # AWS S3 integration
--- a/src/adapters/outgoing/extractors/docx_extractor.py
+++ b/src/adapters/outgoing/extractors/docx_extractor.py
@ -1,13 +1,15 @@
 """
 DOCX Extractor - Concrete implementation for Word document extraction.

-This adapter implements the IExtractor port using python-docx library.
-It maps python-docx exceptions to domain exceptions.
+This adapter implements the IExtractor port using Docling library.
+It maps Docling exceptions to domain exceptions.
 """
 import logging
 from pathlib import Path
 from typing import List

+from docling.document_converter import DocumentConverter
+
 from ....core.domain.exceptions import (
    EmptyContentError,
    ExtractionError,
@ -21,22 +23,23 @@ logger = logging.getLogger(__name__)

 class DocxExtractor(IExtractor):
    """
-    Concrete DOCX extractor using python-docx.
+    Concrete DOCX extractor using Docling.

    This adapter:
-    1. Extracts text from DOCX files using python-docx
-    2. Handles paragraphs and tables
-    3. Maps exceptions to domain exceptions
+    1. Extracts text from DOCX files using Docling's DocumentConverter
+    2. Converts DOCX to Markdown format
+    3. Extracts metadata from document
    """

    def __init__(self) -> None:
-        """Initialize DOCX extractor."""
+        """Initialize DOCX extractor with Docling converter."""
        self._supported_extensions = ['docx']
-        logger.debug("DocxExtractor initialized")
+        self._converter = DocumentConverter()
+        logger.debug("DocxExtractor initialized with Docling")

    def extract(self, file_path: Path) -> Document:
        """
-        Extract text and metadata from DOCX file.
+        Extract text and metadata from DOCX file using Docling.

        Args:
            file_path: Path to the DOCX file
@ -54,21 +57,22 @@ class DocxExtractor(IExtractor):
            # Validate file
            self._validate_file(file_path)

-            # Extract text
-            text = self._extract_text_from_docx(file_path)
+            # Convert DOCX to markdown using Docling
+            result = self._converter.convert(str(file_path))
+            markdown_text = result.document.export_to_markdown()

            # Validate content
-            if not text or not text.strip():
+            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))

            # Create metadata
            metadata = self._create_metadata(file_path)

            # Build document with raw_markdown
-            document = Document(raw_markdown=text, metadata=metadata)
+            document = Document(raw_markdown=markdown_text, metadata=metadata)

            logger.info(
-                f"Successfully extracted {len(text)} characters from {file_path.name}"
+                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document

@ -130,83 +134,6 @@ class DocxExtractor(IExtractor):
        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))

-    def _extract_text_from_docx(self, file_path: Path) -> str:
-        """
-        Extract text from DOCX using python-docx.
-
-        Args:
-            file_path: Path to DOCX file
-
-        Returns:
-            Extracted text content
-
-        Raises:
-            ExtractionError: If DOCX extraction fails
-        """
-        try:
-            import docx
-
-            logger.debug(f"Reading DOCX: {file_path}")
-            document = docx.Document(file_path)
-
-            # Extract paragraphs
-            text_parts = self._extract_paragraphs(document)
-
-            # Extract tables
-            table_text = self._extract_tables(document)
-            if table_text:
-                text_parts.extend(table_text)
-
-            return "\n".join(text_parts)
-
-        except ImportError:
-            raise ExtractionError(
-                message="python-docx library not installed",
-                details="Install with: pip install python-docx",
-                file_path=str(file_path),
-            )
-        except Exception as e:
-            raise ExtractionError(
-                message=f"DOCX extraction failed: {str(e)}",
-                file_path=str(file_path),
-            )
-
-    def _extract_paragraphs(self, document) -> List[str]:
-        """
-        Extract text from all paragraphs.
-
-        Args:
-            document: python-docx Document object
-
-        Returns:
-            List of paragraph texts
-        """
-        paragraphs = []
-        for paragraph in document.paragraphs:
-            text = paragraph.text.strip()
-            if text:
-                paragraphs.append(text)
-        return paragraphs
-
-    def _extract_tables(self, document) -> List[str]:
-        """
-        Extract text from all tables.
-
-        Args:
-            document: python-docx Document object
-
-        Returns:
-            List of table cell texts
-        """
-        table_texts = []
-        for table in document.tables:
-            for row in table.rows:
-                for cell in row.cells:
-                    text = cell.text.strip()
-                    if text:
-                        table_texts.append(text)
-        return table_texts
-
    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
        """
        Create source-neutral document metadata from file.
--- a/src/adapters/outgoing/extractors/excel_extractor.py
+++ b/src/adapters/outgoing/extractors/excel_extractor.py
@ -0,0 +1,154 @@
+"""
+Excel Extractor - Concrete implementation for Excel file extraction.
+
+This adapter implements the IExtractor port using Docling library.
+It maps Docling exceptions to domain exceptions.
+"""
+import logging
+from pathlib import Path
+from typing import List
+
+from docling.document_converter import DocumentConverter
+
+from ....core.domain.exceptions import (
+    EmptyContentError,
+    ExtractionError,
+)
+from ....core.domain.models import Document, DocumentMetadata, SourceType
+from ....core.ports.outgoing.extractor import IExtractor
+
+
+logger = logging.getLogger(__name__)
+
+
+class ExcelExtractor(IExtractor):
+    """
+    Concrete Excel extractor using Docling.
+
+    This adapter:
+    1. Extracts text from Excel files (.xlsx, .xls) using Docling's DocumentConverter
+    2. Converts Excel to Markdown format
+    3. Builds file-level metadata (path, name, size) from the filesystem
+    """
+
+    def __init__(self) -> None:
+        """Initialize Excel extractor with Docling converter."""
+        self._supported_extensions = ['xlsx', 'xls']  # NOTE(review): confirm Docling accepts legacy .xls
+        self._converter = DocumentConverter()
+        logger.debug("ExcelExtractor initialized with Docling")
+
+    def extract(self, file_path: Path) -> Document:
+        """
+        Extract text and metadata from Excel file using Docling.
+
+        Args:
+            file_path: Path to the Excel file
+
+        Returns:
+            Document entity with extracted content and metadata
+
+        Raises:
+            ExtractionError: If validation or conversion fails
+            EmptyContentError: If the file is empty or no text could be extracted
+        """
+        try:
+            logger.info(f"Extracting text from Excel: {file_path}")
+
+            # Validate file
+            self._validate_file(file_path)
+
+            # Convert Excel to markdown using Docling
+            result = self._converter.convert(str(file_path))
+            markdown_text = result.document.export_to_markdown()
+
+            # Validate content
+            if not markdown_text or not markdown_text.strip():
+                raise EmptyContentError(file_path=str(file_path))
+
+            # Create metadata
+            metadata = self._create_metadata(file_path)
+
+            # Build document with raw_markdown
+            document = Document(raw_markdown=markdown_text, metadata=metadata)
+
+            logger.info(
+                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
+            )
+            return document
+
+        except (EmptyContentError, ExtractionError):
+            # Domain errors are already in their final form; pass them through.
+            raise
+        except Exception as e:
+            logger.error(f"Excel extraction failed for {file_path}: {str(e)}")
+            raise ExtractionError(
+                message=f"Failed to extract text from {file_path.name}",
+                details=str(e),
+                file_path=str(file_path),
+            ) from e  # keep the underlying failure attached as __cause__
+
+    def supports_file_type(self, file_extension: str) -> bool:
+        """
+        Check if this extractor supports Excel files.
+
+        Args:
+            file_extension: File extension (e.g., 'xlsx', 'xls')
+
+        Returns:
+            True if the lower-cased extension is one of the supported types
+        """
+        return file_extension.lower() in self._supported_extensions
+
+    def get_supported_types(self) -> List[str]:
+        """
+        Get list of supported file extensions.
+
+        Returns:
+            A copy of the list containing 'xlsx' and 'xls'
+        """
+        return self._supported_extensions.copy()
+
+    def _validate_file(self, file_path: Path) -> None:
+        """
+        Validate that the path exists, is a file, and is not empty.
+
+        Args:
+            file_path: Path to validate
+
+        Raises:
+            ExtractionError: If the path does not exist or is not a file
+            EmptyContentError: If the file size is zero bytes
+        """
+        if not file_path.exists():
+            raise ExtractionError(
+                message=f"File not found: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if not file_path.is_file():
+            raise ExtractionError(
+                message=f"Path is not a file: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if file_path.stat().st_size == 0:
+            raise EmptyContentError(file_path=str(file_path))
+
+    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
+        """
+        Create document metadata from the file's path and filesystem stat.
+
+        Args:
+            file_path: Path to the Excel file
+
+        Returns:
+            DocumentMetadata entity
+        """
+        stat = file_path.stat()
+
+        return DocumentMetadata(
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            display_name=file_path.name,
+            size_bytes=stat.st_size,
+        )
--- a/src/adapters/outgoing/extractors/pdf_extractor.py
+++ b/src/adapters/outgoing/extractors/pdf_extractor.py
@ -1,13 +1,15 @@
 """
 PDF Extractor - Concrete implementation for PDF text extraction.

-This adapter implements the IExtractor port using PyPDF2 library.
-It maps PyPDF2 exceptions to domain exceptions.
+This adapter implements the IExtractor port using Docling library.
+It maps Docling exceptions to domain exceptions.
 """
 import logging
 from pathlib import Path
 from typing import List

+from docling.document_converter import DocumentConverter
+
 from ....core.domain.exceptions import (
    EmptyContentError,
    ExtractionError,
@ -21,22 +23,23 @@ logger = logging.getLogger(__name__)

 class PDFExtractor(IExtractor):
    """
-    Concrete PDF extractor using PyPDF2.
+    Concrete PDF extractor using Docling.

    This adapter:
-    1. Extracts text from PDF files using PyPDF2
-    2. Maps PyPDF2 exceptions to domain exceptions
-    3. Creates Document entities with metadata
+    1. Extracts text from PDF files using Docling's DocumentConverter
+    2. Converts PDF to Markdown format
+    3. Extracts metadata including page count
    """

    def __init__(self) -> None:
-        """Initialize PDF extractor."""
+        """Initialize PDF extractor with Docling converter."""
        self._supported_extensions = ['pdf']
-        logger.debug("PDFExtractor initialized")
+        self._converter = DocumentConverter()
+        logger.debug("PDFExtractor initialized with Docling")

    def extract(self, file_path: Path) -> Document:
        """
-        Extract text and metadata from PDF file.
+        Extract text and metadata from PDF file using Docling.

        Args:
            file_path: Path to the PDF file
@ -54,21 +57,22 @@ class PDFExtractor(IExtractor):
            # Validate file
            self._validate_file(file_path)

-            # Extract text
-            text = self._extract_text_from_pdf(file_path)
+            # Convert PDF to markdown using Docling
+            result = self._converter.convert(str(file_path))
+            markdown_text = result.document.export_to_markdown()

            # Validate content
-            if not text or not text.strip():
+            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))

-            # Create metadata
-            metadata = self._create_metadata(file_path)
+            # Create metadata with page count from Docling result
+            metadata = self._create_metadata(file_path, result)

            # Build document with raw_markdown
-            document = Document(raw_markdown=text, metadata=metadata)
+            document = Document(raw_markdown=markdown_text, metadata=metadata)

            logger.info(
-                f"Successfully extracted {len(text)} characters from {file_path.name}"
+                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document

@ -130,89 +134,35 @@ class PDFExtractor(IExtractor):
        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))

-    def _extract_text_from_pdf(self, file_path: Path) -> str:
+    def _create_metadata(self, file_path: Path, result) -> DocumentMetadata:
        """
-        Extract text from PDF using PyPDF2.
+        Create document metadata from PDF file and Docling result.

        Args:
-            file_path: Path to PDF file
-
-        Returns:
-            Extracted text content
-
-        Raises:
-            ExtractionError: If PDF extraction fails
-        """
-        try:
-            import PyPDF2
-
-            logger.debug(f"Reading PDF: {file_path}")
-            text_parts = []
-
-            with open(file_path, 'rb') as pdf_file:
-                pdf_reader = PyPDF2.PdfReader(pdf_file)
-                num_pages = len(pdf_reader.pages)
-                logger.debug(f"PDF has {num_pages} pages")
-
-                for page_num, page in enumerate(pdf_reader.pages, start=1):
-                    page_text = self._extract_page_text(page, page_num)
-                    if page_text:
-                        text_parts.append(page_text)
-
-            return "\n\n".join(text_parts)
-
-        except ImportError:
-            raise ExtractionError(
-                message="PyPDF2 library not installed",
-                details="Install with: pip install PyPDF2",
-                file_path=str(file_path),
-            )
-        except Exception as e:
-            raise ExtractionError(
-                message=f"PDF extraction failed: {str(e)}",
-                file_path=str(file_path),
-            )
-
-    def _extract_page_text(self, page, page_num: int) -> str:
-        """
-        Extract text from a single page.
-
-        Args:
-            page: PyPDF2 page object
-            page_num: Page number for logging
-
-        Returns:
-            Extracted page text
-        """
-        try:
-            import PyPDF2
-
-            text = page.extract_text()
-            logger.debug(f"Extracted page {page_num}")
-            return text
-
-        except PyPDF2.errors.PdfReadError as e:
-            logger.warning(f"Failed to extract page {page_num}: {str(e)}")
-            return ""
-        except Exception as e:
-            logger.warning(f"Error on page {page_num}: {str(e)}")
-            return ""
-
-    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
-        """
-        Create source-neutral document metadata from file.
-
-        Args:
-            file_path: Path to the file
+            file_path: Path to the PDF file
+            result: Docling conversion result

        Returns:
            DocumentMetadata entity
        """
        stat = file_path.stat()

+        # Extract page count from Docling result
+        page_count = None
+        try:
+            if hasattr(result.document, 'pages'):
+                page_count = len(result.document.pages)
+        except Exception as e:
+            logger.warning(f"Could not extract page count: {str(e)}")
+
+        extra_metadata = {}
+        if page_count is not None:
+            extra_metadata['page_count'] = str(page_count)
+
        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            display_name=file_path.name,
            size_bytes=stat.st_size,
+            extra_metadata=extra_metadata,
        )
--- a/src/bootstrap.py
+++ b/src/bootstrap.py
@ -15,6 +15,7 @@ from .adapters.outgoing.chunkers.context import ChunkingContext
 from .adapters.outgoing.chunkers.fixed_size_chunker import FixedSizeChunker
 from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
 from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
+from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor
 from .adapters.outgoing.extractors.factory import ExtractorFactory
 from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
 from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
@ -118,6 +119,7 @@ class ApplicationContainer:
        # Register all extractors
        factory.register_extractor(PDFExtractor())
        factory.register_extractor(DocxExtractor())
+        factory.register_extractor(ExcelExtractor())
        factory.register_extractor(TxtExtractor())
        factory.register_extractor(MarkdownExtractor())
        factory.register_extractor(ZipExtractor())