use docling in extractors
This commit is contained in:
parent
ad163eb665
commit
2ccb38179d
@ -11,8 +11,7 @@ uvicorn[standard]==0.34.0
|
|||||||
python-multipart==0.0.20
|
python-multipart==0.0.20
|
||||||
|
|
||||||
# Document Processing - Extractors
|
# Document Processing - Extractors
|
||||||
PyPDF2==3.0.1 # PDF extraction
|
docling # Unified document extraction (PDF, DOCX, Excel)
|
||||||
python-docx==1.1.2 # DOCX extraction
|
|
||||||
|
|
||||||
# Cloud Storage
|
# Cloud Storage
|
||||||
boto3==1.35.94 # AWS S3 integration
|
boto3==1.35.94 # AWS S3 integration
|
||||||
|
|||||||
@ -1,13 +1,15 @@
|
|||||||
"""
|
"""
|
||||||
DOCX Extractor - Concrete implementation for Word document extraction.
|
DOCX Extractor - Concrete implementation for Word document extraction.
|
||||||
|
|
||||||
This adapter implements the IExtractor port using python-docx library.
|
This adapter implements the IExtractor port using Docling library.
|
||||||
It maps python-docx exceptions to domain exceptions.
|
It maps Docling exceptions to domain exceptions.
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
from ....core.domain.exceptions import (
|
from ....core.domain.exceptions import (
|
||||||
EmptyContentError,
|
EmptyContentError,
|
||||||
ExtractionError,
|
ExtractionError,
|
||||||
@ -21,22 +23,23 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class DocxExtractor(IExtractor):
|
class DocxExtractor(IExtractor):
|
||||||
"""
|
"""
|
||||||
Concrete DOCX extractor using python-docx.
|
Concrete DOCX extractor using Docling.
|
||||||
|
|
||||||
This adapter:
|
This adapter:
|
||||||
1. Extracts text from DOCX files using python-docx
|
1. Extracts text from DOCX files using Docling's DocumentConverter
|
||||||
2. Handles paragraphs and tables
|
2. Converts DOCX to Markdown format
|
||||||
3. Maps exceptions to domain exceptions
|
3. Extracts metadata from document
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
"""Initialize DOCX extractor."""
|
"""Initialize DOCX extractor with Docling converter."""
|
||||||
self._supported_extensions = ['docx']
|
self._supported_extensions = ['docx']
|
||||||
logger.debug("DocxExtractor initialized")
|
self._converter = DocumentConverter()
|
||||||
|
logger.debug("DocxExtractor initialized with Docling")
|
||||||
|
|
||||||
def extract(self, file_path: Path) -> Document:
|
def extract(self, file_path: Path) -> Document:
|
||||||
"""
|
"""
|
||||||
Extract text and metadata from DOCX file.
|
Extract text and metadata from DOCX file using Docling.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: Path to the DOCX file
|
file_path: Path to the DOCX file
|
||||||
@ -54,21 +57,22 @@ class DocxExtractor(IExtractor):
|
|||||||
# Validate file
|
# Validate file
|
||||||
self._validate_file(file_path)
|
self._validate_file(file_path)
|
||||||
|
|
||||||
# Extract text
|
# Convert DOCX to markdown using Docling
|
||||||
text = self._extract_text_from_docx(file_path)
|
result = self._converter.convert(str(file_path))
|
||||||
|
markdown_text = result.document.export_to_markdown()
|
||||||
|
|
||||||
# Validate content
|
# Validate content
|
||||||
if not text or not text.strip():
|
if not markdown_text or not markdown_text.strip():
|
||||||
raise EmptyContentError(file_path=str(file_path))
|
raise EmptyContentError(file_path=str(file_path))
|
||||||
|
|
||||||
# Create metadata
|
# Create metadata
|
||||||
metadata = self._create_metadata(file_path)
|
metadata = self._create_metadata(file_path)
|
||||||
|
|
||||||
# Build document with raw_markdown
|
# Build document with raw_markdown
|
||||||
document = Document(raw_markdown=text, metadata=metadata)
|
document = Document(raw_markdown=markdown_text, metadata=metadata)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Successfully extracted {len(text)} characters from {file_path.name}"
|
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
|
||||||
)
|
)
|
||||||
return document
|
return document
|
||||||
|
|
||||||
@ -130,83 +134,6 @@ class DocxExtractor(IExtractor):
|
|||||||
if file_path.stat().st_size == 0:
|
if file_path.stat().st_size == 0:
|
||||||
raise EmptyContentError(file_path=str(file_path))
|
raise EmptyContentError(file_path=str(file_path))
|
||||||
|
|
||||||
def _extract_text_from_docx(self, file_path: Path) -> str:
|
|
||||||
"""
|
|
||||||
Extract text from DOCX using python-docx.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
file_path: Path to DOCX file
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Extracted text content
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ExtractionError: If DOCX extraction fails
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
import docx
|
|
||||||
|
|
||||||
logger.debug(f"Reading DOCX: {file_path}")
|
|
||||||
document = docx.Document(file_path)
|
|
||||||
|
|
||||||
# Extract paragraphs
|
|
||||||
text_parts = self._extract_paragraphs(document)
|
|
||||||
|
|
||||||
# Extract tables
|
|
||||||
table_text = self._extract_tables(document)
|
|
||||||
if table_text:
|
|
||||||
text_parts.extend(table_text)
|
|
||||||
|
|
||||||
return "\n".join(text_parts)
|
|
||||||
|
|
||||||
except ImportError:
|
|
||||||
raise ExtractionError(
|
|
||||||
message="python-docx library not installed",
|
|
||||||
details="Install with: pip install python-docx",
|
|
||||||
file_path=str(file_path),
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
raise ExtractionError(
|
|
||||||
message=f"DOCX extraction failed: {str(e)}",
|
|
||||||
file_path=str(file_path),
|
|
||||||
)
|
|
||||||
|
|
||||||
def _extract_paragraphs(self, document) -> List[str]:
|
|
||||||
"""
|
|
||||||
Extract text from all paragraphs.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
document: python-docx Document object
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of paragraph texts
|
|
||||||
"""
|
|
||||||
paragraphs = []
|
|
||||||
for paragraph in document.paragraphs:
|
|
||||||
text = paragraph.text.strip()
|
|
||||||
if text:
|
|
||||||
paragraphs.append(text)
|
|
||||||
return paragraphs
|
|
||||||
|
|
||||||
def _extract_tables(self, document) -> List[str]:
|
|
||||||
"""
|
|
||||||
Extract text from all tables.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
document: python-docx Document object
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of table cell texts
|
|
||||||
"""
|
|
||||||
table_texts = []
|
|
||||||
for table in document.tables:
|
|
||||||
for row in table.rows:
|
|
||||||
for cell in row.cells:
|
|
||||||
text = cell.text.strip()
|
|
||||||
if text:
|
|
||||||
table_texts.append(text)
|
|
||||||
return table_texts
|
|
||||||
|
|
||||||
def _create_metadata(self, file_path: Path) -> DocumentMetadata:
|
def _create_metadata(self, file_path: Path) -> DocumentMetadata:
|
||||||
"""
|
"""
|
||||||
Create source-neutral document metadata from file.
|
Create source-neutral document metadata from file.
|
||||||
|
|||||||
154
src/adapters/outgoing/extractors/excel_extractor.py
Normal file
154
src/adapters/outgoing/extractors/excel_extractor.py
Normal file
@ -0,0 +1,154 @@
|
|||||||
|
"""
|
||||||
|
Excel Extractor - Concrete implementation for Excel file extraction.
|
||||||
|
|
||||||
|
This adapter implements the IExtractor port using Docling library.
|
||||||
|
It maps Docling exceptions to domain exceptions.
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
|
from ....core.domain.exceptions import (
|
||||||
|
EmptyContentError,
|
||||||
|
ExtractionError,
|
||||||
|
)
|
||||||
|
from ....core.domain.models import Document, DocumentMetadata, SourceType
|
||||||
|
from ....core.ports.outgoing.extractor import IExtractor
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ExcelExtractor(IExtractor):
    """
    Concrete Excel extractor using Docling.

    This adapter:
    1. Converts Excel workbooks to Markdown using Docling's DocumentConverter
    2. Wraps the Markdown in a Document entity
    3. Builds file-level metadata (absolute path, display name, size in bytes)
    4. Maps unexpected failures to ExtractionError

    NOTE(review): 'xls' is advertised as supported alongside 'xlsx'. Confirm
    that the installed Docling version accepts the legacy .xls format; if it
    does not, such files will surface as ExtractionError from extract().
    """

    def __init__(self) -> None:
        """Initialize Excel extractor with Docling converter."""
        self._supported_extensions = ['xlsx', 'xls']
        self._converter = DocumentConverter()
        logger.debug("ExcelExtractor initialized with Docling")

    def extract(self, file_path: Path) -> Document:
        """
        Extract text and metadata from Excel file using Docling.

        Args:
            file_path: Path to the Excel file

        Returns:
            Document entity whose raw_markdown is the Markdown export of the
            converted workbook, with file-level metadata attached

        Raises:
            ExtractionError: If the file is invalid or conversion fails
            EmptyContentError: If the file is empty or no text could be extracted
        """
        try:
            logger.info(f"Extracting text from Excel: {file_path}")

            # Validate file
            self._validate_file(file_path)

            # Convert Excel to markdown using Docling
            result = self._converter.convert(str(file_path))
            markdown_text = result.document.export_to_markdown()

            # Validate content
            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))

            # Create metadata
            metadata = self._create_metadata(file_path)

            # Build document with raw_markdown
            document = Document(raw_markdown=markdown_text, metadata=metadata)

            logger.info(
                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document

        except (EmptyContentError, ExtractionError):
            # Domain exceptions already carry the right context; pass through.
            raise
        except Exception as e:
            # logger.exception records the traceback, which a plain error
            # log would drop.
            logger.exception(f"Excel extraction failed for {file_path}: {str(e)}")
            # Chain explicitly so the original failure stays attached as the
            # direct cause of the domain exception.
            raise ExtractionError(
                message=f"Failed to extract text from {file_path.name}",
                details=str(e),
                file_path=str(file_path),
            ) from e

    def supports_file_type(self, file_extension: str) -> bool:
        """
        Check if this extractor supports the given file extension.

        The comparison is case-insensitive and tolerates a leading dot, so
        both 'xlsx' and the Path.suffix form '.xlsx' are accepted.

        Args:
            file_extension: File extension (e.g., 'xlsx', 'xls', '.xlsx')

        Returns:
            True if the extension is one of the supported Excel extensions
        """
        normalized = file_extension.lower().lstrip('.')
        return normalized in self._supported_extensions

    def get_supported_types(self) -> List[str]:
        """
        Get list of supported file extensions.

        Returns:
            A copy of the supported extensions ('xlsx' and 'xls'), so callers
            cannot mutate the extractor's internal list
        """
        return self._supported_extensions.copy()

    def _validate_file(self, file_path: Path) -> None:
        """
        Validate that the path exists, is a regular file and is not empty.

        Args:
            file_path: Path to validate

        Raises:
            ExtractionError: If the path does not exist or is not a file
            EmptyContentError: If the file has a size of zero bytes
        """
        if not file_path.exists():
            raise ExtractionError(
                message=f"File not found: {file_path}",
                file_path=str(file_path),
            )

        if not file_path.is_file():
            raise ExtractionError(
                message=f"Path is not a file: {file_path}",
                file_path=str(file_path),
            )

        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))

    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
        """
        Create document metadata from the Excel file on disk.

        Only file-system information is used; nothing is read from the
        workbook contents.

        Args:
            file_path: Path to the Excel file

        Returns:
            DocumentMetadata entity with the absolute path as source_id,
            SourceType.FILE as source type, the file name as display name
            and the file size in bytes
        """
        stat = file_path.stat()

        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            display_name=file_path.name,
            size_bytes=stat.st_size,
        )
|
||||||
@ -1,13 +1,15 @@
|
|||||||
"""
|
"""
|
||||||
PDF Extractor - Concrete implementation for PDF text extraction.
|
PDF Extractor - Concrete implementation for PDF text extraction.
|
||||||
|
|
||||||
This adapter implements the IExtractor port using PyPDF2 library.
|
This adapter implements the IExtractor port using Docling library.
|
||||||
It maps PyPDF2 exceptions to domain exceptions.
|
It maps Docling exceptions to domain exceptions.
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
from ....core.domain.exceptions import (
|
from ....core.domain.exceptions import (
|
||||||
EmptyContentError,
|
EmptyContentError,
|
||||||
ExtractionError,
|
ExtractionError,
|
||||||
@ -21,22 +23,23 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class PDFExtractor(IExtractor):
|
class PDFExtractor(IExtractor):
|
||||||
"""
|
"""
|
||||||
Concrete PDF extractor using PyPDF2.
|
Concrete PDF extractor using Docling.
|
||||||
|
|
||||||
This adapter:
|
This adapter:
|
||||||
1. Extracts text from PDF files using PyPDF2
|
1. Extracts text from PDF files using Docling's DocumentConverter
|
||||||
2. Maps PyPDF2 exceptions to domain exceptions
|
2. Converts PDF to Markdown format
|
||||||
3. Creates Document entities with metadata
|
3. Extracts metadata including page count
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
"""Initialize PDF extractor."""
|
"""Initialize PDF extractor with Docling converter."""
|
||||||
self._supported_extensions = ['pdf']
|
self._supported_extensions = ['pdf']
|
||||||
logger.debug("PDFExtractor initialized")
|
self._converter = DocumentConverter()
|
||||||
|
logger.debug("PDFExtractor initialized with Docling")
|
||||||
|
|
||||||
def extract(self, file_path: Path) -> Document:
|
def extract(self, file_path: Path) -> Document:
|
||||||
"""
|
"""
|
||||||
Extract text and metadata from PDF file.
|
Extract text and metadata from PDF file using Docling.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: Path to the PDF file
|
file_path: Path to the PDF file
|
||||||
@ -54,21 +57,22 @@ class PDFExtractor(IExtractor):
|
|||||||
# Validate file
|
# Validate file
|
||||||
self._validate_file(file_path)
|
self._validate_file(file_path)
|
||||||
|
|
||||||
# Extract text
|
# Convert PDF to markdown using Docling
|
||||||
text = self._extract_text_from_pdf(file_path)
|
result = self._converter.convert(str(file_path))
|
||||||
|
markdown_text = result.document.export_to_markdown()
|
||||||
|
|
||||||
# Validate content
|
# Validate content
|
||||||
if not text or not text.strip():
|
if not markdown_text or not markdown_text.strip():
|
||||||
raise EmptyContentError(file_path=str(file_path))
|
raise EmptyContentError(file_path=str(file_path))
|
||||||
|
|
||||||
# Create metadata
|
# Create metadata with page count from Docling result
|
||||||
metadata = self._create_metadata(file_path)
|
metadata = self._create_metadata(file_path, result)
|
||||||
|
|
||||||
# Build document with raw_markdown
|
# Build document with raw_markdown
|
||||||
document = Document(raw_markdown=text, metadata=metadata)
|
document = Document(raw_markdown=markdown_text, metadata=metadata)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Successfully extracted {len(text)} characters from {file_path.name}"
|
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
|
||||||
)
|
)
|
||||||
return document
|
return document
|
||||||
|
|
||||||
@ -130,89 +134,35 @@ class PDFExtractor(IExtractor):
|
|||||||
if file_path.stat().st_size == 0:
|
if file_path.stat().st_size == 0:
|
||||||
raise EmptyContentError(file_path=str(file_path))
|
raise EmptyContentError(file_path=str(file_path))
|
||||||
|
|
||||||
def _extract_text_from_pdf(self, file_path: Path) -> str:
|
def _create_metadata(self, file_path: Path, result) -> DocumentMetadata:
|
||||||
"""
|
"""
|
||||||
Extract text from PDF using PyPDF2.
|
Create document metadata from PDF file and Docling result.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: Path to PDF file
|
file_path: Path to the PDF file
|
||||||
|
result: Docling conversion result
|
||||||
Returns:
|
|
||||||
Extracted text content
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ExtractionError: If PDF extraction fails
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
import PyPDF2
|
|
||||||
|
|
||||||
logger.debug(f"Reading PDF: {file_path}")
|
|
||||||
text_parts = []
|
|
||||||
|
|
||||||
with open(file_path, 'rb') as pdf_file:
|
|
||||||
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
|
||||||
num_pages = len(pdf_reader.pages)
|
|
||||||
logger.debug(f"PDF has {num_pages} pages")
|
|
||||||
|
|
||||||
for page_num, page in enumerate(pdf_reader.pages, start=1):
|
|
||||||
page_text = self._extract_page_text(page, page_num)
|
|
||||||
if page_text:
|
|
||||||
text_parts.append(page_text)
|
|
||||||
|
|
||||||
return "\n\n".join(text_parts)
|
|
||||||
|
|
||||||
except ImportError:
|
|
||||||
raise ExtractionError(
|
|
||||||
message="PyPDF2 library not installed",
|
|
||||||
details="Install with: pip install PyPDF2",
|
|
||||||
file_path=str(file_path),
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
raise ExtractionError(
|
|
||||||
message=f"PDF extraction failed: {str(e)}",
|
|
||||||
file_path=str(file_path),
|
|
||||||
)
|
|
||||||
|
|
||||||
def _extract_page_text(self, page, page_num: int) -> str:
|
|
||||||
"""
|
|
||||||
Extract text from a single page.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
page: PyPDF2 page object
|
|
||||||
page_num: Page number for logging
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Extracted page text
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
import PyPDF2
|
|
||||||
|
|
||||||
text = page.extract_text()
|
|
||||||
logger.debug(f"Extracted page {page_num}")
|
|
||||||
return text
|
|
||||||
|
|
||||||
except PyPDF2.errors.PdfReadError as e:
|
|
||||||
logger.warning(f"Failed to extract page {page_num}: {str(e)}")
|
|
||||||
return ""
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Error on page {page_num}: {str(e)}")
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def _create_metadata(self, file_path: Path) -> DocumentMetadata:
|
|
||||||
"""
|
|
||||||
Create source-neutral document metadata from file.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
file_path: Path to the file
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
DocumentMetadata entity
|
DocumentMetadata entity
|
||||||
"""
|
"""
|
||||||
stat = file_path.stat()
|
stat = file_path.stat()
|
||||||
|
|
||||||
|
# Extract page count from Docling result
|
||||||
|
page_count = None
|
||||||
|
try:
|
||||||
|
if hasattr(result.document, 'pages'):
|
||||||
|
page_count = len(result.document.pages)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Could not extract page count: {str(e)}")
|
||||||
|
|
||||||
|
extra_metadata = {}
|
||||||
|
if page_count is not None:
|
||||||
|
extra_metadata['page_count'] = str(page_count)
|
||||||
|
|
||||||
return DocumentMetadata(
|
return DocumentMetadata(
|
||||||
source_id=str(file_path.absolute()),
|
source_id=str(file_path.absolute()),
|
||||||
source_type=SourceType.FILE,
|
source_type=SourceType.FILE,
|
||||||
display_name=file_path.name,
|
display_name=file_path.name,
|
||||||
size_bytes=stat.st_size,
|
size_bytes=stat.st_size,
|
||||||
|
extra_metadata=extra_metadata,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -15,6 +15,7 @@ from .adapters.outgoing.chunkers.context import ChunkingContext
|
|||||||
from .adapters.outgoing.chunkers.fixed_size_chunker import FixedSizeChunker
|
from .adapters.outgoing.chunkers.fixed_size_chunker import FixedSizeChunker
|
||||||
from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
|
from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
|
||||||
from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
|
from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
|
||||||
|
from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor
|
||||||
from .adapters.outgoing.extractors.factory import ExtractorFactory
|
from .adapters.outgoing.extractors.factory import ExtractorFactory
|
||||||
from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
|
from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
|
||||||
from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
|
from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
|
||||||
@ -118,6 +119,7 @@ class ApplicationContainer:
|
|||||||
# Register all extractors
|
# Register all extractors
|
||||||
factory.register_extractor(PDFExtractor())
|
factory.register_extractor(PDFExtractor())
|
||||||
factory.register_extractor(DocxExtractor())
|
factory.register_extractor(DocxExtractor())
|
||||||
|
factory.register_extractor(ExcelExtractor())
|
||||||
factory.register_extractor(TxtExtractor())
|
factory.register_extractor(TxtExtractor())
|
||||||
factory.register_extractor(MarkdownExtractor())
|
factory.register_extractor(MarkdownExtractor())
|
||||||
factory.register_extractor(ZipExtractor())
|
factory.register_extractor(ZipExtractor())
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user