Compare commits: 91f8035043 ... 80dd901e42

8 Commits

| SHA1 |
|---|
| 80dd901e42 |
| 9e1e49bc59 |
| cda128e438 |
| 8ecbd88498 |
| 3aad734140 |
| c6302bc792 |
| 2ccb38179d |
| ad163eb665 |
@@ -11,8 +11,7 @@ uvicorn[standard]==0.34.0
 python-multipart==0.0.20

 # Document Processing - Extractors
-PyPDF2==3.0.1  # PDF extraction
-python-docx==1.1.2  # DOCX extraction
+docling  # Unified document extraction (PDF, DOCX, Excel)

 # Cloud Storage
 boto3==1.35.94  # AWS S3 integration
@@ -18,7 +18,11 @@ from pathlib import Path
 from typing import Iterator, List, Optional

 from fastapi import APIRouter, Depends, FastAPI, File, Form, HTTPException, UploadFile, status
+from fastapi.openapi.docs import get_swagger_ui_html, get_redoc_html
 from fastapi.responses import JSONResponse
+from fastapi.security import HTTPBasicCredentials

+from .auth import check_docs_credentials, validate_api_key
+
 from ...core.config import get_settings
 from ...core.domain.exceptions import (

@@ -41,11 +45,6 @@ from .api_schemas import (

 logger = logging.getLogger(__name__)

-
-# =============================================================================
-# Application Setup
-# =============================================================================
-
 # Load settings
 settings = get_settings()

@@ -53,12 +52,19 @@ app = FastAPI(
     title="Text Processor API",
     description="Text extraction and chunking system using Hexagonal Architecture",
     version="1.0.0",
-    docs_url="/docs",
-    redoc_url="/redoc",
+    # docs_url=None,
+    # redoc_url=None,
 )

-router = APIRouter(prefix="/api/v1", tags=["Text Processing"])
+router = APIRouter(
+    prefix="/api/v1",
+    tags=["Text Processing"],
+    dependencies=[Depends(validate_api_key)]
+)

+public_router = APIRouter(
+    tags=["System"],
+)

 # =============================================================================
 # Global Exception Handler

@@ -101,7 +107,7 @@ def get_service() -> ITextProcessor:

 def get_chunking_strategy(
     strategy_name: ChunkingMethod = Form(..., description="Chunking method"),
-    chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
+    chunk_size: int = Form(512, description="Target chunk size in characters", ge=1, le=10000),
     overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
     respect_boundaries: bool = Form(True, description="Respect text boundaries"),
 ) -> ChunkingStrategy:

@@ -210,8 +216,6 @@ def to_chunk_responses(chunks: List[Chunk]) -> List[ChunkResponse]:
             document_id=str(chunk.document_id),
             content=chunk.content,
             sequence_number=chunk.sequence_number,
-            start_char=chunk.start_char,
-            end_char=chunk.end_char,
             length=chunk.get_length(),
         )
         for chunk in chunks

@@ -231,8 +235,8 @@ def to_chunk_responses(chunks: List[Chunk]) -> List[ChunkResponse]:
 )
 async def perform_chunking(
     file: Optional[UploadFile] = File(None, description="Markdown file (.md) to upload"),
-    text: Optional[str] = Form(None, description="Markdown text to process", json_schema_extra={"x-textarea": True}),
-    title: str = Form("markdown_input", description="Optional title for the document"),
+    text: Optional[str] = Form('', description="Markdown text to process"),
+    title: Optional[str] = Form('', description="Optional title for the document"),
     strategy: ChunkingStrategy = Depends(get_chunking_strategy),
     service: ITextProcessor = Depends(get_service),
 ) -> ChunkListResponse:

@@ -339,7 +343,7 @@ async def process_file(
 )


-@router.get(
+@public_router.get(
     "/health",
     response_model=HealthCheckResponse,
     status_code=status.HTTP_200_OK,

@@ -356,21 +360,29 @@ async def health_check() -> HealthCheckResponse:
     )


+# =============================================================================
+# Protected Documentation Routes
+# =============================================================================
+
+# @app.get("/docs", include_in_schema=False)
+# def api_docs(_: HTTPBasicCredentials = Depends(check_docs_credentials)):
+#     return get_swagger_ui_html(
+#         openapi_url="/openapi.json",
+#         title="Protected Text-Processor API Docs"
+#     )
+#
+#
+# @app.get("/redoc", include_in_schema=False)
+# def api_docs(_: HTTPBasicCredentials = Depends(check_docs_credentials)):
+#     return get_redoc_html(
+#         openapi_url="/openapi.json",
+#         title="Protected Text-Processor API Docs"
+#     )

 # =============================================================================
 # Application Setup
 # =============================================================================

-# Include router in app
+# Include routers in app
 app.include_router(router)
-
-@app.get("/")
-async def root():
-    """Root endpoint with API information."""
-    return {
-        "name": "Text Processor API",
-        "version": "1.0.0",
-        "description": "Text extraction and chunking system using Hexagonal Architecture",
-        "docs_url": "/docs",
-        "api_prefix": "/api/v1",
-    }
+app.include_router(public_router)

@@ -101,8 +101,6 @@ class ChunkResponse(BaseModel):
     document_id: str
     content: str
     sequence_number: int
-    start_char: int
-    end_char: int
     length: int

src/adapters/incoming/auth.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+import secrets
+from fastapi import Depends, HTTPException, Security, status
+from fastapi.security import APIKeyHeader, HTTPBasic, HTTPBasicCredentials
+from ...core.config import get_settings
+
+settings = get_settings()
+# This allows Swagger UI to detect the "Authorize" button
+api_key_header = APIKeyHeader(name=settings.API_KEY_NAME, auto_error=False)
+http_basic = HTTPBasic()
+
+async def validate_api_key(api_key: str = Security(api_key_header)):
+    """
+    Validates the X-API-Key header.
+    Using secrets.compare_digest protects against timing attacks.
+    """
+    if not api_key or not secrets.compare_digest(api_key, settings.API_KEY):
+        raise HTTPException(
+            status_code=status.HTTP_403_FORBIDDEN,
+            detail="Could not validate credentials. Invalid or missing API Key.",
+        )
+    return api_key
+
+
+security = HTTPBasic()
+
+def check_docs_credentials(credentials: HTTPBasicCredentials = Depends(security)):
+    is_correct_user = secrets.compare_digest(credentials.username, settings.DOCS_USERNAME)
+    is_correct_password = secrets.compare_digest(credentials.password, settings.DOCS_PASSWORD)
+
+    if not (is_correct_user and is_correct_password):
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED,
+            headers={"WWW-Authenticate": "Basic"},
+        )
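For orientation, here is a minimal client-side sketch of calling the API once these auth changes land. Everything not shown in the diff is an assumption: the base URL, the route path under /api/v1, and the example key value; the header name follows the API_KEY_NAME default ("API-Key"), and the form fields mirror the perform_chunking parameters above.

```python
# Hypothetical usage sketch; the exact chunking route path is not visible in this diff.
import requests

API_BASE = "http://localhost:8000"   # assumption: local dev server
API_KEY = "some-secret-api-key"      # must match settings.API_KEY on the server

response = requests.post(
    f"{API_BASE}/api/v1/chunk",                   # hypothetical path on the protected router
    headers={"API-Key": API_KEY},                 # header name from settings.API_KEY_NAME
    data={
        "text": "# Title\n\nSome markdown to chunk.",
        "strategy_name": "fixed_size",            # assumption: one of the ChunkingMethod values
        "chunk_size": 512,
        "overlap_size": 0,
        "respect_boundaries": "true",
    },
)
response.raise_for_status()
print(response.json())
```

A missing or wrong key should come back as 403 from validate_api_key, while /health stays reachable without a key because it is registered on public_router.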
@@ -70,8 +70,8 @@ class FixedSizeChunker(IChunker):
             chunks = self._chunk_by_sections(document, strategy)
         else:
             # Standard chunking: process entire raw_markdown
-            segments = self._split_into_segments(document.raw_markdown, strategy)
-            chunks = self._create_chunks(segments, document.id)
+            chunk_texts = self._split_into_segments(document.raw_markdown, strategy)
+            chunks = self._create_chunks(chunk_texts, document.id)

         logger.info(f"Created {len(chunks)} fixed-size chunks")
         return chunks

@@ -136,7 +136,7 @@ class FixedSizeChunker(IChunker):
         self,
         text: str,
         strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
         """
         Split text into fixed-size segments.

@@ -145,7 +145,7 @@ class FixedSizeChunker(IChunker):
             strategy: Chunking strategy configuration

         Returns:
-            List of (chunk_text, start_position, end_position) tuples
+            List of chunk text strings
         """
         segments = []
         text_length = len(text)

@@ -155,7 +155,7 @@ class FixedSizeChunker(IChunker):
         position = 0

         while position < text_length:
-            segment = self._extract_segment(
+            chunk_text = self._extract_segment(
                 text=text,
                 position=position,
                 chunk_size=chunk_size,

@@ -163,10 +163,8 @@ class FixedSizeChunker(IChunker):
                 respect_boundaries=strategy.respect_boundaries,
             )

-            if segment:
-                chunk_text, start_pos, end_pos = segment
-                if chunk_text.strip():
-                    segments.append((chunk_text, start_pos, end_pos))
+            if chunk_text and chunk_text.strip():
+                segments.append(chunk_text)

             position += step_size

@@ -183,7 +181,7 @@ class FixedSizeChunker(IChunker):
         chunk_size: int,
         text_length: int,
         respect_boundaries: bool,
-    ) -> tuple[str, int, int] | None:
+    ) -> str:
         """
         Extract a single segment from text.

@@ -195,16 +193,15 @@ class FixedSizeChunker(IChunker):
             respect_boundaries: Whether to respect boundaries

         Returns:
-            Tuple of (chunk_text, start_pos, end_pos) or None
+            Chunk text string
         """
         end_pos = min(position + chunk_size, text_length)
         chunk_text = text[position:end_pos]

         if respect_boundaries and end_pos < text_length:
             chunk_text = self._adjust_to_boundary(text, position, end_pos)
-            end_pos = position + len(chunk_text)

-        return (chunk_text, position, end_pos)
+        return chunk_text

     def _adjust_to_boundary(
         self,

@@ -258,17 +255,15 @@ class FixedSizeChunker(IChunker):
         global_sequence = 0

         for section_index, section in enumerate(document.sections):
-            # Split this section's content into segments
-            segments = self._split_into_segments(section.content, strategy)
+            # Split this section's content into chunks
+            chunk_texts = self._split_into_segments(section.content, strategy)

             # Create chunks for this section
-            for text, start_char, end_char in segments:
+            for text in chunk_texts:
                 chunk = Chunk(
                     document_id=document.id,
                     content=text,
                     sequence_number=global_sequence,
-                    start_char=start_char,
-                    end_char=end_char,
                     section_title=section.title,
                     section_index=section_index,
                 )

@@ -282,16 +277,16 @@ class FixedSizeChunker(IChunker):

     def _create_chunks(
         self,
-        segments: List[tuple[str, int, int]],
+        chunk_texts: List[str],
         document_id,
         section_title: Optional[str] = None,
         section_index: Optional[int] = None,
     ) -> List[Chunk]:
         """
-        Create Chunk entities from text segments.
+        Create Chunk entities from text strings.

         Args:
-            segments: List of (text, start_pos, end_pos) tuples
+            chunk_texts: List of chunk text strings
             document_id: ID of parent document
             section_title: Optional section title
             section_index: Optional section index

@@ -301,13 +296,11 @@ class FixedSizeChunker(IChunker):
         """
         chunks = []

-        for sequence_number, (text, start_char, end_char) in enumerate(segments):
+        for sequence_number, text in enumerate(chunk_texts):
             chunk = Chunk(
                 document_id=document_id,
                 content=text,
                 sequence_number=sequence_number,
-                start_char=start_char,
-                end_char=end_char,
                 section_title=section_title,
                 section_index=section_index,
             )
@@ -70,8 +70,8 @@ class ParagraphChunker(IChunker):
             chunks = self._chunk_by_sections(document, strategy)
         else:
             # Standard chunking: process entire raw_markdown
-            segments = self._split_and_group_paragraphs(document.raw_markdown, strategy)
-            chunks = self._create_chunks(segments, document.id)
+            chunk_texts = self._split_and_group_paragraphs(document.raw_markdown, strategy)
+            chunks = self._create_chunks(chunk_texts, document.id)

         logger.info(f"Created {len(chunks)} paragraph-based chunks")
         return chunks

@@ -136,7 +136,7 @@ class ParagraphChunker(IChunker):
         self,
         text: str,
         strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
         """
         Split text into paragraphs and group them into chunks.

@@ -145,14 +145,14 @@ class ParagraphChunker(IChunker):
             strategy: Chunking strategy configuration

         Returns:
-            List of (chunk_text, start_position, end_position) tuples
+            List of chunk text strings
         """
         # Split into paragraphs
         paragraphs = logic_utils.split_into_paragraphs(text)

         if not paragraphs:
             # No paragraphs found, return whole text as single chunk
-            return [(text, 0, len(text))]
+            return [text]

         # Group paragraphs into chunks
         return self._group_paragraphs(paragraphs, strategy)

@@ -161,7 +161,7 @@ class ParagraphChunker(IChunker):
         self,
         paragraphs: List[str],
         strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
         """
         Group paragraphs into chunks based on target size.

@@ -170,12 +170,11 @@ class ParagraphChunker(IChunker):
             strategy: Chunking strategy

         Returns:
-            List of (chunk_text, start_pos, end_pos) tuples
+            List of chunk text strings
         """
         segments = []
         current_paragraphs = []
         current_size = 0
-        current_start = 0

         for paragraph in paragraphs:
             para_size = len(paragraph)

@@ -185,13 +184,11 @@ class ParagraphChunker(IChunker):
                 current_size, para_size, strategy.chunk_size, current_paragraphs
             ):
                 # Create chunk from accumulated paragraphs
-                segment = self._create_segment(
-                    current_paragraphs, current_start
-                )
+                segment = self._create_segment(current_paragraphs)
                 segments.append(segment)

                 # Handle overlap
-                current_paragraphs, current_start, current_size = (
+                current_paragraphs, current_size = (
                     self._handle_overlap(
                         segment, paragraph, para_size, strategy.overlap_size
                     )

@@ -203,7 +200,7 @@ class ParagraphChunker(IChunker):

         # Add final chunk
         if current_paragraphs:
-            segment = self._create_segment(current_paragraphs, current_start)
+            segment = self._create_segment(current_paragraphs)
             segments.append(segment)

         logger.debug(

@@ -237,56 +234,49 @@ class ParagraphChunker(IChunker):
     def _create_segment(
         self,
         paragraphs: List[str],
-        start_pos: int,
-    ) -> tuple[str, int, int]:
+    ) -> str:
         """
         Create a segment from paragraphs.

         Args:
             paragraphs: List of paragraph strings
-            start_pos: Starting position

         Returns:
-            Tuple of (chunk_text, start_pos, end_pos)
+            Chunk text string
         """
-        chunk_text = "\n\n".join(paragraphs)
-        end_pos = start_pos + len(chunk_text)
-        return (chunk_text, start_pos, end_pos)
+        return "\n\n".join(paragraphs)

     def _handle_overlap(
         self,
-        previous_segment: tuple[str, int, int],
+        previous_segment: str,
         new_paragraph: str,
         new_para_size: int,
         overlap_size: int,
-    ) -> tuple[List[str], int, int]:
+    ) -> tuple[List[str], int]:
        """
        Handle overlap between chunks.

        Args:
-            previous_segment: Previous chunk segment
+            previous_segment: Previous chunk text
            new_paragraph: New paragraph to start with
            new_para_size: Size of new paragraph
            overlap_size: Desired overlap size

        Returns:
-            Tuple of (new_paragraphs, new_start, new_size)
+            Tuple of (new_paragraphs, new_size)
        """
        if overlap_size > 0:
-            prev_text, _, prev_end = previous_segment
            overlap_text = logic_utils.calculate_overlap_text(
-                text=prev_text,
+                text=previous_segment,
                overlap_size=overlap_size,
                from_start=False,
            )
            return (
                [overlap_text, new_paragraph],
-                prev_end - len(overlap_text),
                len(overlap_text) + new_para_size,
            )
        else:
-            _, _, prev_end = previous_segment
-            return ([new_paragraph], prev_end, new_para_size)
+            return ([new_paragraph], new_para_size)

     def _chunk_by_sections(
         self,

@@ -297,6 +287,7 @@ class ParagraphChunker(IChunker):
         Chunk document by processing each section independently.

         This prevents chunks from spanning across section boundaries.
+        Each chunk is prefixed with the document title and section title.

         Args:
             document: Document with sections

@@ -308,18 +299,22 @@ class ParagraphChunker(IChunker):
         all_chunks = []
         global_sequence = 0

-        for section_index, section in enumerate(document.sections):
-            # Split this section's content into paragraph-based segments
-            segments = self._split_and_group_paragraphs(section.content, strategy)
+        # Get document title from metadata
+        document_title = document.metadata.display_name

-            # Create chunks for this section
-            for text, start_char, end_char in segments:
+        for section_index, section in enumerate(document.sections):
+            # Split this section's content into paragraph-based chunks
+            chunk_texts = self._split_and_group_paragraphs(section.content, strategy)
+
+            # Create chunks for this section with title prefix
+            for text in chunk_texts:
+                # Prepend document title and section title to chunk content
+                prefixed_content = f"{document_title}\n{section.title}\n{text}"
+
                 chunk = Chunk(
                     document_id=document.id,
-                    content=text,
+                    content=prefixed_content,
                     sequence_number=global_sequence,
-                    start_char=start_char,
-                    end_char=end_char,
                     section_title=section.title,
                     section_index=section_index,
                 )
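A quick illustration of the new title prefixing (the values below are invented): each section-scoped chunk now starts with the document's display name and the section title, followed by the chunk text.

```python
# Illustration only; names and text are made up for the example.
document_title = "user_guide"      # document.metadata.display_name
section_title = "Installation"     # section.title
text = "Install the package and run the server."

prefixed_content = f"{document_title}\n{section_title}\n{text}"
print(prefixed_content)
# user_guide
# Installation
# Install the package and run the server.
```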
@@ -333,16 +328,16 @@ class ParagraphChunker(IChunker):

     def _create_chunks(
         self,
-        segments: List[tuple[str, int, int]],
+        chunk_texts: List[str],
         document_id,
         section_title: Optional[str] = None,
         section_index: Optional[int] = None,
     ) -> List[Chunk]:
         """
-        Create Chunk entities from text segments.
+        Create Chunk entities from text strings.

         Args:
-            segments: List of (text, start_pos, end_pos) tuples
+            chunk_texts: List of chunk text strings
             document_id: ID of parent document
             section_title: Optional section title
             section_index: Optional section index

@@ -352,13 +347,11 @@ class ParagraphChunker(IChunker):
         """
         chunks = []

-        for sequence_number, (text, start_char, end_char) in enumerate(segments):
+        for sequence_number, text in enumerate(chunk_texts):
             chunk = Chunk(
                 document_id=document_id,
                 content=text,
                 sequence_number=sequence_number,
-                start_char=start_char,
-                end_char=end_char,
                 section_title=section_title,
                 section_index=section_index,
             )
@@ -1,13 +1,15 @@
 """
 DOCX Extractor - Concrete implementation for Word document extraction.

-This adapter implements the IExtractor port using python-docx library.
-It maps python-docx exceptions to domain exceptions.
+This adapter implements the IExtractor port using Docling library.
+It maps Docling exceptions to domain exceptions.
 """
 import logging
 from pathlib import Path
 from typing import List

+from docling.document_converter import DocumentConverter
+
 from ....core.domain.exceptions import (
     EmptyContentError,
     ExtractionError,

@@ -21,22 +23,23 @@ logger = logging.getLogger(__name__)

 class DocxExtractor(IExtractor):
     """
-    Concrete DOCX extractor using python-docx.
+    Concrete DOCX extractor using Docling.

     This adapter:
-    1. Extracts text from DOCX files using python-docx
-    2. Handles paragraphs and tables
-    3. Maps exceptions to domain exceptions
+    1. Extracts text from DOCX files using Docling's DocumentConverter
+    2. Converts DOCX to Markdown format
+    3. Extracts metadata from document
     """

     def __init__(self) -> None:
-        """Initialize DOCX extractor."""
+        """Initialize DOCX extractor with Docling converter."""
         self._supported_extensions = ['docx']
-        logger.debug("DocxExtractor initialized")
+        self._converter = DocumentConverter()
+        logger.debug("DocxExtractor initialized with Docling")

     def extract(self, file_path: Path) -> Document:
         """
-        Extract text and metadata from DOCX file.
+        Extract text and metadata from DOCX file using Docling.

         Args:
             file_path: Path to the DOCX file

@@ -54,21 +57,22 @@ class DocxExtractor(IExtractor):
             # Validate file
             self._validate_file(file_path)

-            # Extract text
-            text = self._extract_text_from_docx(file_path)
+            # Convert DOCX to markdown using Docling
+            result = self._converter.convert(str(file_path))
+            markdown_text = result.document.export_to_markdown()

             # Validate content
-            if not text or not text.strip():
+            if not markdown_text or not markdown_text.strip():
                 raise EmptyContentError(file_path=str(file_path))

             # Create metadata
             metadata = self._create_metadata(file_path)

             # Build document with raw_markdown
-            document = Document(raw_markdown=text, metadata=metadata)
+            document = Document(raw_markdown=markdown_text, metadata=metadata)

             logger.info(
-                f"Successfully extracted {len(text)} characters from {file_path.name}"
+                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
             )
             return document

@@ -130,83 +134,6 @@ class DocxExtractor(IExtractor):
         if file_path.stat().st_size == 0:
             raise EmptyContentError(file_path=str(file_path))

-    def _extract_text_from_docx(self, file_path: Path) -> str:
-        """
-        Extract text from DOCX using python-docx.
-
-        Args:
-            file_path: Path to DOCX file
-
-        Returns:
-            Extracted text content
-
-        Raises:
-            ExtractionError: If DOCX extraction fails
-        """
-        try:
-            import docx
-
-            logger.debug(f"Reading DOCX: {file_path}")
-            document = docx.Document(file_path)
-
-            # Extract paragraphs
-            text_parts = self._extract_paragraphs(document)
-
-            # Extract tables
-            table_text = self._extract_tables(document)
-            if table_text:
-                text_parts.extend(table_text)
-
-            return "\n".join(text_parts)
-
-        except ImportError:
-            raise ExtractionError(
-                message="python-docx library not installed",
-                details="Install with: pip install python-docx",
-                file_path=str(file_path),
-            )
-        except Exception as e:
-            raise ExtractionError(
-                message=f"DOCX extraction failed: {str(e)}",
-                file_path=str(file_path),
-            )
-
-    def _extract_paragraphs(self, document) -> List[str]:
-        """
-        Extract text from all paragraphs.
-
-        Args:
-            document: python-docx Document object
-
-        Returns:
-            List of paragraph texts
-        """
-        paragraphs = []
-        for paragraph in document.paragraphs:
-            text = paragraph.text.strip()
-            if text:
-                paragraphs.append(text)
-        return paragraphs
-
-    def _extract_tables(self, document) -> List[str]:
-        """
-        Extract text from all tables.
-
-        Args:
-            document: python-docx Document object
-
-        Returns:
-            List of table cell texts
-        """
-        table_texts = []
-        for table in document.tables:
-            for row in table.rows:
-                for cell in row.cells:
-                    text = cell.text.strip()
-                    if text:
-                        table_texts.append(text)
-        return table_texts
-
     def _create_metadata(self, file_path: Path) -> DocumentMetadata:
         """
         Create source-neutral document metadata from file.

@@ -222,6 +149,6 @@ class DocxExtractor(IExtractor):
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.name,
+            display_name=file_path.stem,
             size_bytes=stat.st_size,
         )
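All of the Docling-based extractors in this comparison follow the same convert-then-export pattern. A stand-alone sketch of that call sequence; the file name is a placeholder, and the two calls shown are the same ones the PDF, DOCX, and Excel adapters use internally:

```python
# Minimal sketch of the Docling calls used by the new extractors.
from docling.document_converter import DocumentConverter

converter = DocumentConverter()                        # built once and reused, as in the extractors' __init__
result = converter.convert("report.docx")              # placeholder path; PDF and Excel files work the same way
markdown_text = result.document.export_to_markdown()

print(markdown_text[:200])                             # first 200 characters of the Markdown output
```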
src/adapters/outgoing/extractors/excel_extractor.py (new file, 154 lines)
@@ -0,0 +1,154 @@
+"""
+Excel Extractor - Concrete implementation for Excel file extraction.
+
+This adapter implements the IExtractor port using Docling library.
+It maps Docling exceptions to domain exceptions.
+"""
+import logging
+from pathlib import Path
+from typing import List
+
+from docling.document_converter import DocumentConverter
+
+from ....core.domain.exceptions import (
+    EmptyContentError,
+    ExtractionError,
+)
+from ....core.domain.models import Document, DocumentMetadata, SourceType
+from ....core.ports.outgoing.extractor import IExtractor
+
+
+logger = logging.getLogger(__name__)
+
+
+class ExcelExtractor(IExtractor):
+    """
+    Concrete Excel extractor using Docling.
+
+    This adapter:
+    1. Extracts text from Excel files (.xlsx, .xls) using Docling's DocumentConverter
+    2. Converts Excel to Markdown format
+    3. Extracts metadata from spreadsheet
+    """
+
+    def __init__(self) -> None:
+        """Initialize Excel extractor with Docling converter."""
+        self._supported_extensions = ['xlsx', 'xls']
+        self._converter = DocumentConverter()
+        logger.debug("ExcelExtractor initialized with Docling")
+
+    def extract(self, file_path: Path) -> Document:
+        """
+        Extract text and metadata from Excel file using Docling.
+
+        Args:
+            file_path: Path to the Excel file
+
+        Returns:
+            Document entity with extracted content and metadata
+
+        Raises:
+            ExtractionError: If extraction fails
+            EmptyContentError: If no text could be extracted
+        """
+        try:
+            logger.info(f"Extracting text from Excel: {file_path}")
+
+            # Validate file
+            self._validate_file(file_path)
+
+            # Convert Excel to markdown using Docling
+            result = self._converter.convert(str(file_path))
+            markdown_text = result.document.export_to_markdown()
+
+            # Validate content
+            if not markdown_text or not markdown_text.strip():
+                raise EmptyContentError(file_path=str(file_path))
+
+            # Create metadata
+            metadata = self._create_metadata(file_path)
+
+            # Build document with raw_markdown
+            document = Document(raw_markdown=markdown_text, metadata=metadata)
+
+            logger.info(
+                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
+            )
+            return document
+
+        except EmptyContentError:
+            raise
+        except ExtractionError:
+            raise
+        except Exception as e:
+            logger.error(f"Excel extraction failed for {file_path}: {str(e)}")
+            raise ExtractionError(
+                message=f"Failed to extract text from {file_path.name}",
+                details=str(e),
+                file_path=str(file_path),
+            )
+
+    def supports_file_type(self, file_extension: str) -> bool:
+        """
+        Check if this extractor supports Excel files.
+
+        Args:
+            file_extension: File extension (e.g., 'xlsx', 'xls')
+
+        Returns:
+            True if Excel files are supported
+        """
+        return file_extension.lower() in self._supported_extensions
+
+    def get_supported_types(self) -> List[str]:
+        """
+        Get list of supported file extensions.
+
+        Returns:
+            List containing 'xlsx' and 'xls'
+        """
+        return self._supported_extensions.copy()
+
+    def _validate_file(self, file_path: Path) -> None:
+        """
+        Validate file exists and is readable.
+
+        Args:
+            file_path: Path to validate
+
+        Raises:
+            ExtractionError: If file is invalid
+        """
+        if not file_path.exists():
+            raise ExtractionError(
+                message=f"File not found: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if not file_path.is_file():
+            raise ExtractionError(
+                message=f"Path is not a file: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if file_path.stat().st_size == 0:
+            raise EmptyContentError(file_path=str(file_path))
+
+    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
+        """
+        Create document metadata from Excel file.
+
+        Args:
+            file_path: Path to the Excel file
+
+        Returns:
+            DocumentMetadata entity
+        """
+        stat = file_path.stat()
+
+        return DocumentMetadata(
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            display_name=file_path.stem,
+            size_bytes=stat.st_size,
+        )
@@ -181,6 +181,6 @@ class MarkdownExtractor(IExtractor):
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.name,
+            display_name=file_path.stem,
             size_bytes=stat.st_size,
         )

@@ -1,13 +1,15 @@
 """
 PDF Extractor - Concrete implementation for PDF text extraction.

-This adapter implements the IExtractor port using PyPDF2 library.
-It maps PyPDF2 exceptions to domain exceptions.
+This adapter implements the IExtractor port using Docling library.
+It maps Docling exceptions to domain exceptions.
 """
 import logging
 from pathlib import Path
 from typing import List

+from docling.document_converter import DocumentConverter
+
 from ....core.domain.exceptions import (
     EmptyContentError,
     ExtractionError,

@@ -21,22 +23,23 @@ logger = logging.getLogger(__name__)

 class PDFExtractor(IExtractor):
     """
-    Concrete PDF extractor using PyPDF2.
+    Concrete PDF extractor using Docling.

     This adapter:
-    1. Extracts text from PDF files using PyPDF2
-    2. Maps PyPDF2 exceptions to domain exceptions
-    3. Creates Document entities with metadata
+    1. Extracts text from PDF files using Docling's DocumentConverter
+    2. Converts PDF to Markdown format
+    3. Extracts metadata including page count
     """

     def __init__(self) -> None:
-        """Initialize PDF extractor."""
+        """Initialize PDF extractor with Docling converter."""
         self._supported_extensions = ['pdf']
-        logger.debug("PDFExtractor initialized")
+        self._converter = DocumentConverter()
+        logger.debug("PDFExtractor initialized with Docling")

     def extract(self, file_path: Path) -> Document:
         """
-        Extract text and metadata from PDF file.
+        Extract text and metadata from PDF file using Docling.

         Args:
             file_path: Path to the PDF file

@@ -54,21 +57,22 @@ class PDFExtractor(IExtractor):
             # Validate file
             self._validate_file(file_path)

-            # Extract text
-            text = self._extract_text_from_pdf(file_path)
+            # Convert PDF to markdown using Docling
+            result = self._converter.convert(str(file_path))
+            markdown_text = result.document.export_to_markdown()

             # Validate content
-            if not text or not text.strip():
+            if not markdown_text or not markdown_text.strip():
                 raise EmptyContentError(file_path=str(file_path))

-            # Create metadata
-            metadata = self._create_metadata(file_path)
+            # Create metadata with page count from Docling result
+            metadata = self._create_metadata(file_path, result)

             # Build document with raw_markdown
-            document = Document(raw_markdown=text, metadata=metadata)
+            document = Document(raw_markdown=markdown_text, metadata=metadata)

             logger.info(
-                f"Successfully extracted {len(text)} characters from {file_path.name}"
+                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
             )
             return document

@@ -130,89 +134,35 @@ class PDFExtractor(IExtractor):
         if file_path.stat().st_size == 0:
             raise EmptyContentError(file_path=str(file_path))

-    def _extract_text_from_pdf(self, file_path: Path) -> str:
+    def _create_metadata(self, file_path: Path, result) -> DocumentMetadata:
         """
-        Extract text from PDF using PyPDF2.
+        Create document metadata from PDF file and Docling result.

         Args:
-            file_path: Path to PDF file
+            file_path: Path to the PDF file
+            result: Docling conversion result

-        Returns:
-            Extracted text content
-
-        Raises:
-            ExtractionError: If PDF extraction fails
-        """
-        try:
-            import PyPDF2
-
-            logger.debug(f"Reading PDF: {file_path}")
-            text_parts = []
-
-            with open(file_path, 'rb') as pdf_file:
-                pdf_reader = PyPDF2.PdfReader(pdf_file)
-                num_pages = len(pdf_reader.pages)
-                logger.debug(f"PDF has {num_pages} pages")
-
-                for page_num, page in enumerate(pdf_reader.pages, start=1):
-                    page_text = self._extract_page_text(page, page_num)
-                    if page_text:
-                        text_parts.append(page_text)
-
-            return "\n\n".join(text_parts)
-
-        except ImportError:
-            raise ExtractionError(
-                message="PyPDF2 library not installed",
-                details="Install with: pip install PyPDF2",
-                file_path=str(file_path),
-            )
-        except Exception as e:
-            raise ExtractionError(
-                message=f"PDF extraction failed: {str(e)}",
-                file_path=str(file_path),
-            )
-
-    def _extract_page_text(self, page, page_num: int) -> str:
-        """
-        Extract text from a single page.
-
-        Args:
-            page: PyPDF2 page object
-            page_num: Page number for logging
-
-        Returns:
-            Extracted page text
-        """
-        try:
-            import PyPDF2
-
-            text = page.extract_text()
-            logger.debug(f"Extracted page {page_num}")
-            return text
-
-        except PyPDF2.errors.PdfReadError as e:
-            logger.warning(f"Failed to extract page {page_num}: {str(e)}")
-            return ""
-        except Exception as e:
-            logger.warning(f"Error on page {page_num}: {str(e)}")
-            return ""
-
-    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
-        """
-        Create source-neutral document metadata from file.
-
-        Args:
-            file_path: Path to the file
-
         Returns:
             DocumentMetadata entity
         """
         stat = file_path.stat()

+        # Extract page count from Docling result
+        page_count = None
+        try:
+            if hasattr(result.document, 'pages'):
+                page_count = len(result.document.pages)
+        except Exception as e:
+            logger.warning(f"Could not extract page count: {str(e)}")
+
+        extra_metadata = {}
+        if page_count is not None:
+            extra_metadata['page_count'] = str(page_count)
+
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.name,
+            display_name=file_path.stem,
             size_bytes=stat.st_size,
+            extra_metadata=extra_metadata,
         )

@@ -200,6 +200,6 @@ class TxtExtractor(IExtractor):
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.name,
+            display_name=file_path.stem,
             size_bytes=stat.st_size,
         )

@@ -227,7 +227,7 @@ class ZipExtractor(IExtractor):
                 continue

             # Skip files with 'nohf' in their name
-            if 'nohf' in filename.lower():
+            if 'nohf' not in filename.lower():
                 logger.debug(f"Skipping 'nohf' file: {filename}")
                 continue

@@ -312,6 +312,6 @@ class ZipExtractor(IExtractor):
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.name,
+            display_name=file_path.stem,
             size_bytes=stat.st_size,
         )

@@ -15,6 +15,7 @@ from .adapters.outgoing.chunkers.context import ChunkingContext
 from .adapters.outgoing.chunkers.fixed_size_chunker import FixedSizeChunker
 from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
 from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
+from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor
 from .adapters.outgoing.extractors.factory import ExtractorFactory
 from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
 from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor

@@ -118,6 +119,7 @@ class ApplicationContainer:
         # Register all extractors
         factory.register_extractor(PDFExtractor())
         factory.register_extractor(DocxExtractor())
+        factory.register_extractor(ExcelExtractor())
         factory.register_extractor(TxtExtractor())
         factory.register_extractor(MarkdownExtractor())
         factory.register_extractor(ZipExtractor())
@@ -14,6 +14,13 @@ class Settings(BaseSettings):
     S3_ENDPOINT_URL: Optional[str] = "https://cdn.d.aiengines.ir"
     S3_PRESIGNED_URL_EXPIRATION: int = 3600
     S3_UPLOAD_PATH_PREFIX: str = "extractions"
+
+    API_KEY: str = "some-secret-api-key"
+    API_KEY_NAME: str = "API-Key"
+
+    DOCS_USERNAME: str = "admin"
+    DOCS_PASSWORD: str = "admin"
+
     LOG_LEVEL: str = "INFO"

     model_config = SettingsConfigDict(
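The new settings ship with placeholder defaults, so deployments are expected to override them. A sketch of doing that through environment variables, assuming the usual pydantic-settings behaviour of BaseSettings and that get_settings() has not been called yet; the import path is inferred from the diff's package layout and may differ.

```python
# Sketch only: override the insecure defaults before the app reads its settings.
import os

os.environ["API_KEY"] = "a-long-random-secret"
os.environ["DOCS_USERNAME"] = "docs-admin"
os.environ["DOCS_PASSWORD"] = "another-long-random-secret"

from src.core.config import get_settings  # assumed import path

settings = get_settings()
assert settings.API_KEY != "some-secret-api-key"
```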
@@ -126,7 +126,7 @@ class DocumentSection(BaseModel):
         level: Header level (1-6 for h1-h6, 0 for Introduction)
         content: Section content with preserved Markdown formatting
     """
-    title: str = Field(..., min_length=1, description="Section title")
+    title: Optional[str] = Field(None, min_length=1, description="Section title")
     level: int = Field(..., ge=0, le=6, description="Header level (0=intro)")
     content: str = Field(..., description="Section content with formatting")

@@ -138,7 +138,9 @@ class DocumentSection(BaseModel):
     @classmethod
     def normalize_title(cls, value: str) -> str:
         """Normalize title by stripping whitespace."""
-        return value.strip()
+        if value:
+            return value.strip()
+        return value

     def is_introduction(self) -> bool:
         """Check if this is the introduction section."""

@@ -358,8 +360,6 @@ class Chunk(BaseModel):
         document_id: ID of the parent document
         content: Text content of the chunk
         sequence_number: Order of this chunk in the document
-        start_char: Starting character position in original document
-        end_char: Ending character position in original document
         section_title: Title of the section this chunk belongs to
         section_index: Index of the section in document.sections
         metadata: Optional metadata specific to this chunk

@@ -368,8 +368,6 @@ class Chunk(BaseModel):
     document_id: UUID = Field(..., description="Parent document ID")
     content: str = Field(..., min_length=1, description="Chunk text content")
     sequence_number: int = Field(..., ge=0, description="Chunk order in document")
-    start_char: int = Field(..., ge=0, description="Start position in document")
-    end_char: int = Field(..., gt=0, description="End position in document")
     section_title: Optional[str] = Field(None, description="Section title")
     section_index: Optional[int] = Field(None, ge=0, description="Section index")
     metadata: Dict[str, str] = Field(default_factory=dict)

@@ -378,27 +376,6 @@ class Chunk(BaseModel):
         "frozen": True,  # Chunks are immutable
     }

-    @model_validator(mode='after')
-    def validate_position_consistency(self) -> 'Chunk':
-        """Ensure end position is after start position."""
-        if self.end_char <= self.start_char:
-            raise ValueError(
-                f"end_char ({self.end_char}) must be greater than "
-                f"start_char ({self.start_char})"
-            )
-
-        # Validate content length matches position range
-        content_length = len(self.content)
-        position_range = self.end_char - self.start_char
-
-        if abs(content_length - position_range) > 10:  # Allow small variance
-            raise ValueError(
-                f"Content length ({content_length}) doesn't match "
-                f"position range ({position_range})"
-            )
-
-        return self
-
     def get_length(self) -> int:
         """Get the length of the chunk content."""
         return len(self.content)

@@ -50,40 +50,25 @@ def parse_markdown(text: str) -> List[DocumentSection]:
     sections: List[DocumentSection] = []
     current_heading: str | None = None
     current_level: int = 0
-    current_content_parts: List[str] = []
-
-    def finalize_section() -> None:
-        """Helper to finalize and append the current section."""
-        if current_heading is not None or current_content_parts:
-            content = "".join(current_content_parts).strip()
-            if content:  # Only add sections with actual content
-                title = current_heading if current_heading else "Introduction"
-                sections.append(
-                    DocumentSection(
-                        title=title,
-                        level=current_level,
-                        content=content,
-                    )
-                )

     # Walk through all children of the document
     for child in doc.children:
         if isinstance(child, Heading):
-            # Finalize previous section before starting new one
-            finalize_section()
-
-            # Start new section
+            # Update current heading context
             current_heading = _extract_heading_text(child)
             current_level = child.level
-            current_content_parts = []
         else:
             # Render content back to markdown format instead of HTML
             rendered = md_renderer.render(child).strip()
             if rendered:
-                current_content_parts.append(rendered + "\n\n")
-
-    # Finalize the last section
-    finalize_section()
+                # Create a separate section for each paragraph/block
+                sections.append(
+                    DocumentSection(
+                        title=current_heading,
+                        level=current_level,
+                        content=rendered,
+                    )
+                )

     return sections