Compare commits: 91f8035043 ... 80dd901e42

8 Commits

| SHA1 |
|---|
| 80dd901e42 |
| 9e1e49bc59 |
| cda128e438 |
| 8ecbd88498 |
| 3aad734140 |
| c6302bc792 |
| 2ccb38179d |
| ad163eb665 |
@@ -11,8 +11,7 @@ uvicorn[standard]==0.34.0
 python-multipart==0.0.20

 # Document Processing - Extractors
-PyPDF2==3.0.1  # PDF extraction
-python-docx==1.1.2  # DOCX extraction
+docling  # Unified document extraction (PDF, DOCX, Excel)

 # Cloud Storage
 boto3==1.35.94  # AWS S3 integration
@@ -18,7 +18,11 @@ from pathlib import Path
 from typing import Iterator, List, Optional

 from fastapi import APIRouter, Depends, FastAPI, File, Form, HTTPException, UploadFile, status
+from fastapi.openapi.docs import get_swagger_ui_html, get_redoc_html
 from fastapi.responses import JSONResponse
+from fastapi.security import HTTPBasicCredentials

+from .auth import check_docs_credentials, validate_api_key
+
 from ...core.config import get_settings
 from ...core.domain.exceptions import (

@@ -41,11 +45,6 @@ from .api_schemas import (

 logger = logging.getLogger(__name__)

-
-# =============================================================================
-# Application Setup
-# =============================================================================
-
 # Load settings
 settings = get_settings()

@@ -53,12 +52,19 @@ app = FastAPI(
     title="Text Processor API",
     description="Text extraction and chunking system using Hexagonal Architecture",
     version="1.0.0",
-    docs_url="/docs",
-    redoc_url="/redoc",
+    # docs_url=None,
+    # redoc_url=None,
 )

-router = APIRouter(prefix="/api/v1", tags=["Text Processing"])
+router = APIRouter(
+    prefix="/api/v1",
+    tags=["Text Processing"],
+    dependencies=[Depends(validate_api_key)]
+)

+public_router = APIRouter(
+    tags=["System"],
+)

 # =============================================================================
 # Global Exception Handler

@@ -101,7 +107,7 @@ def get_service() -> ITextProcessor:

 def get_chunking_strategy(
     strategy_name: ChunkingMethod = Form(..., description="Chunking method"),
-    chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
+    chunk_size: int = Form(512, description="Target chunk size in characters", ge=1, le=10000),
     overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
     respect_boundaries: bool = Form(True, description="Respect text boundaries"),
 ) -> ChunkingStrategy:

@@ -210,8 +216,6 @@ def to_chunk_responses(chunks: List[Chunk]) -> List[ChunkResponse]:
             document_id=str(chunk.document_id),
             content=chunk.content,
             sequence_number=chunk.sequence_number,
-            start_char=chunk.start_char,
-            end_char=chunk.end_char,
             length=chunk.get_length(),
         )
         for chunk in chunks

@@ -231,8 +235,8 @@ def to_chunk_responses(chunks: List[Chunk]) -> List[ChunkResponse]:
 )
 async def perform_chunking(
     file: Optional[UploadFile] = File(None, description="Markdown file (.md) to upload"),
-    text: Optional[str] = Form(None, description="Markdown text to process", json_schema_extra={"x-textarea": True}),
-    title: str = Form("markdown_input", description="Optional title for the document"),
+    text: Optional[str] = Form('', description="Markdown text to process"),
+    title: Optional[str] = Form('', description="Optional title for the document"),
     strategy: ChunkingStrategy = Depends(get_chunking_strategy),
     service: ITextProcessor = Depends(get_service),
 ) -> ChunkListResponse:

@@ -339,7 +343,7 @@ async def process_file(
 )


-@router.get(
+@public_router.get(
     "/health",
     response_model=HealthCheckResponse,
     status_code=status.HTTP_200_OK,

@@ -356,21 +360,29 @@ async def health_check() -> HealthCheckResponse:
     )


+# =============================================================================
+# Protected Documentation Routes
+# =============================================================================
+
+# @app.get("/docs", include_in_schema=False)
+# def api_docs(_: HTTPBasicCredentials = Depends(check_docs_credentials)):
+#     return get_swagger_ui_html(
+#         openapi_url="/openapi.json",
+#         title="Protected Text-Processor API Docs"
+#     )
+#
+#
+# @app.get("/redoc", include_in_schema=False)
+# def api_docs(_: HTTPBasicCredentials = Depends(check_docs_credentials)):
+#     return get_redoc_html(
+#         openapi_url="/openapi.json",
+#         title="Protected Text-Processor API Docs"
+#     )

 # =============================================================================
 # Application Setup
 # =============================================================================

-# Include router in app
+# Include routers in app
 app.include_router(router)
-
-@app.get("/")
-async def root():
-    """Root endpoint with API information."""
-    return {
-        "name": "Text Processor API",
-        "version": "1.0.0",
-        "description": "Text extraction and chunking system using Hexagonal Architecture",
-        "docs_url": "/docs",
-        "api_prefix": "/api/v1",
-    }
+app.include_router(public_router)

@@ -101,8 +101,6 @@ class ChunkResponse(BaseModel):
     document_id: str
     content: str
     sequence_number: int
-    start_char: int
-    end_char: int
     length: int

src/adapters/incoming/auth.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+import secrets
+from fastapi import Depends, HTTPException, Security, status
+from fastapi.security import APIKeyHeader, HTTPBasic, HTTPBasicCredentials
+from ...core.config import get_settings
+
+settings = get_settings()
+# This allows Swagger UI to detect the "Authorize" button
+api_key_header = APIKeyHeader(name=settings.API_KEY_NAME, auto_error=False)
+http_basic = HTTPBasic()
+
+async def validate_api_key(api_key: str = Security(api_key_header)):
+    """
+    Validates the X-API-Key header.
+    Using secrets.compare_digest protects against timing attacks.
+    """
+    if not api_key or not secrets.compare_digest(api_key, settings.API_KEY):
+        raise HTTPException(
+            status_code=status.HTTP_403_FORBIDDEN,
+            detail="Could not validate credentials. Invalid or missing API Key.",
+        )
+    return api_key
+
+
+security = HTTPBasic()
+
+def check_docs_credentials(credentials: HTTPBasicCredentials = Depends(security)):
+    is_correct_user = secrets.compare_digest(credentials.username, settings.DOCS_USERNAME)
+    is_correct_password = secrets.compare_digest(credentials.password, settings.DOCS_PASSWORD)
+
+    if not (is_correct_user and is_correct_password):
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED,
+            headers={"WWW-Authenticate": "Basic"},
+        )
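For orientation, here is a minimal client-side sketch of calling the API once these auth changes land. Everything not shown in the diff is an assumption: the base URL, the route path under /api/v1, and the example key value; the header name follows the API_KEY_NAME default ("API-Key"), and the form fields mirror the perform_chunking parameters above.

```python
# Hypothetical usage sketch; the exact chunking route path is not visible in this diff.
import requests

API_BASE = "http://localhost:8000"   # assumption: local dev server
API_KEY = "some-secret-api-key"      # must match settings.API_KEY on the server

response = requests.post(
    f"{API_BASE}/api/v1/chunk",                   # hypothetical path on the protected router
    headers={"API-Key": API_KEY},                 # header name from settings.API_KEY_NAME
    data={
        "text": "# Title\n\nSome markdown to chunk.",
        "strategy_name": "fixed_size",            # assumption: one of the ChunkingMethod values
        "chunk_size": 512,
        "overlap_size": 0,
        "respect_boundaries": "true",
    },
)
response.raise_for_status()
print(response.json())
```

A missing or wrong key should come back as 403 from validate_api_key, while /health stays reachable without a key because it is registered on public_router.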
@@ -70,8 +70,8 @@ class FixedSizeChunker(IChunker):
             chunks = self._chunk_by_sections(document, strategy)
         else:
             # Standard chunking: process entire raw_markdown
-            segments = self._split_into_segments(document.raw_markdown, strategy)
-            chunks = self._create_chunks(segments, document.id)
+            chunk_texts = self._split_into_segments(document.raw_markdown, strategy)
+            chunks = self._create_chunks(chunk_texts, document.id)

         logger.info(f"Created {len(chunks)} fixed-size chunks")
         return chunks

@@ -136,7 +136,7 @@ class FixedSizeChunker(IChunker):
         self,
         text: str,
         strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
         """
         Split text into fixed-size segments.

@@ -145,7 +145,7 @@ class FixedSizeChunker(IChunker):
             strategy: Chunking strategy configuration

         Returns:
-            List of (chunk_text, start_position, end_position) tuples
+            List of chunk text strings
         """
         segments = []
         text_length = len(text)

@@ -155,7 +155,7 @@ class FixedSizeChunker(IChunker):
         position = 0

         while position < text_length:
-            segment = self._extract_segment(
+            chunk_text = self._extract_segment(
                 text=text,
                 position=position,
                 chunk_size=chunk_size,

@@ -163,10 +163,8 @@ class FixedSizeChunker(IChunker):
                 respect_boundaries=strategy.respect_boundaries,
             )

-            if segment:
-                chunk_text, start_pos, end_pos = segment
-                if chunk_text.strip():
-                    segments.append((chunk_text, start_pos, end_pos))
+            if chunk_text and chunk_text.strip():
+                segments.append(chunk_text)

             position += step_size

@@ -183,7 +181,7 @@ class FixedSizeChunker(IChunker):
         chunk_size: int,
         text_length: int,
         respect_boundaries: bool,
-    ) -> tuple[str, int, int] | None:
+    ) -> str:
         """
         Extract a single segment from text.

@@ -195,16 +193,15 @@ class FixedSizeChunker(IChunker):
             respect_boundaries: Whether to respect boundaries

         Returns:
-            Tuple of (chunk_text, start_pos, end_pos) or None
+            Chunk text string
         """
         end_pos = min(position + chunk_size, text_length)
         chunk_text = text[position:end_pos]

         if respect_boundaries and end_pos < text_length:
             chunk_text = self._adjust_to_boundary(text, position, end_pos)
-            end_pos = position + len(chunk_text)

-        return (chunk_text, position, end_pos)
+        return chunk_text

     def _adjust_to_boundary(
         self,

@@ -258,17 +255,15 @@ class FixedSizeChunker(IChunker):
         global_sequence = 0

         for section_index, section in enumerate(document.sections):
-            # Split this section's content into segments
-            segments = self._split_into_segments(section.content, strategy)
+            # Split this section's content into chunks
+            chunk_texts = self._split_into_segments(section.content, strategy)

             # Create chunks for this section
-            for text, start_char, end_char in segments:
+            for text in chunk_texts:
                 chunk = Chunk(
                     document_id=document.id,
                     content=text,
                     sequence_number=global_sequence,
-                    start_char=start_char,
-                    end_char=end_char,
                     section_title=section.title,
                     section_index=section_index,
                 )

@@ -282,16 +277,16 @@ class FixedSizeChunker(IChunker):

     def _create_chunks(
         self,
-        segments: List[tuple[str, int, int]],
+        chunk_texts: List[str],
         document_id,
         section_title: Optional[str] = None,
         section_index: Optional[int] = None,
     ) -> List[Chunk]:
         """
-        Create Chunk entities from text segments.
+        Create Chunk entities from text strings.

         Args:
-            segments: List of (text, start_pos, end_pos) tuples
+            chunk_texts: List of chunk text strings
             document_id: ID of parent document
             section_title: Optional section title
             section_index: Optional section index

@@ -301,13 +296,11 @@ class FixedSizeChunker(IChunker):
         """
         chunks = []

-        for sequence_number, (text, start_char, end_char) in enumerate(segments):
+        for sequence_number, text in enumerate(chunk_texts):
             chunk = Chunk(
                 document_id=document_id,
                 content=text,
                 sequence_number=sequence_number,
-                start_char=start_char,
-                end_char=end_char,
                 section_title=section_title,
                 section_index=section_index,
             )
@@ -70,8 +70,8 @@ class ParagraphChunker(IChunker):
             chunks = self._chunk_by_sections(document, strategy)
         else:
             # Standard chunking: process entire raw_markdown
-            segments = self._split_and_group_paragraphs(document.raw_markdown, strategy)
-            chunks = self._create_chunks(segments, document.id)
+            chunk_texts = self._split_and_group_paragraphs(document.raw_markdown, strategy)
+            chunks = self._create_chunks(chunk_texts, document.id)

         logger.info(f"Created {len(chunks)} paragraph-based chunks")
         return chunks

@@ -136,7 +136,7 @@ class ParagraphChunker(IChunker):
         self,
         text: str,
         strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
         """
         Split text into paragraphs and group them into chunks.

@@ -145,14 +145,14 @@ class ParagraphChunker(IChunker):
             strategy: Chunking strategy configuration

         Returns:
-            List of (chunk_text, start_position, end_position) tuples
+            List of chunk text strings
         """
         # Split into paragraphs
         paragraphs = logic_utils.split_into_paragraphs(text)

         if not paragraphs:
             # No paragraphs found, return whole text as single chunk
-            return [(text, 0, len(text))]
+            return [text]

         # Group paragraphs into chunks
         return self._group_paragraphs(paragraphs, strategy)

@@ -161,7 +161,7 @@ class ParagraphChunker(IChunker):
         self,
         paragraphs: List[str],
         strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
         """
         Group paragraphs into chunks based on target size.

@@ -170,12 +170,11 @@ class ParagraphChunker(IChunker):
             strategy: Chunking strategy

         Returns:
-            List of (chunk_text, start_pos, end_pos) tuples
+            List of chunk text strings
         """
         segments = []
         current_paragraphs = []
         current_size = 0
-        current_start = 0

         for paragraph in paragraphs:
             para_size = len(paragraph)

@@ -185,13 +184,11 @@ class ParagraphChunker(IChunker):
                 current_size, para_size, strategy.chunk_size, current_paragraphs
             ):
                 # Create chunk from accumulated paragraphs
-                segment = self._create_segment(
-                    current_paragraphs, current_start
-                )
+                segment = self._create_segment(current_paragraphs)
                 segments.append(segment)

                 # Handle overlap
-                current_paragraphs, current_start, current_size = (
+                current_paragraphs, current_size = (
                     self._handle_overlap(
                         segment, paragraph, para_size, strategy.overlap_size
                     )

@@ -203,7 +200,7 @@ class ParagraphChunker(IChunker):

         # Add final chunk
         if current_paragraphs:
-            segment = self._create_segment(current_paragraphs, current_start)
+            segment = self._create_segment(current_paragraphs)
             segments.append(segment)

         logger.debug(

@@ -237,56 +234,49 @@ class ParagraphChunker(IChunker):
     def _create_segment(
         self,
         paragraphs: List[str],
-        start_pos: int,
-    ) -> tuple[str, int, int]:
+    ) -> str:
         """
         Create a segment from paragraphs.

         Args:
             paragraphs: List of paragraph strings
-            start_pos: Starting position

         Returns:
-            Tuple of (chunk_text, start_pos, end_pos)
+            Chunk text string
         """
-        chunk_text = "\n\n".join(paragraphs)
-        end_pos = start_pos + len(chunk_text)
-        return (chunk_text, start_pos, end_pos)
+        return "\n\n".join(paragraphs)

     def _handle_overlap(
         self,
-        previous_segment: tuple[str, int, int],
+        previous_segment: str,
         new_paragraph: str,
         new_para_size: int,
         overlap_size: int,
-    ) -> tuple[List[str], int, int]:
+    ) -> tuple[List[str], int]:
        """
        Handle overlap between chunks.

        Args:
-            previous_segment: Previous chunk segment
+            previous_segment: Previous chunk text
            new_paragraph: New paragraph to start with
            new_para_size: Size of new paragraph
            overlap_size: Desired overlap size

        Returns:
-            Tuple of (new_paragraphs, new_start, new_size)
+            Tuple of (new_paragraphs, new_size)
        """
        if overlap_size > 0:
-            prev_text, _, prev_end = previous_segment
            overlap_text = logic_utils.calculate_overlap_text(
-                text=prev_text,
+                text=previous_segment,
                overlap_size=overlap_size,
                from_start=False,
            )
            return (
                [overlap_text, new_paragraph],
-                prev_end - len(overlap_text),
                len(overlap_text) + new_para_size,
            )
        else:
-            _, _, prev_end = previous_segment
-            return ([new_paragraph], prev_end, new_para_size)
+            return ([new_paragraph], new_para_size)

     def _chunk_by_sections(
         self,

@@ -297,6 +287,7 @@ class ParagraphChunker(IChunker):
         Chunk document by processing each section independently.

         This prevents chunks from spanning across section boundaries.
+        Each chunk is prefixed with the document title and section title.

         Args:
             document: Document with sections

@@ -308,18 +299,22 @@ class ParagraphChunker(IChunker):
         all_chunks = []
         global_sequence = 0

-        for section_index, section in enumerate(document.sections):
-            # Split this section's content into paragraph-based segments
-            segments = self._split_and_group_paragraphs(section.content, strategy)
+        # Get document title from metadata
+        document_title = document.metadata.display_name

-            # Create chunks for this section
-            for text, start_char, end_char in segments:
+        for section_index, section in enumerate(document.sections):
+            # Split this section's content into paragraph-based chunks
+            chunk_texts = self._split_and_group_paragraphs(section.content, strategy)
+
+            # Create chunks for this section with title prefix
+            for text in chunk_texts:
+                # Prepend document title and section title to chunk content
+                prefixed_content = f"{document_title}\n{section.title}\n{text}"
+
                 chunk = Chunk(
                     document_id=document.id,
-                    content=text,
+                    content=prefixed_content,
                     sequence_number=global_sequence,
-                    start_char=start_char,
-                    end_char=end_char,
                     section_title=section.title,
                     section_index=section_index,
                 )
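A quick illustration of the new title prefixing (the values below are invented): each section-scoped chunk now starts with the document's display name and the section title, followed by the chunk text.

```python
# Illustration only; names and text are made up for the example.
document_title = "user_guide"      # document.metadata.display_name
section_title = "Installation"     # section.title
text = "Install the package and run the server."

prefixed_content = f"{document_title}\n{section_title}\n{text}"
print(prefixed_content)
# user_guide
# Installation
# Install the package and run the server.
```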
@@ -333,16 +328,16 @@ class ParagraphChunker(IChunker):

     def _create_chunks(
         self,
-        segments: List[tuple[str, int, int]],
+        chunk_texts: List[str],
         document_id,
         section_title: Optional[str] = None,
         section_index: Optional[int] = None,
     ) -> List[Chunk]:
         """
-        Create Chunk entities from text segments.
+        Create Chunk entities from text strings.

         Args:
-            segments: List of (text, start_pos, end_pos) tuples
+            chunk_texts: List of chunk text strings
             document_id: ID of parent document
             section_title: Optional section title
             section_index: Optional section index

@@ -352,13 +347,11 @@ class ParagraphChunker(IChunker):
         """
         chunks = []

-        for sequence_number, (text, start_char, end_char) in enumerate(segments):
+        for sequence_number, text in enumerate(chunk_texts):
             chunk = Chunk(
                 document_id=document_id,
                 content=text,
                 sequence_number=sequence_number,
-                start_char=start_char,
-                end_char=end_char,
                 section_title=section_title,
                 section_index=section_index,
             )
@@ -1,13 +1,15 @@
 """
 DOCX Extractor - Concrete implementation for Word document extraction.

-This adapter implements the IExtractor port using python-docx library.
-It maps python-docx exceptions to domain exceptions.
+This adapter implements the IExtractor port using Docling library.
+It maps Docling exceptions to domain exceptions.
 """
 import logging
 from pathlib import Path
 from typing import List

+from docling.document_converter import DocumentConverter
+
 from ....core.domain.exceptions import (
     EmptyContentError,
     ExtractionError,

@@ -21,22 +23,23 @@ logger = logging.getLogger(__name__)

 class DocxExtractor(IExtractor):
     """
-    Concrete DOCX extractor using python-docx.
+    Concrete DOCX extractor using Docling.

     This adapter:
-    1. Extracts text from DOCX files using python-docx
-    2. Handles paragraphs and tables
-    3. Maps exceptions to domain exceptions
+    1. Extracts text from DOCX files using Docling's DocumentConverter
+    2. Converts DOCX to Markdown format
+    3. Extracts metadata from document
     """

     def __init__(self) -> None:
-        """Initialize DOCX extractor."""
+        """Initialize DOCX extractor with Docling converter."""
         self._supported_extensions = ['docx']
-        logger.debug("DocxExtractor initialized")
+        self._converter = DocumentConverter()
+        logger.debug("DocxExtractor initialized with Docling")

     def extract(self, file_path: Path) -> Document:
         """
-        Extract text and metadata from DOCX file.
+        Extract text and metadata from DOCX file using Docling.

         Args:
             file_path: Path to the DOCX file

@@ -54,21 +57,22 @@ class DocxExtractor(IExtractor):
             # Validate file
             self._validate_file(file_path)

-            # Extract text
-            text = self._extract_text_from_docx(file_path)
+            # Convert DOCX to markdown using Docling
+            result = self._converter.convert(str(file_path))
+            markdown_text = result.document.export_to_markdown()

             # Validate content
-            if not text or not text.strip():
+            if not markdown_text or not markdown_text.strip():
                 raise EmptyContentError(file_path=str(file_path))

             # Create metadata
             metadata = self._create_metadata(file_path)

             # Build document with raw_markdown
-            document = Document(raw_markdown=text, metadata=metadata)
+            document = Document(raw_markdown=markdown_text, metadata=metadata)

             logger.info(
-                f"Successfully extracted {len(text)} characters from {file_path.name}"
+                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
             )
             return document

@@ -130,83 +134,6 @@ class DocxExtractor(IExtractor):
         if file_path.stat().st_size == 0:
             raise EmptyContentError(file_path=str(file_path))

-    def _extract_text_from_docx(self, file_path: Path) -> str:
-        """
-        Extract text from DOCX using python-docx.
-
-        Args:
-            file_path: Path to DOCX file
-
-        Returns:
-            Extracted text content
-
-        Raises:
-            ExtractionError: If DOCX extraction fails
-        """
-        try:
-            import docx
-
-            logger.debug(f"Reading DOCX: {file_path}")
-            document = docx.Document(file_path)
-
-            # Extract paragraphs
-            text_parts = self._extract_paragraphs(document)
-
-            # Extract tables
-            table_text = self._extract_tables(document)
-            if table_text:
-                text_parts.extend(table_text)
-
-            return "\n".join(text_parts)
-
-        except ImportError:
-            raise ExtractionError(
-                message="python-docx library not installed",
-                details="Install with: pip install python-docx",
-                file_path=str(file_path),
-            )
-        except Exception as e:
-            raise ExtractionError(
-                message=f"DOCX extraction failed: {str(e)}",
-                file_path=str(file_path),
-            )
-
-    def _extract_paragraphs(self, document) -> List[str]:
-        """
-        Extract text from all paragraphs.
-
-        Args:
-            document: python-docx Document object
-
-        Returns:
-            List of paragraph texts
-        """
-        paragraphs = []
-        for paragraph in document.paragraphs:
-            text = paragraph.text.strip()
-            if text:
-                paragraphs.append(text)
-        return paragraphs
-
-    def _extract_tables(self, document) -> List[str]:
-        """
-        Extract text from all tables.
-
-        Args:
-            document: python-docx Document object
-
-        Returns:
-            List of table cell texts
-        """
-        table_texts = []
-        for table in document.tables:
-            for row in table.rows:
-                for cell in row.cells:
-                    text = cell.text.strip()
-                    if text:
-                        table_texts.append(text)
-        return table_texts
-
     def _create_metadata(self, file_path: Path) -> DocumentMetadata:
         """
         Create source-neutral document metadata from file.

@@ -222,6 +149,6 @@ class DocxExtractor(IExtractor):
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.name,
+            display_name=file_path.stem,
             size_bytes=stat.st_size,
         )
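All of the Docling-based extractors in this comparison follow the same convert-then-export pattern. A stand-alone sketch of that call sequence; the file name is a placeholder, and the two calls shown are the same ones the PDF, DOCX, and Excel adapters use internally:

```python
# Minimal sketch of the Docling calls used by the new extractors.
from docling.document_converter import DocumentConverter

converter = DocumentConverter()                        # built once and reused, as in the extractors' __init__
result = converter.convert("report.docx")              # placeholder path; PDF and Excel files work the same way
markdown_text = result.document.export_to_markdown()

print(markdown_text[:200])                             # first 200 characters of the Markdown output
```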
src/adapters/outgoing/extractors/excel_extractor.py (new file, 154 lines)
@@ -0,0 +1,154 @@
+"""
+Excel Extractor - Concrete implementation for Excel file extraction.
+
+This adapter implements the IExtractor port using Docling library.
+It maps Docling exceptions to domain exceptions.
+"""
+import logging
+from pathlib import Path
+from typing import List
+
+from docling.document_converter import DocumentConverter
+
+from ....core.domain.exceptions import (
+    EmptyContentError,
+    ExtractionError,
+)
+from ....core.domain.models import Document, DocumentMetadata, SourceType
+from ....core.ports.outgoing.extractor import IExtractor
+
+
+logger = logging.getLogger(__name__)
+
+
+class ExcelExtractor(IExtractor):
+    """
+    Concrete Excel extractor using Docling.
+
+    This adapter:
+    1. Extracts text from Excel files (.xlsx, .xls) using Docling's DocumentConverter
+    2. Converts Excel to Markdown format
+    3. Extracts metadata from spreadsheet
+    """
+
+    def __init__(self) -> None:
+        """Initialize Excel extractor with Docling converter."""
+        self._supported_extensions = ['xlsx', 'xls']
+        self._converter = DocumentConverter()
+        logger.debug("ExcelExtractor initialized with Docling")
+
+    def extract(self, file_path: Path) -> Document:
+        """
+        Extract text and metadata from Excel file using Docling.
+
+        Args:
+            file_path: Path to the Excel file
+
+        Returns:
+            Document entity with extracted content and metadata
+
+        Raises:
+            ExtractionError: If extraction fails
+            EmptyContentError: If no text could be extracted
+        """
+        try:
+            logger.info(f"Extracting text from Excel: {file_path}")
+
+            # Validate file
+            self._validate_file(file_path)
+
+            # Convert Excel to markdown using Docling
+            result = self._converter.convert(str(file_path))
+            markdown_text = result.document.export_to_markdown()
+
+            # Validate content
+            if not markdown_text or not markdown_text.strip():
+                raise EmptyContentError(file_path=str(file_path))
+
+            # Create metadata
+            metadata = self._create_metadata(file_path)
+
+            # Build document with raw_markdown
+            document = Document(raw_markdown=markdown_text, metadata=metadata)
+
+            logger.info(
+                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
+            )
+            return document
+
+        except EmptyContentError:
+            raise
+        except ExtractionError:
+            raise
+        except Exception as e:
+            logger.error(f"Excel extraction failed for {file_path}: {str(e)}")
+            raise ExtractionError(
+                message=f"Failed to extract text from {file_path.name}",
+                details=str(e),
+                file_path=str(file_path),
+            )
+
+    def supports_file_type(self, file_extension: str) -> bool:
+        """
+        Check if this extractor supports Excel files.
+
+        Args:
+            file_extension: File extension (e.g., 'xlsx', 'xls')
+
+        Returns:
+            True if Excel files are supported
+        """
+        return file_extension.lower() in self._supported_extensions
+
+    def get_supported_types(self) -> List[str]:
+        """
+        Get list of supported file extensions.
+
+        Returns:
+            List containing 'xlsx' and 'xls'
+        """
+        return self._supported_extensions.copy()
+
+    def _validate_file(self, file_path: Path) -> None:
+        """
+        Validate file exists and is readable.
+
+        Args:
+            file_path: Path to validate
+
+        Raises:
+            ExtractionError: If file is invalid
+        """
+        if not file_path.exists():
+            raise ExtractionError(
+                message=f"File not found: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if not file_path.is_file():
+            raise ExtractionError(
+                message=f"Path is not a file: {file_path}",
+                file_path=str(file_path),
+            )
+
+        if file_path.stat().st_size == 0:
+            raise EmptyContentError(file_path=str(file_path))
+
+    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
+        """
+        Create document metadata from Excel file.
+
+        Args:
+            file_path: Path to the Excel file
+
+        Returns:
+            DocumentMetadata entity
+        """
+        stat = file_path.stat()
+
+        return DocumentMetadata(
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            display_name=file_path.stem,
+            size_bytes=stat.st_size,
+        )
@@ -181,6 +181,6 @@ class MarkdownExtractor(IExtractor):
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.name,
+            display_name=file_path.stem,
             size_bytes=stat.st_size,
         )

@@ -1,13 +1,15 @@
 """
 PDF Extractor - Concrete implementation for PDF text extraction.

-This adapter implements the IExtractor port using PyPDF2 library.
-It maps PyPDF2 exceptions to domain exceptions.
+This adapter implements the IExtractor port using Docling library.
+It maps Docling exceptions to domain exceptions.
 """
 import logging
 from pathlib import Path
 from typing import List

+from docling.document_converter import DocumentConverter
+
 from ....core.domain.exceptions import (
     EmptyContentError,
     ExtractionError,

@@ -21,22 +23,23 @@ logger = logging.getLogger(__name__)

 class PDFExtractor(IExtractor):
     """
-    Concrete PDF extractor using PyPDF2.
+    Concrete PDF extractor using Docling.

     This adapter:
-    1. Extracts text from PDF files using PyPDF2
-    2. Maps PyPDF2 exceptions to domain exceptions
-    3. Creates Document entities with metadata
+    1. Extracts text from PDF files using Docling's DocumentConverter
+    2. Converts PDF to Markdown format
+    3. Extracts metadata including page count
     """

     def __init__(self) -> None:
-        """Initialize PDF extractor."""
+        """Initialize PDF extractor with Docling converter."""
         self._supported_extensions = ['pdf']
-        logger.debug("PDFExtractor initialized")
+        self._converter = DocumentConverter()
+        logger.debug("PDFExtractor initialized with Docling")

     def extract(self, file_path: Path) -> Document:
         """
-        Extract text and metadata from PDF file.
+        Extract text and metadata from PDF file using Docling.

         Args:
             file_path: Path to the PDF file

@@ -54,21 +57,22 @@ class PDFExtractor(IExtractor):
             # Validate file
             self._validate_file(file_path)

-            # Extract text
-            text = self._extract_text_from_pdf(file_path)
+            # Convert PDF to markdown using Docling
+            result = self._converter.convert(str(file_path))
+            markdown_text = result.document.export_to_markdown()

             # Validate content
-            if not text or not text.strip():
+            if not markdown_text or not markdown_text.strip():
                 raise EmptyContentError(file_path=str(file_path))

-            # Create metadata
-            metadata = self._create_metadata(file_path)
+            # Create metadata with page count from Docling result
+            metadata = self._create_metadata(file_path, result)

             # Build document with raw_markdown
-            document = Document(raw_markdown=text, metadata=metadata)
+            document = Document(raw_markdown=markdown_text, metadata=metadata)

             logger.info(
-                f"Successfully extracted {len(text)} characters from {file_path.name}"
+                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
             )
             return document

@@ -130,89 +134,35 @@ class PDFExtractor(IExtractor):
         if file_path.stat().st_size == 0:
             raise EmptyContentError(file_path=str(file_path))

-    def _extract_text_from_pdf(self, file_path: Path) -> str:
+    def _create_metadata(self, file_path: Path, result) -> DocumentMetadata:
         """
-        Extract text from PDF using PyPDF2.
+        Create document metadata from PDF file and Docling result.

         Args:
-            file_path: Path to PDF file
+            file_path: Path to the PDF file
+            result: Docling conversion result

-        Returns:
-            Extracted text content
-
-        Raises:
-            ExtractionError: If PDF extraction fails
-        """
-        try:
-            import PyPDF2
-
-            logger.debug(f"Reading PDF: {file_path}")
-            text_parts = []
-
-            with open(file_path, 'rb') as pdf_file:
-                pdf_reader = PyPDF2.PdfReader(pdf_file)
-                num_pages = len(pdf_reader.pages)
-                logger.debug(f"PDF has {num_pages} pages")
-
-                for page_num, page in enumerate(pdf_reader.pages, start=1):
-                    page_text = self._extract_page_text(page, page_num)
-                    if page_text:
-                        text_parts.append(page_text)
-
-            return "\n\n".join(text_parts)
-
-        except ImportError:
-            raise ExtractionError(
-                message="PyPDF2 library not installed",
-                details="Install with: pip install PyPDF2",
-                file_path=str(file_path),
-            )
-        except Exception as e:
-            raise ExtractionError(
-                message=f"PDF extraction failed: {str(e)}",
-                file_path=str(file_path),
-            )
-
-    def _extract_page_text(self, page, page_num: int) -> str:
-        """
-        Extract text from a single page.
-
-        Args:
-            page: PyPDF2 page object
-            page_num: Page number for logging
-
-        Returns:
-            Extracted page text
-        """
-        try:
-            import PyPDF2
-
-            text = page.extract_text()
-            logger.debug(f"Extracted page {page_num}")
-            return text
-
-        except PyPDF2.errors.PdfReadError as e:
-            logger.warning(f"Failed to extract page {page_num}: {str(e)}")
-            return ""
-        except Exception as e:
-            logger.warning(f"Error on page {page_num}: {str(e)}")
-            return ""
-
-    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
-        """
-        Create source-neutral document metadata from file.
-
-        Args:
-            file_path: Path to the file
-
         Returns:
             DocumentMetadata entity
         """
         stat = file_path.stat()

+        # Extract page count from Docling result
+        page_count = None
+        try:
+            if hasattr(result.document, 'pages'):
+                page_count = len(result.document.pages)
+        except Exception as e:
+            logger.warning(f"Could not extract page count: {str(e)}")
+
+        extra_metadata = {}
+        if page_count is not None:
+            extra_metadata['page_count'] = str(page_count)
+
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.name,
+            display_name=file_path.stem,
             size_bytes=stat.st_size,
+            extra_metadata=extra_metadata,
         )

@@ -200,6 +200,6 @@ class TxtExtractor(IExtractor):
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.name,
+            display_name=file_path.stem,
             size_bytes=stat.st_size,
         )

@@ -227,7 +227,7 @@ class ZipExtractor(IExtractor):
                 continue

             # Skip files with 'nohf' in their name
-            if 'nohf' in filename.lower():
+            if 'nohf' not in filename.lower():
                 logger.debug(f"Skipping 'nohf' file: {filename}")
                 continue

@@ -312,6 +312,6 @@ class ZipExtractor(IExtractor):
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.name,
+            display_name=file_path.stem,
             size_bytes=stat.st_size,
         )

@@ -15,6 +15,7 @@ from .adapters.outgoing.chunkers.context import ChunkingContext
 from .adapters.outgoing.chunkers.fixed_size_chunker import FixedSizeChunker
 from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
 from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
+from .adapters.outgoing.extractors.excel_extractor import ExcelExtractor
 from .adapters.outgoing.extractors.factory import ExtractorFactory
 from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
 from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor

@@ -118,6 +119,7 @@ class ApplicationContainer:
         # Register all extractors
         factory.register_extractor(PDFExtractor())
         factory.register_extractor(DocxExtractor())
+        factory.register_extractor(ExcelExtractor())
         factory.register_extractor(TxtExtractor())
         factory.register_extractor(MarkdownExtractor())
         factory.register_extractor(ZipExtractor())
@@ -14,6 +14,13 @@ class Settings(BaseSettings):
     S3_ENDPOINT_URL: Optional[str] = "https://cdn.d.aiengines.ir"
     S3_PRESIGNED_URL_EXPIRATION: int = 3600
     S3_UPLOAD_PATH_PREFIX: str = "extractions"
+
+    API_KEY: str = "some-secret-api-key"
+    API_KEY_NAME: str = "API-Key"
+
+    DOCS_USERNAME: str = "admin"
+    DOCS_PASSWORD: str = "admin"
+
     LOG_LEVEL: str = "INFO"

     model_config = SettingsConfigDict(
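The new settings ship with placeholder defaults, so deployments are expected to override them. A sketch of doing that through environment variables, assuming the usual pydantic-settings behaviour of BaseSettings and that get_settings() has not been called yet; the import path is inferred from the diff's package layout and may differ.

```python
# Sketch only: override the insecure defaults before the app reads its settings.
import os

os.environ["API_KEY"] = "a-long-random-secret"
os.environ["DOCS_USERNAME"] = "docs-admin"
os.environ["DOCS_PASSWORD"] = "another-long-random-secret"

from src.core.config import get_settings  # assumed import path

settings = get_settings()
assert settings.API_KEY != "some-secret-api-key"
```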
@@ -126,7 +126,7 @@ class DocumentSection(BaseModel):
         level: Header level (1-6 for h1-h6, 0 for Introduction)
         content: Section content with preserved Markdown formatting
     """
-    title: str = Field(..., min_length=1, description="Section title")
+    title: Optional[str] = Field(None, min_length=1, description="Section title")
     level: int = Field(..., ge=0, le=6, description="Header level (0=intro)")
     content: str = Field(..., description="Section content with formatting")

@@ -138,7 +138,9 @@ class DocumentSection(BaseModel):
     @classmethod
     def normalize_title(cls, value: str) -> str:
         """Normalize title by stripping whitespace."""
-        return value.strip()
+        if value:
+            return value.strip()
+        return value

     def is_introduction(self) -> bool:
         """Check if this is the introduction section."""

@@ -358,8 +360,6 @@ class Chunk(BaseModel):
         document_id: ID of the parent document
         content: Text content of the chunk
         sequence_number: Order of this chunk in the document
-        start_char: Starting character position in original document
-        end_char: Ending character position in original document
         section_title: Title of the section this chunk belongs to
         section_index: Index of the section in document.sections
         metadata: Optional metadata specific to this chunk

@@ -368,8 +368,6 @@ class Chunk(BaseModel):
     document_id: UUID = Field(..., description="Parent document ID")
     content: str = Field(..., min_length=1, description="Chunk text content")
     sequence_number: int = Field(..., ge=0, description="Chunk order in document")
-    start_char: int = Field(..., ge=0, description="Start position in document")
-    end_char: int = Field(..., gt=0, description="End position in document")
     section_title: Optional[str] = Field(None, description="Section title")
     section_index: Optional[int] = Field(None, ge=0, description="Section index")
     metadata: Dict[str, str] = Field(default_factory=dict)

@@ -378,27 +376,6 @@ class Chunk(BaseModel):
         "frozen": True,  # Chunks are immutable
     }

-    @model_validator(mode='after')
-    def validate_position_consistency(self) -> 'Chunk':
-        """Ensure end position is after start position."""
-        if self.end_char <= self.start_char:
-            raise ValueError(
-                f"end_char ({self.end_char}) must be greater than "
-                f"start_char ({self.start_char})"
-            )
-
-        # Validate content length matches position range
-        content_length = len(self.content)
-        position_range = self.end_char - self.start_char
-
-        if abs(content_length - position_range) > 10:  # Allow small variance
-            raise ValueError(
-                f"Content length ({content_length}) doesn't match "
-                f"position range ({position_range})"
-            )
-
-        return self
-
     def get_length(self) -> int:
         """Get the length of the chunk content."""
         return len(self.content)

@@ -50,40 +50,25 @@ def parse_markdown(text: str) -> List[DocumentSection]:
     sections: List[DocumentSection] = []
     current_heading: str | None = None
     current_level: int = 0
-    current_content_parts: List[str] = []
-
-    def finalize_section() -> None:
-        """Helper to finalize and append the current section."""
-        if current_heading is not None or current_content_parts:
-            content = "".join(current_content_parts).strip()
-            if content:  # Only add sections with actual content
-                title = current_heading if current_heading else "Introduction"
-                sections.append(
-                    DocumentSection(
-                        title=title,
-                        level=current_level,
-                        content=content,
-                    )
-                )

     # Walk through all children of the document
     for child in doc.children:
         if isinstance(child, Heading):
-            # Finalize previous section before starting new one
-            finalize_section()
-
-            # Start new section
+            # Update current heading context
             current_heading = _extract_heading_text(child)
             current_level = child.level
-            current_content_parts = []
         else:
             # Render content back to markdown format instead of HTML
             rendered = md_renderer.render(child).strip()
             if rendered:
-                current_content_parts.append(rendered + "\n\n")
-
-    # Finalize the last section
-    finalize_section()
+                # Create a separate section for each paragraph/block
+                sections.append(
+                    DocumentSection(
+                        title=current_heading,
+                        level=current_level,
+                        content=rendered,
+                    )
+                )

     return sections