From f06370e0b950bf56342cc3a62c3f5b03410fe6bf Mon Sep 17 00:00:00 2001
From: "m.dabbagh" <mostafadabbagh76@gmail.com>
Date: Thu, 8 Jan 2026 16:47:50 +0330
Subject: [PATCH] Pass Document to chunkers and add section-aware chunking

---
 src/adapters/outgoing/chunkers/context.py     | 18 ++--
 .../outgoing/chunkers/fixed_size_chunker.py   | 98 ++++++++++++++-----
 .../outgoing/chunkers/paragraph_chunker.py    | 98 ++++++++++++++-----
 3 files changed, 159 insertions(+), 55 deletions(-)

diff --git a/src/adapters/outgoing/chunkers/context.py b/src/adapters/outgoing/chunkers/context.py
index 46ec6f9..7828ec8 100644
--- a/src/adapters/outgoing/chunkers/context.py
+++ b/src/adapters/outgoing/chunkers/context.py
@@ -6,10 +6,9 @@ This is an ADAPTER that implements the IChunkingContext port from Core.
 """
 import logging
 from typing import Dict, List
-from uuid import UUID
 
 from ....core.domain.exceptions import ChunkingError
-from ....core.domain.models import Chunk, ChunkingStrategy
+from ....core.domain.models import Chunk, ChunkingStrategy, Document
 from ....core.ports.outgoing.chunker import IChunker
 from ....core.ports.outgoing.chunking_context import IChunkingContext
 
@@ -46,23 +45,21 @@ class ChunkingContext(IChunkingContext):
 
     def execute_chunking(
         self,
-        text: str,
-        document_id: UUID,
+        document: Document,
         strategy: ChunkingStrategy,
     ) -> List[Chunk]:
         """
         Execute chunking using the specified strategy.
 
-        This method is stateless and thread-safe. It selects the appropriate
-        chunker based on the strategy configuration for each call.
+        This method is stateless and thread-safe. It accepts the full
+        Document object (with sections) to enable section-aware chunking.
 
         Args:
-            text: Text to chunk
-            document_id: ID of parent document
+            document: Full Document entity with raw_markdown and sections
             strategy: Chunking strategy configuration (includes strategy_name)
 
         Returns:
-            List of chunks
+            List of chunks with section metadata
 
         Raises:
             ChunkingError: If strategy is not registered or chunking fails
@@ -83,8 +80,7 @@ class ChunkingContext(IChunkingContext):
         )
 
         return chunker.chunk(
-            text=text,
-            document_id=document_id,
+            document=document,
             strategy=strategy,
         )
 
diff --git a/src/adapters/outgoing/chunkers/fixed_size_chunker.py b/src/adapters/outgoing/chunkers/fixed_size_chunker.py
index bb8d163..b4812a0 100644
--- a/src/adapters/outgoing/chunkers/fixed_size_chunker.py
+++ b/src/adapters/outgoing/chunkers/fixed_size_chunker.py
@@ -2,15 +2,14 @@
 Fixed Size Chunker - Concrete implementation for fixed-size chunking.
 
 This adapter implements the IChunker port using a fixed-size strategy
-with optional overlap and boundary respect.
+with optional overlap, boundary respect, and section-aware chunking.
 """
 import logging
-from typing import List
-from uuid import UUID
+from typing import List, Optional
 
 from ....core.domain import logic_utils
 from ....core.domain.exceptions import ChunkingError, ValidationError
-from ....core.domain.models import Chunk, ChunkingStrategy
+from ....core.domain.models import Chunk, ChunkingStrategy, Document
 from ....core.ports.outgoing.chunker import IChunker
 
 
@@ -19,12 +18,13 @@ logger = logging.getLogger(__name__)
 
 class FixedSizeChunker(IChunker):
     """
-    Concrete fixed-size chunker implementation.
+    Concrete fixed-size chunker implementation with section awareness.
 
     This adapter:
-    1. Splits text into fixed-size chunks
+    1. Splits documents into fixed-size chunks
     2. Supports overlap between chunks
     3. Respects word and sentence boundaries when configured
+    4. Can process each section independently (section-aware chunking)
     """
 
     def __init__(self) -> None:
@@ -34,20 +34,21 @@ class FixedSizeChunker(IChunker):
 
     def chunk(
         self,
-        text: str,
-        document_id: UUID,
+        document: Document,
         strategy: ChunkingStrategy,
     ) -> List[Chunk]:
         """
-        Split text into fixed-size chunks with overlap.
+        Split document into fixed-size chunks with optional section awareness.
+
+        If respect_boundaries is True and document has sections, chunks
+        will not span across section boundaries.
 
         Args:
-            text: Text content to chunk
-            document_id: ID of the parent document
+            document: Full Document entity with raw_markdown and sections
             strategy: Chunking strategy configuration
 
         Returns:
-            List of Chunk entities
+            List of Chunk entities with section metadata
 
         Raises:
             ChunkingError: If chunking fails
@@ -55,18 +56,22 @@ class FixedSizeChunker(IChunker):
         """
         try:
             logger.info(
-                f"Chunking text with fixed_size strategy "
-                f"(size={strategy.chunk_size}, overlap={strategy.overlap_size})"
+                f"Chunking document with fixed_size strategy "
+                f"(size={strategy.chunk_size}, overlap={strategy.overlap_size}, "
+                f"sections={len(document.sections)})"
             )
 
             # Validate inputs
-            self._validate_input(text, strategy)
+            self._validate_input(document.raw_markdown, strategy)
 
-            # Split text into segments
-            segments = self._split_into_segments(text, strategy)
-
-            # Create Chunk entities
-            chunks = self._create_chunks(segments, document_id)
+            # Choose chunking approach based on strategy and document structure
+            if strategy.respect_boundaries and document.sections:
+                # Section-aware chunking: process each section independently
+                chunks = self._chunk_by_sections(document, strategy)
+            else:
+                # Standard chunking: process entire raw_markdown
+                segments = self._split_into_segments(document.raw_markdown, strategy)
+                chunks = self._create_chunks(segments, document.id)
 
             logger.info(f"Created {len(chunks)} fixed-size chunks")
             return chunks
@@ -78,7 +83,7 @@ class FixedSizeChunker(IChunker):
         except Exception as e:
             logger.error(f"Fixed-size chunking failed: {str(e)}")
             raise ChunkingError(
-                message="Failed to chunk text with fixed_size strategy",
+                message="Failed to chunk document with fixed_size strategy",
                 details=str(e),
                 strategy_name=self._strategy_name,
             )
@@ -232,10 +237,55 @@ class FixedSizeChunker(IChunker):
             respect_boundary=True,
         )
 
+    def _chunk_by_sections(
+        self,
+        document: Document,
+        strategy: ChunkingStrategy,
+    ) -> List[Chunk]:
+        """
+        Chunk document by processing each section independently.
+
+        Chunks never span sections; sequence numbers continue across them.
+
+        Args:
+            document: Document whose sections are chunked one at a time
+            strategy: Chunking strategy handed to the segment splitter
+
+        Returns:
+            Chunks in section order, each tagged with section title/index
+        """
+        all_chunks = []
+        global_sequence = 0
+
+        for section_index, section in enumerate(document.sections):
+            # Split only this section's content so segments stay inside it
+            segments = self._split_into_segments(section.content, strategy)
+
+            # NOTE(review): offsets presumably index section.content, not raw_markdown
+            for text, start_char, end_char in segments:
+                chunk = Chunk(
+                    document_id=document.id,
+                    content=text,
+                    sequence_number=global_sequence,
+                    start_char=start_char,
+                    end_char=end_char,
+                    section_title=section.title,
+                    section_index=section_index,
+                )
+                all_chunks.append(chunk)
+                global_sequence += 1
+
+        logger.debug(
+            f"Created {len(all_chunks)} chunks across {len(document.sections)} sections"
+        )
+        return all_chunks
+
     def _create_chunks(
         self,
         segments: List[tuple[str, int, int]],
-        document_id: UUID,
+        document_id,
+        section_title: Optional[str] = None,
+        section_index: Optional[int] = None,
     ) -> List[Chunk]:
         """
         Create Chunk entities from text segments.
@@ -243,6 +293,8 @@ class FixedSizeChunker(IChunker):
         Args:
             segments: List of (text, start_pos, end_pos) tuples
             document_id: ID of parent document
+            section_title: Optional section title
+            section_index: Optional section index
 
         Returns:
             List of Chunk entities
@@ -256,6 +308,8 @@ class FixedSizeChunker(IChunker):
                 sequence_number=sequence_number,
                 start_char=start_char,
                 end_char=end_char,
+                section_title=section_title,
+                section_index=section_index,
             )
             chunks.append(chunk)
 
diff --git a/src/adapters/outgoing/chunkers/paragraph_chunker.py b/src/adapters/outgoing/chunkers/paragraph_chunker.py
index c8f403c..1150f89 100644
--- a/src/adapters/outgoing/chunkers/paragraph_chunker.py
+++ b/src/adapters/outgoing/chunkers/paragraph_chunker.py
@@ -2,15 +2,14 @@
 Paragraph Chunker - Concrete implementation for paragraph-based chunking.
 
 This adapter implements the IChunker port using a paragraph-respecting
-strategy that combines paragraphs to reach target chunk size.
+strategy that combines paragraphs to reach target chunk size with section awareness.
 """
 import logging
-from typing import List
-from uuid import UUID
+from typing import List, Optional
 
 from ....core.domain import logic_utils
 from ....core.domain.exceptions import ChunkingError, ValidationError
-from ....core.domain.models import Chunk, ChunkingStrategy
+from ....core.domain.models import Chunk, ChunkingStrategy, Document
 from ....core.ports.outgoing.chunker import IChunker
 
 
@@ -19,12 +18,13 @@ logger = logging.getLogger(__name__)
 
 class ParagraphChunker(IChunker):
     """
-    Concrete paragraph-based chunker implementation.
+    Concrete paragraph-based chunker implementation with section awareness.
 
     This adapter:
-    1. Splits text by paragraph boundaries
+    1. Splits documents by paragraph boundaries
     2. Combines paragraphs to reach target chunk size
     3. Preserves document structure
+    4. Can process each section independently (section-aware chunking)
     """
 
     def __init__(self) -> None:
@@ -34,20 +34,21 @@ class ParagraphChunker(IChunker):
 
     def chunk(
         self,
-        text: str,
-        document_id: UUID,
+        document: Document,
         strategy: ChunkingStrategy,
     ) -> List[Chunk]:
         """
-        Split text into paragraph-based chunks.
+        Split document into paragraph-based chunks with optional section awareness.
+
+        If respect_boundaries is True and document has sections, chunks
+        will not span across section boundaries.
 
         Args:
-            text: Text content to chunk
-            document_id: ID of the parent document
+            document: Full Document entity with raw_markdown and sections
             strategy: Chunking strategy configuration
 
         Returns:
-            List of Chunk entities
+            List of Chunk entities with section metadata
 
         Raises:
             ChunkingError: If chunking fails
@@ -55,18 +56,22 @@ class ParagraphChunker(IChunker):
         """
         try:
             logger.info(
-                f"Chunking text with paragraph strategy "
-                f"(size={strategy.chunk_size}, overlap={strategy.overlap_size})"
+                f"Chunking document with paragraph strategy "
+                f"(size={strategy.chunk_size}, overlap={strategy.overlap_size}, "
+                f"sections={len(document.sections)})"
             )
 
             # Validate inputs
-            self._validate_input(text, strategy)
+            self._validate_input(document.raw_markdown, strategy)
 
-            # Split into paragraphs and group
-            segments = self._split_and_group_paragraphs(text, strategy)
-
-            # Create Chunk entities
-            chunks = self._create_chunks(segments, document_id)
+            # Choose chunking approach based on strategy and document structure
+            if strategy.respect_boundaries and document.sections:
+                # Section-aware chunking: process each section independently
+                chunks = self._chunk_by_sections(document, strategy)
+            else:
+                # Standard chunking: process entire raw_markdown
+                segments = self._split_and_group_paragraphs(document.raw_markdown, strategy)
+                chunks = self._create_chunks(segments, document.id)
 
             logger.info(f"Created {len(chunks)} paragraph-based chunks")
             return chunks
@@ -78,7 +83,7 @@ class ParagraphChunker(IChunker):
         except Exception as e:
             logger.error(f"Paragraph chunking failed: {str(e)}")
             raise ChunkingError(
-                message="Failed to chunk text with paragraph strategy",
+                message="Failed to chunk document with paragraph strategy",
                 details=str(e),
                 strategy_name=self._strategy_name,
             )
@@ -283,10 +288,55 @@ class ParagraphChunker(IChunker):
             _, _, prev_end = previous_segment
             return ([new_paragraph], prev_end, new_para_size)
 
+    def _chunk_by_sections(
+        self,
+        document: Document,
+        strategy: ChunkingStrategy,
+    ) -> List[Chunk]:
+        """
+        Chunk document by processing each section independently.
+
+        Chunks never span sections; sequence numbers continue across them.
+
+        Args:
+            document: Document whose sections are chunked one at a time
+            strategy: Chunking strategy handed to the paragraph grouper
+
+        Returns:
+            Chunks in section order, each tagged with section title/index
+        """
+        all_chunks = []
+        global_sequence = 0
+
+        for section_index, section in enumerate(document.sections):
+            # Group paragraphs of this section only so segments stay inside it
+            segments = self._split_and_group_paragraphs(section.content, strategy)
+
+            # NOTE(review): offsets presumably index section.content, not raw_markdown
+            for text, start_char, end_char in segments:
+                chunk = Chunk(
+                    document_id=document.id,
+                    content=text,
+                    sequence_number=global_sequence,
+                    start_char=start_char,
+                    end_char=end_char,
+                    section_title=section.title,
+                    section_index=section_index,
+                )
+                all_chunks.append(chunk)
+                global_sequence += 1
+
+        logger.debug(
+            f"Created {len(all_chunks)} chunks across {len(document.sections)} sections"
+        )
+        return all_chunks
+
     def _create_chunks(
         self,
         segments: List[tuple[str, int, int]],
-        document_id: UUID,
+        document_id,
+        section_title: Optional[str] = None,
+        section_index: Optional[int] = None,
     ) -> List[Chunk]:
         """
         Create Chunk entities from text segments.
@@ -294,6 +344,8 @@ class ParagraphChunker(IChunker):
         Args:
             segments: List of (text, start_pos, end_pos) tuples
             document_id: ID of parent document
+            section_title: Optional section title
+            section_index: Optional section index
 
         Returns:
             List of Chunk entities
@@ -307,6 +359,8 @@ class ParagraphChunker(IChunker):
                 sequence_number=sequence_number,
                 start_char=start_char,
                 end_char=end_char,
+                section_title=section_title,
+                section_index=section_index,
             )
             chunks.append(chunk)