Chunkers: pass the full Document (not just text + document_id) to enable section-aware chunking; add _chunk_by_sections to FixedSizeChunker and ParagraphChunker and propagate section_title/section_index onto created chunks

This commit is contained in:
m.dabbagh 2026-01-08 16:47:50 +03:30
parent 2c375ce6bd
commit f06370e0b9
3 changed files with 159 additions and 55 deletions

View File

@ -6,10 +6,9 @@ This is an ADAPTER that implements the IChunkingContext port from Core.
"""
import logging
from typing import Dict, List
from uuid import UUID
from ....core.domain.exceptions import ChunkingError
from ....core.domain.models import Chunk, ChunkingStrategy
from ....core.domain.models import Chunk, ChunkingStrategy, Document
from ....core.ports.outgoing.chunker import IChunker
from ....core.ports.outgoing.chunking_context import IChunkingContext
@ -46,23 +45,21 @@ class ChunkingContext(IChunkingContext):
def execute_chunking(
self,
text: str,
document_id: UUID,
document: Document,
strategy: ChunkingStrategy,
) -> List[Chunk]:
"""
Execute chunking using the specified strategy.
This method is stateless and thread-safe. It selects the appropriate
chunker based on the strategy configuration for each call.
This method is stateless and thread-safe. It accepts the full
Document object (with sections) to enable section-aware chunking.
Args:
text: Text to chunk
document_id: ID of parent document
document: Full Document entity with raw_markdown and sections
strategy: Chunking strategy configuration (includes strategy_name)
Returns:
List of chunks
List of chunks with section metadata
Raises:
ChunkingError: If strategy is not registered or chunking fails
@ -83,8 +80,7 @@ class ChunkingContext(IChunkingContext):
)
return chunker.chunk(
text=text,
document_id=document_id,
document=document,
strategy=strategy,
)

View File

@ -2,15 +2,14 @@
Fixed Size Chunker - Concrete implementation for fixed-size chunking.
This adapter implements the IChunker port using a fixed-size strategy
with optional overlap and boundary respect.
with optional overlap, boundary respect, and section-aware chunking.
"""
import logging
from typing import List
from uuid import UUID
from typing import List, Optional
from ....core.domain import logic_utils
from ....core.domain.exceptions import ChunkingError, ValidationError
from ....core.domain.models import Chunk, ChunkingStrategy
from ....core.domain.models import Chunk, ChunkingStrategy, Document
from ....core.ports.outgoing.chunker import IChunker
@ -19,12 +18,13 @@ logger = logging.getLogger(__name__)
class FixedSizeChunker(IChunker):
"""
Concrete fixed-size chunker implementation.
Concrete fixed-size chunker implementation with section awareness.
This adapter:
1. Splits text into fixed-size chunks
1. Splits documents into fixed-size chunks
2. Supports overlap between chunks
3. Respects word and sentence boundaries when configured
4. Can process each section independently (section-aware chunking)
"""
def __init__(self) -> None:
@ -34,20 +34,21 @@ class FixedSizeChunker(IChunker):
def chunk(
self,
text: str,
document_id: UUID,
document: Document,
strategy: ChunkingStrategy,
) -> List[Chunk]:
"""
Split text into fixed-size chunks with overlap.
Split document into fixed-size chunks with optional section awareness.
If respect_boundaries is True and document has sections, chunks
will not span across section boundaries.
Args:
text: Text content to chunk
document_id: ID of the parent document
document: Full Document entity with raw_markdown and sections
strategy: Chunking strategy configuration
Returns:
List of Chunk entities
List of Chunk entities with section metadata
Raises:
ChunkingError: If chunking fails
@ -55,18 +56,22 @@ class FixedSizeChunker(IChunker):
"""
try:
logger.info(
f"Chunking text with fixed_size strategy "
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size})"
f"Chunking document with fixed_size strategy "
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size}, "
f"sections={len(document.sections)})"
)
# Validate inputs
self._validate_input(text, strategy)
self._validate_input(document.raw_markdown, strategy)
# Split text into segments
segments = self._split_into_segments(text, strategy)
# Create Chunk entities
chunks = self._create_chunks(segments, document_id)
# Choose chunking approach based on strategy and document structure
if strategy.respect_boundaries and document.sections:
# Section-aware chunking: process each section independently
chunks = self._chunk_by_sections(document, strategy)
else:
# Standard chunking: process entire raw_markdown
segments = self._split_into_segments(document.raw_markdown, strategy)
chunks = self._create_chunks(segments, document.id)
logger.info(f"Created {len(chunks)} fixed-size chunks")
return chunks
@ -78,7 +83,7 @@ class FixedSizeChunker(IChunker):
except Exception as e:
logger.error(f"Fixed-size chunking failed: {str(e)}")
raise ChunkingError(
message="Failed to chunk text with fixed_size strategy",
message="Failed to chunk document with fixed_size strategy",
details=str(e),
strategy_name=self._strategy_name,
)
@ -232,10 +237,55 @@ class FixedSizeChunker(IChunker):
respect_boundary=True,
)
def _chunk_by_sections(
    self,
    document: Document,
    strategy: ChunkingStrategy,
) -> List[Chunk]:
    """
    Produce fixed-size chunks one section at a time.

    Each section's content is segmented independently, so no chunk can
    span a section boundary. Sequence numbers run continuously across
    the whole document, and every chunk carries its section's title and
    positional index.

    NOTE(review): start_char/end_char are taken from segments of each
    section's own content, so they look section-relative rather than
    offsets into document.raw_markdown — confirm downstream consumers
    expect that.

    Args:
        document: Document with sections
        strategy: Chunking strategy configuration

    Returns:
        List of Chunk entities with section metadata
    """
    chunks: List[Chunk] = []
    sequence = 0
    for idx, section in enumerate(document.sections):
        # Segment just this section's text; boundaries cannot leak across.
        for content, start, end in self._split_into_segments(section.content, strategy):
            chunks.append(
                Chunk(
                    document_id=document.id,
                    content=content,
                    sequence_number=sequence,
                    start_char=start,
                    end_char=end,
                    section_title=section.title,
                    section_index=idx,
                )
            )
            sequence += 1
    logger.debug(
        f"Created {len(chunks)} chunks across {len(document.sections)} sections"
    )
    return chunks
def _create_chunks(
self,
segments: List[tuple[str, int, int]],
document_id: UUID,
document_id,
section_title: Optional[str] = None,
section_index: Optional[int] = None,
) -> List[Chunk]:
"""
Create Chunk entities from text segments.
@ -243,6 +293,8 @@ class FixedSizeChunker(IChunker):
Args:
segments: List of (text, start_pos, end_pos) tuples
document_id: ID of parent document
section_title: Optional section title
section_index: Optional section index
Returns:
List of Chunk entities
@ -256,6 +308,8 @@ class FixedSizeChunker(IChunker):
sequence_number=sequence_number,
start_char=start_char,
end_char=end_char,
section_title=section_title,
section_index=section_index,
)
chunks.append(chunk)

View File

@ -2,15 +2,14 @@
Paragraph Chunker - Concrete implementation for paragraph-based chunking.
This adapter implements the IChunker port using a paragraph-respecting
strategy that combines paragraphs to reach target chunk size.
strategy that combines paragraphs to reach target chunk size with section awareness.
"""
import logging
from typing import List
from uuid import UUID
from typing import List, Optional
from ....core.domain import logic_utils
from ....core.domain.exceptions import ChunkingError, ValidationError
from ....core.domain.models import Chunk, ChunkingStrategy
from ....core.domain.models import Chunk, ChunkingStrategy, Document
from ....core.ports.outgoing.chunker import IChunker
@ -19,12 +18,13 @@ logger = logging.getLogger(__name__)
class ParagraphChunker(IChunker):
"""
Concrete paragraph-based chunker implementation.
Concrete paragraph-based chunker implementation with section awareness.
This adapter:
1. Splits text by paragraph boundaries
1. Splits documents by paragraph boundaries
2. Combines paragraphs to reach target chunk size
3. Preserves document structure
4. Can process each section independently (section-aware chunking)
"""
def __init__(self) -> None:
@ -34,20 +34,21 @@ class ParagraphChunker(IChunker):
def chunk(
self,
text: str,
document_id: UUID,
document: Document,
strategy: ChunkingStrategy,
) -> List[Chunk]:
"""
Split text into paragraph-based chunks.
Split document into paragraph-based chunks with optional section awareness.
If respect_boundaries is True and document has sections, chunks
will not span across section boundaries.
Args:
text: Text content to chunk
document_id: ID of the parent document
document: Full Document entity with raw_markdown and sections
strategy: Chunking strategy configuration
Returns:
List of Chunk entities
List of Chunk entities with section metadata
Raises:
ChunkingError: If chunking fails
@ -55,18 +56,22 @@ class ParagraphChunker(IChunker):
"""
try:
logger.info(
f"Chunking text with paragraph strategy "
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size})"
f"Chunking document with paragraph strategy "
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size}, "
f"sections={len(document.sections)})"
)
# Validate inputs
self._validate_input(text, strategy)
self._validate_input(document.raw_markdown, strategy)
# Split into paragraphs and group
segments = self._split_and_group_paragraphs(text, strategy)
# Create Chunk entities
chunks = self._create_chunks(segments, document_id)
# Choose chunking approach based on strategy and document structure
if strategy.respect_boundaries and document.sections:
# Section-aware chunking: process each section independently
chunks = self._chunk_by_sections(document, strategy)
else:
# Standard chunking: process entire raw_markdown
segments = self._split_and_group_paragraphs(document.raw_markdown, strategy)
chunks = self._create_chunks(segments, document.id)
logger.info(f"Created {len(chunks)} paragraph-based chunks")
return chunks
@ -78,7 +83,7 @@ class ParagraphChunker(IChunker):
except Exception as e:
logger.error(f"Paragraph chunking failed: {str(e)}")
raise ChunkingError(
message="Failed to chunk text with paragraph strategy",
message="Failed to chunk document with paragraph strategy",
details=str(e),
strategy_name=self._strategy_name,
)
@ -283,10 +288,55 @@ class ParagraphChunker(IChunker):
_, _, prev_end = previous_segment
return ([new_paragraph], prev_end, new_para_size)
def _chunk_by_sections(
    self,
    document: Document,
    strategy: ChunkingStrategy,
) -> List[Chunk]:
    """
    Produce paragraph-based chunks one section at a time.

    Each section's content is split and grouped into paragraph segments
    independently, so no chunk can span a section boundary. Sequence
    numbers run continuously across the whole document, and every chunk
    carries its section's title and positional index.

    NOTE(review): start_char/end_char are taken from segments of each
    section's own content, so they look section-relative rather than
    offsets into document.raw_markdown — confirm downstream consumers
    expect that.

    Args:
        document: Document with sections
        strategy: Chunking strategy configuration

    Returns:
        List of Chunk entities with section metadata
    """
    chunks: List[Chunk] = []
    sequence = 0
    for idx, section in enumerate(document.sections):
        # Paragraph-group just this section's text; boundaries cannot leak across.
        for content, start, end in self._split_and_group_paragraphs(section.content, strategy):
            chunks.append(
                Chunk(
                    document_id=document.id,
                    content=content,
                    sequence_number=sequence,
                    start_char=start,
                    end_char=end,
                    section_title=section.title,
                    section_index=idx,
                )
            )
            sequence += 1
    logger.debug(
        f"Created {len(chunks)} chunks across {len(document.sections)} sections"
    )
    return chunks
def _create_chunks(
self,
segments: List[tuple[str, int, int]],
document_id: UUID,
document_id,
section_title: Optional[str] = None,
section_index: Optional[int] = None,
) -> List[Chunk]:
"""
Create Chunk entities from text segments.
@ -294,6 +344,8 @@ class ParagraphChunker(IChunker):
Args:
segments: List of (text, start_pos, end_pos) tuples
document_id: ID of parent document
section_title: Optional section title
section_index: Optional section index
Returns:
List of Chunk entities
@ -307,6 +359,8 @@ class ParagraphChunker(IChunker):
sequence_number=sequence_number,
start_char=start_char,
end_char=end_char,
section_title=section_title,
section_index=section_index,
)
chunks.append(chunk)