From f06370e0b950bf56342cc3a62c3f5b03410fe6bf Mon Sep 17 00:00:00 2001 From: "m.dabbagh" Date: Thu, 8 Jan 2026 16:47:50 +0330 Subject: [PATCH] some fixes in concrete implementations of chunkers --- src/adapters/outgoing/chunkers/context.py | 18 ++-- .../outgoing/chunkers/fixed_size_chunker.py | 98 ++++++++++++++----- .../outgoing/chunkers/paragraph_chunker.py | 98 ++++++++++++++----- 3 files changed, 159 insertions(+), 55 deletions(-) diff --git a/src/adapters/outgoing/chunkers/context.py b/src/adapters/outgoing/chunkers/context.py index 46ec6f9..7828ec8 100644 --- a/src/adapters/outgoing/chunkers/context.py +++ b/src/adapters/outgoing/chunkers/context.py @@ -6,10 +6,9 @@ This is an ADAPTER that implements the IChunkingContext port from Core. """ import logging from typing import Dict, List -from uuid import UUID from ....core.domain.exceptions import ChunkingError -from ....core.domain.models import Chunk, ChunkingStrategy +from ....core.domain.models import Chunk, ChunkingStrategy, Document from ....core.ports.outgoing.chunker import IChunker from ....core.ports.outgoing.chunking_context import IChunkingContext @@ -46,23 +45,21 @@ class ChunkingContext(IChunkingContext): def execute_chunking( self, - text: str, - document_id: UUID, + document: Document, strategy: ChunkingStrategy, ) -> List[Chunk]: """ Execute chunking using the specified strategy. - This method is stateless and thread-safe. It selects the appropriate - chunker based on the strategy configuration for each call. + This method is stateless and thread-safe. It accepts the full + Document object (with sections) to enable section-aware chunking. Args: - text: Text to chunk - document_id: ID of parent document + document: Full Document entity with raw_markdown and sections strategy: Chunking strategy configuration (includes strategy_name) Returns: - List of chunks + List of chunks with section metadata Raises: ChunkingError: If strategy is not registered or chunking fails @@ -83,8 +80,7 @@ class ChunkingContext(IChunkingContext): ) return chunker.chunk( - text=text, - document_id=document_id, + document=document, strategy=strategy, ) diff --git a/src/adapters/outgoing/chunkers/fixed_size_chunker.py b/src/adapters/outgoing/chunkers/fixed_size_chunker.py index bb8d163..b4812a0 100644 --- a/src/adapters/outgoing/chunkers/fixed_size_chunker.py +++ b/src/adapters/outgoing/chunkers/fixed_size_chunker.py @@ -2,15 +2,14 @@ Fixed Size Chunker - Concrete implementation for fixed-size chunking. This adapter implements the IChunker port using a fixed-size strategy -with optional overlap and boundary respect. +with optional overlap, boundary respect, and section-aware chunking. """ import logging -from typing import List -from uuid import UUID +from typing import List, Optional from ....core.domain import logic_utils from ....core.domain.exceptions import ChunkingError, ValidationError -from ....core.domain.models import Chunk, ChunkingStrategy +from ....core.domain.models import Chunk, ChunkingStrategy, Document from ....core.ports.outgoing.chunker import IChunker @@ -19,12 +18,13 @@ logger = logging.getLogger(__name__) class FixedSizeChunker(IChunker): """ - Concrete fixed-size chunker implementation. + Concrete fixed-size chunker implementation with section awareness. This adapter: - 1. Splits text into fixed-size chunks + 1. Splits documents into fixed-size chunks 2. Supports overlap between chunks 3. Respects word and sentence boundaries when configured + 4. Can process each section independently (section-aware chunking) """ def __init__(self) -> None: @@ -34,20 +34,21 @@ class FixedSizeChunker(IChunker): def chunk( self, - text: str, - document_id: UUID, + document: Document, strategy: ChunkingStrategy, ) -> List[Chunk]: """ - Split text into fixed-size chunks with overlap. + Split document into fixed-size chunks with optional section awareness. + + If respect_boundaries is True and document has sections, chunks + will not span across section boundaries. Args: - text: Text content to chunk - document_id: ID of the parent document + document: Full Document entity with raw_markdown and sections strategy: Chunking strategy configuration Returns: - List of Chunk entities + List of Chunk entities with section metadata Raises: ChunkingError: If chunking fails @@ -55,18 +56,22 @@ class FixedSizeChunker(IChunker): """ try: logger.info( - f"Chunking text with fixed_size strategy " - f"(size={strategy.chunk_size}, overlap={strategy.overlap_size})" + f"Chunking document with fixed_size strategy " + f"(size={strategy.chunk_size}, overlap={strategy.overlap_size}, " + f"sections={len(document.sections)})" ) # Validate inputs - self._validate_input(text, strategy) + self._validate_input(document.raw_markdown, strategy) - # Split text into segments - segments = self._split_into_segments(text, strategy) - - # Create Chunk entities - chunks = self._create_chunks(segments, document_id) + # Choose chunking approach based on strategy and document structure + if strategy.respect_boundaries and document.sections: + # Section-aware chunking: process each section independently + chunks = self._chunk_by_sections(document, strategy) + else: + # Standard chunking: process entire raw_markdown + segments = self._split_into_segments(document.raw_markdown, strategy) + chunks = self._create_chunks(segments, document.id) logger.info(f"Created {len(chunks)} fixed-size chunks") return chunks @@ -78,7 +83,7 @@ class FixedSizeChunker(IChunker): except Exception as e: logger.error(f"Fixed-size chunking failed: {str(e)}") raise ChunkingError( - message="Failed to chunk text with fixed_size strategy", + message="Failed to chunk document with fixed_size strategy", details=str(e), strategy_name=self._strategy_name, ) @@ -232,10 +237,55 @@ class FixedSizeChunker(IChunker): respect_boundary=True, ) + def _chunk_by_sections( + self, + document: Document, + strategy: ChunkingStrategy, + ) -> List[Chunk]: + """ + Chunk document by processing each section independently. + + This prevents chunks from spanning across section boundaries. + + Args: + document: Document with sections + strategy: Chunking strategy configuration + + Returns: + List of Chunk entities with section metadata + """ + all_chunks = [] + global_sequence = 0 + + for section_index, section in enumerate(document.sections): + # Split this section's content into segments + segments = self._split_into_segments(section.content, strategy) + + # Create chunks for this section + for text, start_char, end_char in segments: + chunk = Chunk( + document_id=document.id, + content=text, + sequence_number=global_sequence, + start_char=start_char, + end_char=end_char, + section_title=section.title, + section_index=section_index, + ) + all_chunks.append(chunk) + global_sequence += 1 + + logger.debug( + f"Created {len(all_chunks)} chunks across {len(document.sections)} sections" + ) + return all_chunks + def _create_chunks( self, segments: List[tuple[str, int, int]], - document_id: UUID, + document_id, + section_title: Optional[str] = None, + section_index: Optional[int] = None, ) -> List[Chunk]: """ Create Chunk entities from text segments. @@ -243,6 +293,8 @@ class FixedSizeChunker(IChunker): Args: segments: List of (text, start_pos, end_pos) tuples document_id: ID of parent document + section_title: Optional section title + section_index: Optional section index Returns: List of Chunk entities @@ -256,6 +308,8 @@ class FixedSizeChunker(IChunker): sequence_number=sequence_number, start_char=start_char, end_char=end_char, + section_title=section_title, + section_index=section_index, ) chunks.append(chunk) diff --git a/src/adapters/outgoing/chunkers/paragraph_chunker.py b/src/adapters/outgoing/chunkers/paragraph_chunker.py index c8f403c..1150f89 100644 --- a/src/adapters/outgoing/chunkers/paragraph_chunker.py +++ b/src/adapters/outgoing/chunkers/paragraph_chunker.py @@ -2,15 +2,14 @@ Paragraph Chunker - Concrete implementation for paragraph-based chunking. This adapter implements the IChunker port using a paragraph-respecting -strategy that combines paragraphs to reach target chunk size. +strategy that combines paragraphs to reach target chunk size with section awareness. """ import logging -from typing import List -from uuid import UUID +from typing import List, Optional from ....core.domain import logic_utils from ....core.domain.exceptions import ChunkingError, ValidationError -from ....core.domain.models import Chunk, ChunkingStrategy +from ....core.domain.models import Chunk, ChunkingStrategy, Document from ....core.ports.outgoing.chunker import IChunker @@ -19,12 +18,13 @@ logger = logging.getLogger(__name__) class ParagraphChunker(IChunker): """ - Concrete paragraph-based chunker implementation. + Concrete paragraph-based chunker implementation with section awareness. This adapter: - 1. Splits text by paragraph boundaries + 1. Splits documents by paragraph boundaries 2. Combines paragraphs to reach target chunk size 3. Preserves document structure + 4. Can process each section independently (section-aware chunking) """ def __init__(self) -> None: @@ -34,20 +34,21 @@ class ParagraphChunker(IChunker): def chunk( self, - text: str, - document_id: UUID, + document: Document, strategy: ChunkingStrategy, ) -> List[Chunk]: """ - Split text into paragraph-based chunks. + Split document into paragraph-based chunks with optional section awareness. + + If respect_boundaries is True and document has sections, chunks + will not span across section boundaries. Args: - text: Text content to chunk - document_id: ID of the parent document + document: Full Document entity with raw_markdown and sections strategy: Chunking strategy configuration Returns: - List of Chunk entities + List of Chunk entities with section metadata Raises: ChunkingError: If chunking fails @@ -55,18 +56,22 @@ class ParagraphChunker(IChunker): """ try: logger.info( - f"Chunking text with paragraph strategy " - f"(size={strategy.chunk_size}, overlap={strategy.overlap_size})" + f"Chunking document with paragraph strategy " + f"(size={strategy.chunk_size}, overlap={strategy.overlap_size}, " + f"sections={len(document.sections)})" ) # Validate inputs - self._validate_input(text, strategy) + self._validate_input(document.raw_markdown, strategy) - # Split into paragraphs and group - segments = self._split_and_group_paragraphs(text, strategy) - - # Create Chunk entities - chunks = self._create_chunks(segments, document_id) + # Choose chunking approach based on strategy and document structure + if strategy.respect_boundaries and document.sections: + # Section-aware chunking: process each section independently + chunks = self._chunk_by_sections(document, strategy) + else: + # Standard chunking: process entire raw_markdown + segments = self._split_and_group_paragraphs(document.raw_markdown, strategy) + chunks = self._create_chunks(segments, document.id) logger.info(f"Created {len(chunks)} paragraph-based chunks") return chunks @@ -78,7 +83,7 @@ class ParagraphChunker(IChunker): except Exception as e: logger.error(f"Paragraph chunking failed: {str(e)}") raise ChunkingError( - message="Failed to chunk text with paragraph strategy", + message="Failed to chunk document with paragraph strategy", details=str(e), strategy_name=self._strategy_name, ) @@ -283,10 +288,55 @@ class ParagraphChunker(IChunker): _, _, prev_end = previous_segment return ([new_paragraph], prev_end, new_para_size) + def _chunk_by_sections( + self, + document: Document, + strategy: ChunkingStrategy, + ) -> List[Chunk]: + """ + Chunk document by processing each section independently. + + This prevents chunks from spanning across section boundaries. + + Args: + document: Document with sections + strategy: Chunking strategy configuration + + Returns: + List of Chunk entities with section metadata + """ + all_chunks = [] + global_sequence = 0 + + for section_index, section in enumerate(document.sections): + # Split this section's content into paragraph-based segments + segments = self._split_and_group_paragraphs(section.content, strategy) + + # Create chunks for this section + for text, start_char, end_char in segments: + chunk = Chunk( + document_id=document.id, + content=text, + sequence_number=global_sequence, + start_char=start_char, + end_char=end_char, + section_title=section.title, + section_index=section_index, + ) + all_chunks.append(chunk) + global_sequence += 1 + + logger.debug( + f"Created {len(all_chunks)} chunks across {len(document.sections)} sections" + ) + return all_chunks + def _create_chunks( self, segments: List[tuple[str, int, int]], - document_id: UUID, + document_id, + section_title: Optional[str] = None, + section_index: Optional[int] = None, ) -> List[Chunk]: """ Create Chunk entities from text segments. @@ -294,6 +344,8 @@ class ParagraphChunker(IChunker): Args: segments: List of (text, start_pos, end_pos) tuples document_id: ID of parent document + section_title: Optional section title + section_index: Optional section index Returns: List of Chunk entities @@ -307,6 +359,8 @@ class ParagraphChunker(IChunker): sequence_number=sequence_number, start_char=start_char, end_char=end_char, + section_title=section_title, + section_index=section_index, ) chunks.append(chunk)