Chunkers: pass the full Document (not just text + document_id) to enable section-aware chunking; add _chunk_by_sections to FixedSizeChunker and ParagraphChunker and propagate section_title/section_index onto created chunks

This commit is contained in:
m.dabbagh 2026-01-08 16:47:50 +03:30
parent 2c375ce6bd
commit f06370e0b9
3 changed files with 159 additions and 55 deletions

View File

@ -6,10 +6,9 @@ This is an ADAPTER that implements the IChunkingContext port from Core.
"""
import logging
from typing import Dict, List
from uuid import UUID
from ....core.domain.exceptions import ChunkingError
from ....core.domain.models import Chunk, ChunkingStrategy
from ....core.domain.models import Chunk, ChunkingStrategy, Document
from ....core.ports.outgoing.chunker import IChunker
from ....core.ports.outgoing.chunking_context import IChunkingContext
@ -46,23 +45,21 @@ class ChunkingContext(IChunkingContext):
def execute_chunking(
self,
text: str,
document_id: UUID,
document: Document,
strategy: ChunkingStrategy,
) -> List[Chunk]:
"""
Execute chunking using the specified strategy.
This method is stateless and thread-safe. It selects the appropriate
chunker based on the strategy configuration for each call.
This method is stateless and thread-safe. It accepts the full
Document object (with sections) to enable section-aware chunking.
Args:
text: Text to chunk
document_id: ID of parent document
document: Full Document entity with raw_markdown and sections
strategy: Chunking strategy configuration (includes strategy_name)
Returns:
List of chunks
List of chunks with section metadata
Raises:
ChunkingError: If strategy is not registered or chunking fails
@ -83,8 +80,7 @@ class ChunkingContext(IChunkingContext):
)
return chunker.chunk(
text=text,
document_id=document_id,
document=document,
strategy=strategy,
)

View File

@ -2,15 +2,14 @@
Fixed Size Chunker - Concrete implementation for fixed-size chunking.
This adapter implements the IChunker port using a fixed-size strategy
with optional overlap and boundary respect.
with optional overlap, boundary respect, and section-aware chunking.
"""
import logging
from typing import List
from uuid import UUID
from typing import List, Optional
from ....core.domain import logic_utils
from ....core.domain.exceptions import ChunkingError, ValidationError
from ....core.domain.models import Chunk, ChunkingStrategy
from ....core.domain.models import Chunk, ChunkingStrategy, Document
from ....core.ports.outgoing.chunker import IChunker
@ -19,12 +18,13 @@ logger = logging.getLogger(__name__)
class FixedSizeChunker(IChunker):
"""
Concrete fixed-size chunker implementation.
Concrete fixed-size chunker implementation with section awareness.
This adapter:
1. Splits text into fixed-size chunks
1. Splits documents into fixed-size chunks
2. Supports overlap between chunks
3. Respects word and sentence boundaries when configured
4. Can process each section independently (section-aware chunking)
"""
def __init__(self) -> None:
@ -34,20 +34,21 @@ class FixedSizeChunker(IChunker):
def chunk(
self,
text: str,
document_id: UUID,
document: Document,
strategy: ChunkingStrategy,
) -> List[Chunk]:
"""
Split text into fixed-size chunks with overlap.
Split document into fixed-size chunks with optional section awareness.
If respect_boundaries is True and document has sections, chunks
will not span across section boundaries.
Args:
text: Text content to chunk
document_id: ID of the parent document
document: Full Document entity with raw_markdown and sections
strategy: Chunking strategy configuration
Returns:
List of Chunk entities
List of Chunk entities with section metadata
Raises:
ChunkingError: If chunking fails
@ -55,18 +56,22 @@ class FixedSizeChunker(IChunker):
"""
try:
logger.info(
f"Chunking text with fixed_size strategy "
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size})"
f"Chunking document with fixed_size strategy "
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size}, "
f"sections={len(document.sections)})"
)
# Validate inputs
self._validate_input(text, strategy)
self._validate_input(document.raw_markdown, strategy)
# Split text into segments
segments = self._split_into_segments(text, strategy)
# Create Chunk entities
chunks = self._create_chunks(segments, document_id)
# Choose chunking approach based on strategy and document structure
if strategy.respect_boundaries and document.sections:
# Section-aware chunking: process each section independently
chunks = self._chunk_by_sections(document, strategy)
else:
# Standard chunking: process entire raw_markdown
segments = self._split_into_segments(document.raw_markdown, strategy)
chunks = self._create_chunks(segments, document.id)
logger.info(f"Created {len(chunks)} fixed-size chunks")
return chunks
@ -78,7 +83,7 @@ class FixedSizeChunker(IChunker):
except Exception as e:
logger.error(f"Fixed-size chunking failed: {str(e)}")
raise ChunkingError(
message="Failed to chunk text with fixed_size strategy",
message="Failed to chunk document with fixed_size strategy",
details=str(e),
strategy_name=self._strategy_name,
)
@ -232,10 +237,55 @@ class FixedSizeChunker(IChunker):
respect_boundary=True,
)
def _chunk_by_sections(
    self,
    document: Document,
    strategy: ChunkingStrategy,
) -> List[Chunk]:
    """
    Produce fixed-size chunks one section at a time.

    Each section's content is segmented independently, so no chunk can
    span a section boundary. Sequence numbers run continuously across
    the whole document, and every chunk carries its section's title and
    positional index.

    NOTE(review): start_char/end_char are taken from segments of each
    section's own content, so they look section-relative rather than
    offsets into document.raw_markdown — confirm downstream consumers
    expect that.

    Args:
        document: Document with sections
        strategy: Chunking strategy configuration

    Returns:
        List of Chunk entities with section metadata
    """
    chunks: List[Chunk] = []
    sequence = 0
    for idx, section in enumerate(document.sections):
        # Segment just this section's text; boundaries cannot leak across.
        for content, start, end in self._split_into_segments(section.content, strategy):
            chunks.append(
                Chunk(
                    document_id=document.id,
                    content=content,
                    sequence_number=sequence,
                    start_char=start,
                    end_char=end,
                    section_title=section.title,
                    section_index=idx,
                )
            )
            sequence += 1
    logger.debug(
        f"Created {len(chunks)} chunks across {len(document.sections)} sections"
    )
    return chunks
def _create_chunks(
self,
segments: List[tuple[str, int, int]],
document_id: UUID,
document_id,
section_title: Optional[str] = None,
section_index: Optional[int] = None,
) -> List[Chunk]:
"""
Create Chunk entities from text segments.
@ -243,6 +293,8 @@ class FixedSizeChunker(IChunker):
Args:
segments: List of (text, start_pos, end_pos) tuples
document_id: ID of parent document
section_title: Optional section title
section_index: Optional section index
Returns:
List of Chunk entities
@ -256,6 +308,8 @@ class FixedSizeChunker(IChunker):
sequence_number=sequence_number,
start_char=start_char,
end_char=end_char,
section_title=section_title,
section_index=section_index,
)
chunks.append(chunk)

View File

@ -2,15 +2,14 @@
Paragraph Chunker - Concrete implementation for paragraph-based chunking.
This adapter implements the IChunker port using a paragraph-respecting
strategy that combines paragraphs to reach target chunk size.
strategy that combines paragraphs to reach target chunk size with section awareness.
"""
import logging
from typing import List
from uuid import UUID
from typing import List, Optional
from ....core.domain import logic_utils
from ....core.domain.exceptions import ChunkingError, ValidationError
from ....core.domain.models import Chunk, ChunkingStrategy
from ....core.domain.models import Chunk, ChunkingStrategy, Document
from ....core.ports.outgoing.chunker import IChunker
@ -19,12 +18,13 @@ logger = logging.getLogger(__name__)
class ParagraphChunker(IChunker):
"""
Concrete paragraph-based chunker implementation.
Concrete paragraph-based chunker implementation with section awareness.
This adapter:
1. Splits text by paragraph boundaries
1. Splits documents by paragraph boundaries
2. Combines paragraphs to reach target chunk size
3. Preserves document structure
4. Can process each section independently (section-aware chunking)
"""
def __init__(self) -> None:
@ -34,20 +34,21 @@ class ParagraphChunker(IChunker):
def chunk(
self,
text: str,
document_id: UUID,
document: Document,
strategy: ChunkingStrategy,
) -> List[Chunk]:
"""
Split text into paragraph-based chunks.
Split document into paragraph-based chunks with optional section awareness.
If respect_boundaries is True and document has sections, chunks
will not span across section boundaries.
Args:
text: Text content to chunk
document_id: ID of the parent document
document: Full Document entity with raw_markdown and sections
strategy: Chunking strategy configuration
Returns:
List of Chunk entities
List of Chunk entities with section metadata
Raises:
ChunkingError: If chunking fails
@ -55,18 +56,22 @@ class ParagraphChunker(IChunker):
"""
try:
logger.info(
f"Chunking text with paragraph strategy "
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size})"
f"Chunking document with paragraph strategy "
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size}, "
f"sections={len(document.sections)})"
)
# Validate inputs
self._validate_input(text, strategy)
self._validate_input(document.raw_markdown, strategy)
# Split into paragraphs and group
segments = self._split_and_group_paragraphs(text, strategy)
# Create Chunk entities
chunks = self._create_chunks(segments, document_id)
# Choose chunking approach based on strategy and document structure
if strategy.respect_boundaries and document.sections:
# Section-aware chunking: process each section independently
chunks = self._chunk_by_sections(document, strategy)
else:
# Standard chunking: process entire raw_markdown
segments = self._split_and_group_paragraphs(document.raw_markdown, strategy)
chunks = self._create_chunks(segments, document.id)
logger.info(f"Created {len(chunks)} paragraph-based chunks")
return chunks
@ -78,7 +83,7 @@ class ParagraphChunker(IChunker):
except Exception as e:
logger.error(f"Paragraph chunking failed: {str(e)}")
raise ChunkingError(
message="Failed to chunk text with paragraph strategy",
message="Failed to chunk document with paragraph strategy",
details=str(e),
strategy_name=self._strategy_name,
)
@ -283,10 +288,55 @@ class ParagraphChunker(IChunker):
_, _, prev_end = previous_segment
return ([new_paragraph], prev_end, new_para_size)
def _chunk_by_sections(
    self,
    document: Document,
    strategy: ChunkingStrategy,
) -> List[Chunk]:
    """
    Produce paragraph-based chunks one section at a time.

    Each section's content is split and grouped into paragraph segments
    independently, so no chunk can span a section boundary. Sequence
    numbers run continuously across the whole document, and every chunk
    carries its section's title and positional index.

    NOTE(review): start_char/end_char are taken from segments of each
    section's own content, so they look section-relative rather than
    offsets into document.raw_markdown — confirm downstream consumers
    expect that.

    Args:
        document: Document with sections
        strategy: Chunking strategy configuration

    Returns:
        List of Chunk entities with section metadata
    """
    chunks: List[Chunk] = []
    sequence = 0
    for idx, section in enumerate(document.sections):
        # Paragraph-group just this section's text; boundaries cannot leak across.
        for content, start, end in self._split_and_group_paragraphs(section.content, strategy):
            chunks.append(
                Chunk(
                    document_id=document.id,
                    content=content,
                    sequence_number=sequence,
                    start_char=start,
                    end_char=end,
                    section_title=section.title,
                    section_index=idx,
                )
            )
            sequence += 1
    logger.debug(
        f"Created {len(chunks)} chunks across {len(document.sections)} sections"
    )
    return chunks
def _create_chunks(
self,
segments: List[tuple[str, int, int]],
document_id: UUID,
document_id,
section_title: Optional[str] = None,
section_index: Optional[int] = None,
) -> List[Chunk]:
"""
Create Chunk entities from text segments.
@ -294,6 +344,8 @@ class ParagraphChunker(IChunker):
Args:
segments: List of (text, start_pos, end_pos) tuples
document_id: ID of parent document
section_title: Optional section title
section_index: Optional section index
Returns:
List of Chunk entities
@ -307,6 +359,8 @@ class ParagraphChunker(IChunker):
sequence_number=sequence_number,
start_char=start_char,
end_char=end_char,
section_title=section_title,
section_index=section_index,
)
chunks.append(chunk)