some fixes in concrete implementations of chunkers
This commit is contained in:
parent
2c375ce6bd
commit
f06370e0b9
@ -6,10 +6,9 @@ This is an ADAPTER that implements the IChunkingContext port from Core.
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, List
|
||||
from uuid import UUID
|
||||
|
||||
from ....core.domain.exceptions import ChunkingError
|
||||
from ....core.domain.models import Chunk, ChunkingStrategy
|
||||
from ....core.domain.models import Chunk, ChunkingStrategy, Document
|
||||
from ....core.ports.outgoing.chunker import IChunker
|
||||
from ....core.ports.outgoing.chunking_context import IChunkingContext
|
||||
|
||||
@ -46,23 +45,21 @@ class ChunkingContext(IChunkingContext):
|
||||
|
||||
def execute_chunking(
|
||||
self,
|
||||
text: str,
|
||||
document_id: UUID,
|
||||
document: Document,
|
||||
strategy: ChunkingStrategy,
|
||||
) -> List[Chunk]:
|
||||
"""
|
||||
Execute chunking using the specified strategy.
|
||||
|
||||
This method is stateless and thread-safe. It selects the appropriate
|
||||
chunker based on the strategy configuration for each call.
|
||||
This method is stateless and thread-safe. It accepts the full
|
||||
Document object (with sections) to enable section-aware chunking.
|
||||
|
||||
Args:
|
||||
text: Text to chunk
|
||||
document_id: ID of parent document
|
||||
document: Full Document entity with raw_markdown and sections
|
||||
strategy: Chunking strategy configuration (includes strategy_name)
|
||||
|
||||
Returns:
|
||||
List of chunks
|
||||
List of chunks with section metadata
|
||||
|
||||
Raises:
|
||||
ChunkingError: If strategy is not registered or chunking fails
|
||||
@ -83,8 +80,7 @@ class ChunkingContext(IChunkingContext):
|
||||
)
|
||||
|
||||
return chunker.chunk(
|
||||
text=text,
|
||||
document_id=document_id,
|
||||
document=document,
|
||||
strategy=strategy,
|
||||
)
|
||||
|
||||
|
||||
@ -2,15 +2,14 @@
|
||||
Fixed Size Chunker - Concrete implementation for fixed-size chunking.
|
||||
|
||||
This adapter implements the IChunker port using a fixed-size strategy
|
||||
with optional overlap and boundary respect.
|
||||
with optional overlap, boundary respect, and section-aware chunking.
|
||||
"""
|
||||
import logging
|
||||
from typing import List
|
||||
from uuid import UUID
|
||||
from typing import List, Optional
|
||||
|
||||
from ....core.domain import logic_utils
|
||||
from ....core.domain.exceptions import ChunkingError, ValidationError
|
||||
from ....core.domain.models import Chunk, ChunkingStrategy
|
||||
from ....core.domain.models import Chunk, ChunkingStrategy, Document
|
||||
from ....core.ports.outgoing.chunker import IChunker
|
||||
|
||||
|
||||
@ -19,12 +18,13 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class FixedSizeChunker(IChunker):
|
||||
"""
|
||||
Concrete fixed-size chunker implementation.
|
||||
Concrete fixed-size chunker implementation with section awareness.
|
||||
|
||||
This adapter:
|
||||
1. Splits text into fixed-size chunks
|
||||
1. Splits documents into fixed-size chunks
|
||||
2. Supports overlap between chunks
|
||||
3. Respects word and sentence boundaries when configured
|
||||
4. Can process each section independently (section-aware chunking)
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
@ -34,20 +34,21 @@ class FixedSizeChunker(IChunker):
|
||||
|
||||
def chunk(
|
||||
self,
|
||||
text: str,
|
||||
document_id: UUID,
|
||||
document: Document,
|
||||
strategy: ChunkingStrategy,
|
||||
) -> List[Chunk]:
|
||||
"""
|
||||
Split text into fixed-size chunks with overlap.
|
||||
Split document into fixed-size chunks with optional section awareness.
|
||||
|
||||
If respect_boundaries is True and document has sections, chunks
|
||||
will not span across section boundaries.
|
||||
|
||||
Args:
|
||||
text: Text content to chunk
|
||||
document_id: ID of the parent document
|
||||
document: Full Document entity with raw_markdown and sections
|
||||
strategy: Chunking strategy configuration
|
||||
|
||||
Returns:
|
||||
List of Chunk entities
|
||||
List of Chunk entities with section metadata
|
||||
|
||||
Raises:
|
||||
ChunkingError: If chunking fails
|
||||
@ -55,18 +56,22 @@ class FixedSizeChunker(IChunker):
|
||||
"""
|
||||
try:
|
||||
logger.info(
|
||||
f"Chunking text with fixed_size strategy "
|
||||
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size})"
|
||||
f"Chunking document with fixed_size strategy "
|
||||
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size}, "
|
||||
f"sections={len(document.sections)})"
|
||||
)
|
||||
|
||||
# Validate inputs
|
||||
self._validate_input(text, strategy)
|
||||
self._validate_input(document.raw_markdown, strategy)
|
||||
|
||||
# Split text into segments
|
||||
segments = self._split_into_segments(text, strategy)
|
||||
|
||||
# Create Chunk entities
|
||||
chunks = self._create_chunks(segments, document_id)
|
||||
# Choose chunking approach based on strategy and document structure
|
||||
if strategy.respect_boundaries and document.sections:
|
||||
# Section-aware chunking: process each section independently
|
||||
chunks = self._chunk_by_sections(document, strategy)
|
||||
else:
|
||||
# Standard chunking: process entire raw_markdown
|
||||
segments = self._split_into_segments(document.raw_markdown, strategy)
|
||||
chunks = self._create_chunks(segments, document.id)
|
||||
|
||||
logger.info(f"Created {len(chunks)} fixed-size chunks")
|
||||
return chunks
|
||||
@ -78,7 +83,7 @@ class FixedSizeChunker(IChunker):
|
||||
except Exception as e:
|
||||
logger.error(f"Fixed-size chunking failed: {str(e)}")
|
||||
raise ChunkingError(
|
||||
message="Failed to chunk text with fixed_size strategy",
|
||||
message="Failed to chunk document with fixed_size strategy",
|
||||
details=str(e),
|
||||
strategy_name=self._strategy_name,
|
||||
)
|
||||
@ -232,10 +237,55 @@ class FixedSizeChunker(IChunker):
|
||||
respect_boundary=True,
|
||||
)
|
||||
|
||||
def _chunk_by_sections(
    self,
    document: Document,
    strategy: ChunkingStrategy,
) -> List[Chunk]:
    """
    Build fixed-size chunks one section at a time.

    Every section's ``content`` is split on its own with
    ``_split_into_segments``, so no resulting chunk can contain text from
    two different sections. Each chunk is tagged with the title of its
    section and the section's position in ``document.sections``.
    Sequence numbers are assigned continuously across all sections,
    starting at 0.

    NOTE(review): start_char/end_char are forwarded exactly as returned by
    the splitter for ``section.content``, so they look section-relative
    rather than document-relative -- confirm against consumers.

    Args:
        document: Document whose ``sections`` are chunked in order
        strategy: Chunking strategy configuration passed to the splitter

    Returns:
        List of Chunk entities carrying section_title and section_index
    """
    collected: List[Chunk] = []

    for position, section in enumerate(document.sections):
        # Split only this section's text so boundaries are never crossed.
        pieces = self._split_into_segments(section.content, strategy)

        for piece, begin, finish in pieces:
            collected.append(
                Chunk(
                    document_id=document.id,
                    content=piece,
                    # Chunks gathered so far == next document-wide sequence number.
                    sequence_number=len(collected),
                    start_char=begin,
                    end_char=finish,
                    section_title=section.title,
                    section_index=position,
                )
            )

    logger.debug(
        f"Created {len(collected)} chunks across {len(document.sections)} sections"
    )
    return collected
|
||||
|
||||
def _create_chunks(
|
||||
self,
|
||||
segments: List[tuple[str, int, int]],
|
||||
document_id: UUID,
|
||||
document_id,
|
||||
section_title: Optional[str] = None,
|
||||
section_index: Optional[int] = None,
|
||||
) -> List[Chunk]:
|
||||
"""
|
||||
Create Chunk entities from text segments.
|
||||
@ -243,6 +293,8 @@ class FixedSizeChunker(IChunker):
|
||||
Args:
|
||||
segments: List of (text, start_pos, end_pos) tuples
|
||||
document_id: ID of parent document
|
||||
section_title: Optional section title
|
||||
section_index: Optional section index
|
||||
|
||||
Returns:
|
||||
List of Chunk entities
|
||||
@ -256,6 +308,8 @@ class FixedSizeChunker(IChunker):
|
||||
sequence_number=sequence_number,
|
||||
start_char=start_char,
|
||||
end_char=end_char,
|
||||
section_title=section_title,
|
||||
section_index=section_index,
|
||||
)
|
||||
chunks.append(chunk)
|
||||
|
||||
|
||||
@ -2,15 +2,14 @@
|
||||
Paragraph Chunker - Concrete implementation for paragraph-based chunking.
|
||||
|
||||
This adapter implements the IChunker port using a paragraph-respecting
|
||||
strategy that combines paragraphs to reach target chunk size.
|
||||
strategy that combines paragraphs to reach target chunk size with section awareness.
|
||||
"""
|
||||
import logging
|
||||
from typing import List
|
||||
from uuid import UUID
|
||||
from typing import List, Optional
|
||||
|
||||
from ....core.domain import logic_utils
|
||||
from ....core.domain.exceptions import ChunkingError, ValidationError
|
||||
from ....core.domain.models import Chunk, ChunkingStrategy
|
||||
from ....core.domain.models import Chunk, ChunkingStrategy, Document
|
||||
from ....core.ports.outgoing.chunker import IChunker
|
||||
|
||||
|
||||
@ -19,12 +18,13 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class ParagraphChunker(IChunker):
|
||||
"""
|
||||
Concrete paragraph-based chunker implementation.
|
||||
Concrete paragraph-based chunker implementation with section awareness.
|
||||
|
||||
This adapter:
|
||||
1. Splits text by paragraph boundaries
|
||||
1. Splits documents by paragraph boundaries
|
||||
2. Combines paragraphs to reach target chunk size
|
||||
3. Preserves document structure
|
||||
4. Can process each section independently (section-aware chunking)
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
@ -34,20 +34,21 @@ class ParagraphChunker(IChunker):
|
||||
|
||||
def chunk(
|
||||
self,
|
||||
text: str,
|
||||
document_id: UUID,
|
||||
document: Document,
|
||||
strategy: ChunkingStrategy,
|
||||
) -> List[Chunk]:
|
||||
"""
|
||||
Split text into paragraph-based chunks.
|
||||
Split document into paragraph-based chunks with optional section awareness.
|
||||
|
||||
If respect_boundaries is True and document has sections, chunks
|
||||
will not span across section boundaries.
|
||||
|
||||
Args:
|
||||
text: Text content to chunk
|
||||
document_id: ID of the parent document
|
||||
document: Full Document entity with raw_markdown and sections
|
||||
strategy: Chunking strategy configuration
|
||||
|
||||
Returns:
|
||||
List of Chunk entities
|
||||
List of Chunk entities with section metadata
|
||||
|
||||
Raises:
|
||||
ChunkingError: If chunking fails
|
||||
@ -55,18 +56,22 @@ class ParagraphChunker(IChunker):
|
||||
"""
|
||||
try:
|
||||
logger.info(
|
||||
f"Chunking text with paragraph strategy "
|
||||
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size})"
|
||||
f"Chunking document with paragraph strategy "
|
||||
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size}, "
|
||||
f"sections={len(document.sections)})"
|
||||
)
|
||||
|
||||
# Validate inputs
|
||||
self._validate_input(text, strategy)
|
||||
self._validate_input(document.raw_markdown, strategy)
|
||||
|
||||
# Split into paragraphs and group
|
||||
segments = self._split_and_group_paragraphs(text, strategy)
|
||||
|
||||
# Create Chunk entities
|
||||
chunks = self._create_chunks(segments, document_id)
|
||||
# Choose chunking approach based on strategy and document structure
|
||||
if strategy.respect_boundaries and document.sections:
|
||||
# Section-aware chunking: process each section independently
|
||||
chunks = self._chunk_by_sections(document, strategy)
|
||||
else:
|
||||
# Standard chunking: process entire raw_markdown
|
||||
segments = self._split_and_group_paragraphs(document.raw_markdown, strategy)
|
||||
chunks = self._create_chunks(segments, document.id)
|
||||
|
||||
logger.info(f"Created {len(chunks)} paragraph-based chunks")
|
||||
return chunks
|
||||
@ -78,7 +83,7 @@ class ParagraphChunker(IChunker):
|
||||
except Exception as e:
|
||||
logger.error(f"Paragraph chunking failed: {str(e)}")
|
||||
raise ChunkingError(
|
||||
message="Failed to chunk text with paragraph strategy",
|
||||
message="Failed to chunk document with paragraph strategy",
|
||||
details=str(e),
|
||||
strategy_name=self._strategy_name,
|
||||
)
|
||||
@ -283,10 +288,55 @@ class ParagraphChunker(IChunker):
|
||||
_, _, prev_end = previous_segment
|
||||
return ([new_paragraph], prev_end, new_para_size)
|
||||
|
||||
def _chunk_by_sections(
    self,
    document: Document,
    strategy: ChunkingStrategy,
) -> List[Chunk]:
    """
    Build paragraph-based chunks one section at a time.

    Every section's ``content`` is passed on its own to
    ``_split_and_group_paragraphs``, so no resulting chunk can contain
    text from two different sections. Each chunk is tagged with the title
    of its section and the section's position in ``document.sections``.
    Sequence numbers are assigned continuously across all sections,
    starting at 0.

    NOTE(review): start_char/end_char are forwarded exactly as returned by
    the splitter for ``section.content``, so they look section-relative
    rather than document-relative -- confirm against consumers.

    Args:
        document: Document whose ``sections`` are chunked in order
        strategy: Chunking strategy configuration passed to the splitter

    Returns:
        List of Chunk entities carrying section_title and section_index
    """
    collected: List[Chunk] = []

    for position, section in enumerate(document.sections):
        # Group paragraphs within this section only.
        grouped = self._split_and_group_paragraphs(section.content, strategy)

        for body, begin, finish in grouped:
            collected.append(
                Chunk(
                    document_id=document.id,
                    content=body,
                    # Chunks gathered so far == next document-wide sequence number.
                    sequence_number=len(collected),
                    start_char=begin,
                    end_char=finish,
                    section_title=section.title,
                    section_index=position,
                )
            )

    logger.debug(
        f"Created {len(collected)} chunks across {len(document.sections)} sections"
    )
    return collected
|
||||
|
||||
def _create_chunks(
|
||||
self,
|
||||
segments: List[tuple[str, int, int]],
|
||||
document_id: UUID,
|
||||
document_id,
|
||||
section_title: Optional[str] = None,
|
||||
section_index: Optional[int] = None,
|
||||
) -> List[Chunk]:
|
||||
"""
|
||||
Create Chunk entities from text segments.
|
||||
@ -294,6 +344,8 @@ class ParagraphChunker(IChunker):
|
||||
Args:
|
||||
segments: List of (text, start_pos, end_pos) tuples
|
||||
document_id: ID of parent document
|
||||
section_title: Optional section title
|
||||
section_index: Optional section index
|
||||
|
||||
Returns:
|
||||
List of Chunk entities
|
||||
@ -307,6 +359,8 @@ class ParagraphChunker(IChunker):
|
||||
sequence_number=sequence_number,
|
||||
start_char=start_char,
|
||||
end_char=end_char,
|
||||
section_title=section_title,
|
||||
section_index=section_index,
|
||||
)
|
||||
chunks.append(chunk)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user