Fix concrete chunker implementations: accept the full Document and add section-aware chunking
This commit is contained in:
parent
2c375ce6bd
commit
f06370e0b9
@ -6,10 +6,9 @@ This is an ADAPTER that implements the IChunkingContext port from Core.
|
|||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
from ....core.domain.exceptions import ChunkingError
|
from ....core.domain.exceptions import ChunkingError
|
||||||
from ....core.domain.models import Chunk, ChunkingStrategy
|
from ....core.domain.models import Chunk, ChunkingStrategy, Document
|
||||||
from ....core.ports.outgoing.chunker import IChunker
|
from ....core.ports.outgoing.chunker import IChunker
|
||||||
from ....core.ports.outgoing.chunking_context import IChunkingContext
|
from ....core.ports.outgoing.chunking_context import IChunkingContext
|
||||||
|
|
||||||
@ -46,23 +45,21 @@ class ChunkingContext(IChunkingContext):
|
|||||||
|
|
||||||
def execute_chunking(
|
def execute_chunking(
|
||||||
self,
|
self,
|
||||||
text: str,
|
document: Document,
|
||||||
document_id: UUID,
|
|
||||||
strategy: ChunkingStrategy,
|
strategy: ChunkingStrategy,
|
||||||
) -> List[Chunk]:
|
) -> List[Chunk]:
|
||||||
"""
|
"""
|
||||||
Execute chunking using the specified strategy.
|
Execute chunking using the specified strategy.
|
||||||
|
|
||||||
This method is stateless and thread-safe. It selects the appropriate
|
This method is stateless and thread-safe. It accepts the full
|
||||||
chunker based on the strategy configuration for each call.
|
Document object (with sections) to enable section-aware chunking.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: Text to chunk
|
document: Full Document entity with raw_markdown and sections
|
||||||
document_id: ID of parent document
|
|
||||||
strategy: Chunking strategy configuration (includes strategy_name)
|
strategy: Chunking strategy configuration (includes strategy_name)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of chunks
|
List of chunks with section metadata
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
ChunkingError: If strategy is not registered or chunking fails
|
ChunkingError: If strategy is not registered or chunking fails
|
||||||
@ -83,8 +80,7 @@ class ChunkingContext(IChunkingContext):
|
|||||||
)
|
)
|
||||||
|
|
||||||
return chunker.chunk(
|
return chunker.chunk(
|
||||||
text=text,
|
document=document,
|
||||||
document_id=document_id,
|
|
||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -2,15 +2,14 @@
|
|||||||
Fixed Size Chunker - Concrete implementation for fixed-size chunking.
|
Fixed Size Chunker - Concrete implementation for fixed-size chunking.
|
||||||
|
|
||||||
This adapter implements the IChunker port using a fixed-size strategy
|
This adapter implements the IChunker port using a fixed-size strategy
|
||||||
with optional overlap and boundary respect.
|
with optional overlap, boundary respect, and section-aware chunking.
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
from typing import List
|
from typing import List, Optional
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
from ....core.domain import logic_utils
|
from ....core.domain import logic_utils
|
||||||
from ....core.domain.exceptions import ChunkingError, ValidationError
|
from ....core.domain.exceptions import ChunkingError, ValidationError
|
||||||
from ....core.domain.models import Chunk, ChunkingStrategy
|
from ....core.domain.models import Chunk, ChunkingStrategy, Document
|
||||||
from ....core.ports.outgoing.chunker import IChunker
|
from ....core.ports.outgoing.chunker import IChunker
|
||||||
|
|
||||||
|
|
||||||
@ -19,12 +18,13 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class FixedSizeChunker(IChunker):
|
class FixedSizeChunker(IChunker):
|
||||||
"""
|
"""
|
||||||
Concrete fixed-size chunker implementation.
|
Concrete fixed-size chunker implementation with section awareness.
|
||||||
|
|
||||||
This adapter:
|
This adapter:
|
||||||
1. Splits text into fixed-size chunks
|
1. Splits documents into fixed-size chunks
|
||||||
2. Supports overlap between chunks
|
2. Supports overlap between chunks
|
||||||
3. Respects word and sentence boundaries when configured
|
3. Respects word and sentence boundaries when configured
|
||||||
|
4. Can process each section independently (section-aware chunking)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
@ -34,20 +34,21 @@ class FixedSizeChunker(IChunker):
|
|||||||
|
|
||||||
def chunk(
|
def chunk(
|
||||||
self,
|
self,
|
||||||
text: str,
|
document: Document,
|
||||||
document_id: UUID,
|
|
||||||
strategy: ChunkingStrategy,
|
strategy: ChunkingStrategy,
|
||||||
) -> List[Chunk]:
|
) -> List[Chunk]:
|
||||||
"""
|
"""
|
||||||
Split text into fixed-size chunks with overlap.
|
Split document into fixed-size chunks with optional section awareness.
|
||||||
|
|
||||||
|
If respect_boundaries is True and document has sections, chunks
|
||||||
|
will not span across section boundaries.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: Text content to chunk
|
document: Full Document entity with raw_markdown and sections
|
||||||
document_id: ID of the parent document
|
|
||||||
strategy: Chunking strategy configuration
|
strategy: Chunking strategy configuration
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of Chunk entities
|
List of Chunk entities with section metadata
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
ChunkingError: If chunking fails
|
ChunkingError: If chunking fails
|
||||||
@ -55,18 +56,22 @@ class FixedSizeChunker(IChunker):
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Chunking text with fixed_size strategy "
|
f"Chunking document with fixed_size strategy "
|
||||||
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size})"
|
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size}, "
|
||||||
|
f"sections={len(document.sections)})"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Validate inputs
|
# Validate inputs
|
||||||
self._validate_input(text, strategy)
|
self._validate_input(document.raw_markdown, strategy)
|
||||||
|
|
||||||
# Split text into segments
|
# Choose chunking approach based on strategy and document structure
|
||||||
segments = self._split_into_segments(text, strategy)
|
if strategy.respect_boundaries and document.sections:
|
||||||
|
# Section-aware chunking: process each section independently
|
||||||
# Create Chunk entities
|
chunks = self._chunk_by_sections(document, strategy)
|
||||||
chunks = self._create_chunks(segments, document_id)
|
else:
|
||||||
|
# Standard chunking: process entire raw_markdown
|
||||||
|
segments = self._split_into_segments(document.raw_markdown, strategy)
|
||||||
|
chunks = self._create_chunks(segments, document.id)
|
||||||
|
|
||||||
logger.info(f"Created {len(chunks)} fixed-size chunks")
|
logger.info(f"Created {len(chunks)} fixed-size chunks")
|
||||||
return chunks
|
return chunks
|
||||||
@ -78,7 +83,7 @@ class FixedSizeChunker(IChunker):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Fixed-size chunking failed: {str(e)}")
|
logger.error(f"Fixed-size chunking failed: {str(e)}")
|
||||||
raise ChunkingError(
|
raise ChunkingError(
|
||||||
message="Failed to chunk text with fixed_size strategy",
|
message="Failed to chunk document with fixed_size strategy",
|
||||||
details=str(e),
|
details=str(e),
|
||||||
strategy_name=self._strategy_name,
|
strategy_name=self._strategy_name,
|
||||||
)
|
)
|
||||||
@ -232,10 +237,55 @@ class FixedSizeChunker(IChunker):
|
|||||||
respect_boundary=True,
|
respect_boundary=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _chunk_by_sections(
    self,
    document: Document,
    strategy: ChunkingStrategy,
) -> List[Chunk]:
    """
    Build chunks one section at a time so no chunk crosses a section boundary.

    Every entry of ``document.sections`` is split on its own with
    ``_split_into_segments``; each resulting segment becomes one Chunk
    tagged with the title and index of the section it came from.
    Sequence numbers keep counting across sections (0, 1, 2, ...) in
    section order.

    NOTE(review): start_char/end_char are passed through unchanged from
    the splitter, which only receives ``section.content`` -- so they look
    like offsets within the section rather than within the whole
    document. Confirm that downstream consumers expect that.

    Args:
        document: Document whose ``sections`` are chunked independently
        strategy: Chunking strategy configuration forwarded to the splitter

    Returns:
        List of Chunk entities carrying section_title and section_index
    """
    all_chunks = []

    for position, current_section in enumerate(document.sections):
        # Fixed-size split restricted to this section's own content
        section_segments = self._split_into_segments(current_section.content, strategy)

        for segment_text, segment_start, segment_end in section_segments:
            # The document-wide sequence number is simply the number of
            # chunks collected so far, so it never restarts per section.
            all_chunks.append(
                Chunk(
                    document_id=document.id,
                    content=segment_text,
                    sequence_number=len(all_chunks),
                    start_char=segment_start,
                    end_char=segment_end,
                    section_title=current_section.title,
                    section_index=position,
                )
            )

    logger.debug(
        f"Created {len(all_chunks)} chunks across {len(document.sections)} sections"
    )
    return all_chunks
|
||||||
|
|
||||||
def _create_chunks(
|
def _create_chunks(
|
||||||
self,
|
self,
|
||||||
segments: List[tuple[str, int, int]],
|
segments: List[tuple[str, int, int]],
|
||||||
document_id: UUID,
|
document_id,
|
||||||
|
section_title: Optional[str] = None,
|
||||||
|
section_index: Optional[int] = None,
|
||||||
) -> List[Chunk]:
|
) -> List[Chunk]:
|
||||||
"""
|
"""
|
||||||
Create Chunk entities from text segments.
|
Create Chunk entities from text segments.
|
||||||
@ -243,6 +293,8 @@ class FixedSizeChunker(IChunker):
|
|||||||
Args:
|
Args:
|
||||||
segments: List of (text, start_pos, end_pos) tuples
|
segments: List of (text, start_pos, end_pos) tuples
|
||||||
document_id: ID of parent document
|
document_id: ID of parent document
|
||||||
|
section_title: Optional section title
|
||||||
|
section_index: Optional section index
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of Chunk entities
|
List of Chunk entities
|
||||||
@ -256,6 +308,8 @@ class FixedSizeChunker(IChunker):
|
|||||||
sequence_number=sequence_number,
|
sequence_number=sequence_number,
|
||||||
start_char=start_char,
|
start_char=start_char,
|
||||||
end_char=end_char,
|
end_char=end_char,
|
||||||
|
section_title=section_title,
|
||||||
|
section_index=section_index,
|
||||||
)
|
)
|
||||||
chunks.append(chunk)
|
chunks.append(chunk)
|
||||||
|
|
||||||
|
|||||||
@ -2,15 +2,14 @@
|
|||||||
Paragraph Chunker - Concrete implementation for paragraph-based chunking.
|
Paragraph Chunker - Concrete implementation for paragraph-based chunking.
|
||||||
|
|
||||||
This adapter implements the IChunker port using a paragraph-respecting
|
This adapter implements the IChunker port using a paragraph-respecting
|
||||||
strategy that combines paragraphs to reach target chunk size.
|
strategy that combines paragraphs to reach target chunk size with section awareness.
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
from typing import List
|
from typing import List, Optional
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
from ....core.domain import logic_utils
|
from ....core.domain import logic_utils
|
||||||
from ....core.domain.exceptions import ChunkingError, ValidationError
|
from ....core.domain.exceptions import ChunkingError, ValidationError
|
||||||
from ....core.domain.models import Chunk, ChunkingStrategy
|
from ....core.domain.models import Chunk, ChunkingStrategy, Document
|
||||||
from ....core.ports.outgoing.chunker import IChunker
|
from ....core.ports.outgoing.chunker import IChunker
|
||||||
|
|
||||||
|
|
||||||
@ -19,12 +18,13 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class ParagraphChunker(IChunker):
|
class ParagraphChunker(IChunker):
|
||||||
"""
|
"""
|
||||||
Concrete paragraph-based chunker implementation.
|
Concrete paragraph-based chunker implementation with section awareness.
|
||||||
|
|
||||||
This adapter:
|
This adapter:
|
||||||
1. Splits text by paragraph boundaries
|
1. Splits documents by paragraph boundaries
|
||||||
2. Combines paragraphs to reach target chunk size
|
2. Combines paragraphs to reach target chunk size
|
||||||
3. Preserves document structure
|
3. Preserves document structure
|
||||||
|
4. Can process each section independently (section-aware chunking)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
@ -34,20 +34,21 @@ class ParagraphChunker(IChunker):
|
|||||||
|
|
||||||
def chunk(
|
def chunk(
|
||||||
self,
|
self,
|
||||||
text: str,
|
document: Document,
|
||||||
document_id: UUID,
|
|
||||||
strategy: ChunkingStrategy,
|
strategy: ChunkingStrategy,
|
||||||
) -> List[Chunk]:
|
) -> List[Chunk]:
|
||||||
"""
|
"""
|
||||||
Split text into paragraph-based chunks.
|
Split document into paragraph-based chunks with optional section awareness.
|
||||||
|
|
||||||
|
If respect_boundaries is True and document has sections, chunks
|
||||||
|
will not span across section boundaries.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: Text content to chunk
|
document: Full Document entity with raw_markdown and sections
|
||||||
document_id: ID of the parent document
|
|
||||||
strategy: Chunking strategy configuration
|
strategy: Chunking strategy configuration
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of Chunk entities
|
List of Chunk entities with section metadata
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
ChunkingError: If chunking fails
|
ChunkingError: If chunking fails
|
||||||
@ -55,18 +56,22 @@ class ParagraphChunker(IChunker):
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Chunking text with paragraph strategy "
|
f"Chunking document with paragraph strategy "
|
||||||
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size})"
|
f"(size={strategy.chunk_size}, overlap={strategy.overlap_size}, "
|
||||||
|
f"sections={len(document.sections)})"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Validate inputs
|
# Validate inputs
|
||||||
self._validate_input(text, strategy)
|
self._validate_input(document.raw_markdown, strategy)
|
||||||
|
|
||||||
# Split into paragraphs and group
|
# Choose chunking approach based on strategy and document structure
|
||||||
segments = self._split_and_group_paragraphs(text, strategy)
|
if strategy.respect_boundaries and document.sections:
|
||||||
|
# Section-aware chunking: process each section independently
|
||||||
# Create Chunk entities
|
chunks = self._chunk_by_sections(document, strategy)
|
||||||
chunks = self._create_chunks(segments, document_id)
|
else:
|
||||||
|
# Standard chunking: process entire raw_markdown
|
||||||
|
segments = self._split_and_group_paragraphs(document.raw_markdown, strategy)
|
||||||
|
chunks = self._create_chunks(segments, document.id)
|
||||||
|
|
||||||
logger.info(f"Created {len(chunks)} paragraph-based chunks")
|
logger.info(f"Created {len(chunks)} paragraph-based chunks")
|
||||||
return chunks
|
return chunks
|
||||||
@ -78,7 +83,7 @@ class ParagraphChunker(IChunker):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Paragraph chunking failed: {str(e)}")
|
logger.error(f"Paragraph chunking failed: {str(e)}")
|
||||||
raise ChunkingError(
|
raise ChunkingError(
|
||||||
message="Failed to chunk text with paragraph strategy",
|
message="Failed to chunk document with paragraph strategy",
|
||||||
details=str(e),
|
details=str(e),
|
||||||
strategy_name=self._strategy_name,
|
strategy_name=self._strategy_name,
|
||||||
)
|
)
|
||||||
@ -283,10 +288,55 @@ class ParagraphChunker(IChunker):
|
|||||||
_, _, prev_end = previous_segment
|
_, _, prev_end = previous_segment
|
||||||
return ([new_paragraph], prev_end, new_para_size)
|
return ([new_paragraph], prev_end, new_para_size)
|
||||||
|
|
||||||
|
def _chunk_by_sections(
    self,
    document: Document,
    strategy: ChunkingStrategy,
) -> List[Chunk]:
    """
    Build paragraph-based chunks one section at a time.

    Every entry of ``document.sections`` is passed on its own through
    ``_split_and_group_paragraphs``, so grouped paragraphs never cross a
    section boundary. Each resulting segment becomes one Chunk tagged
    with the title and index of its section. Sequence numbers keep
    counting across sections (0, 1, 2, ...) in section order.

    NOTE(review): start_char/end_char are passed through unchanged from
    the grouping helper, which only receives ``section.content`` -- so
    they look like offsets within the section rather than within the
    whole document. Confirm that downstream consumers expect that.

    Args:
        document: Document whose ``sections`` are chunked independently
        strategy: Chunking strategy configuration forwarded to the helper

    Returns:
        List of Chunk entities carrying section_title and section_index
    """
    all_chunks = []

    for position, current_section in enumerate(document.sections):
        # Paragraph grouping restricted to this section's own content
        section_segments = self._split_and_group_paragraphs(current_section.content, strategy)

        for segment_text, segment_start, segment_end in section_segments:
            # The document-wide sequence number is simply the number of
            # chunks collected so far, so it never restarts per section.
            all_chunks.append(
                Chunk(
                    document_id=document.id,
                    content=segment_text,
                    sequence_number=len(all_chunks),
                    start_char=segment_start,
                    end_char=segment_end,
                    section_title=current_section.title,
                    section_index=position,
                )
            )

    logger.debug(
        f"Created {len(all_chunks)} chunks across {len(document.sections)} sections"
    )
    return all_chunks
|
||||||
|
|
||||||
def _create_chunks(
|
def _create_chunks(
|
||||||
self,
|
self,
|
||||||
segments: List[tuple[str, int, int]],
|
segments: List[tuple[str, int, int]],
|
||||||
document_id: UUID,
|
document_id,
|
||||||
|
section_title: Optional[str] = None,
|
||||||
|
section_index: Optional[int] = None,
|
||||||
) -> List[Chunk]:
|
) -> List[Chunk]:
|
||||||
"""
|
"""
|
||||||
Create Chunk entities from text segments.
|
Create Chunk entities from text segments.
|
||||||
@ -294,6 +344,8 @@ class ParagraphChunker(IChunker):
|
|||||||
Args:
|
Args:
|
||||||
segments: List of (text, start_pos, end_pos) tuples
|
segments: List of (text, start_pos, end_pos) tuples
|
||||||
document_id: ID of parent document
|
document_id: ID of parent document
|
||||||
|
section_title: Optional section title
|
||||||
|
section_index: Optional section index
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of Chunk entities
|
List of Chunk entities
|
||||||
@ -307,6 +359,8 @@ class ParagraphChunker(IChunker):
|
|||||||
sequence_number=sequence_number,
|
sequence_number=sequence_number,
|
||||||
start_char=start_char,
|
start_char=start_char,
|
||||||
end_char=end_char,
|
end_char=end_char,
|
||||||
|
section_title=section_title,
|
||||||
|
section_index=section_index,
|
||||||
)
|
)
|
||||||
chunks.append(chunk)
|
chunks.append(chunk)
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user