Add document title and section title to the beginning of each chunk in the paragraph chunker; drop start_char/end_char position tracking from Chunk

This commit is contained in:
m.dabbagh 2026-01-25 11:32:35 +03:30
parent cda128e438
commit 9e1e49bc59
5 changed files with 43 additions and 93 deletions

View File

@ -216,8 +216,6 @@ def to_chunk_responses(chunks: List[Chunk]) -> List[ChunkResponse]:
document_id=str(chunk.document_id),
content=chunk.content,
sequence_number=chunk.sequence_number,
start_char=chunk.start_char,
end_char=chunk.end_char,
length=chunk.get_length(),
)
for chunk in chunks

View File

@ -101,8 +101,6 @@ class ChunkResponse(BaseModel):
document_id: str
content: str
sequence_number: int
start_char: int
end_char: int
length: int

View File

@ -70,8 +70,8 @@ class FixedSizeChunker(IChunker):
chunks = self._chunk_by_sections(document, strategy)
else:
# Standard chunking: process entire raw_markdown
segments = self._split_into_segments(document.raw_markdown, strategy)
chunks = self._create_chunks(segments, document.id)
chunk_texts = self._split_into_segments(document.raw_markdown, strategy)
chunks = self._create_chunks(chunk_texts, document.id)
logger.info(f"Created {len(chunks)} fixed-size chunks")
return chunks
@ -136,7 +136,7 @@ class FixedSizeChunker(IChunker):
self,
text: str,
strategy: ChunkingStrategy,
) -> List[tuple[str, int, int]]:
) -> List[str]:
"""
Split text into fixed-size segments.
@ -145,7 +145,7 @@ class FixedSizeChunker(IChunker):
strategy: Chunking strategy configuration
Returns:
List of (chunk_text, start_position, end_position) tuples
List of chunk text strings
"""
segments = []
text_length = len(text)
@ -155,7 +155,7 @@ class FixedSizeChunker(IChunker):
position = 0
while position < text_length:
segment = self._extract_segment(
chunk_text = self._extract_segment(
text=text,
position=position,
chunk_size=chunk_size,
@ -163,10 +163,8 @@ class FixedSizeChunker(IChunker):
respect_boundaries=strategy.respect_boundaries,
)
if segment:
chunk_text, start_pos, end_pos = segment
if chunk_text.strip():
segments.append((chunk_text, start_pos, end_pos))
if chunk_text and chunk_text.strip():
segments.append(chunk_text)
position += step_size
@ -183,7 +181,7 @@ class FixedSizeChunker(IChunker):
chunk_size: int,
text_length: int,
respect_boundaries: bool,
) -> tuple[str, int, int] | None:
) -> str:
"""
Extract a single segment from text.
@ -195,16 +193,15 @@ class FixedSizeChunker(IChunker):
respect_boundaries: Whether to respect boundaries
Returns:
Tuple of (chunk_text, start_pos, end_pos) or None
Chunk text string
"""
end_pos = min(position + chunk_size, text_length)
chunk_text = text[position:end_pos]
if respect_boundaries and end_pos < text_length:
chunk_text = self._adjust_to_boundary(text, position, end_pos)
end_pos = position + len(chunk_text)
return (chunk_text, position, end_pos)
return chunk_text
def _adjust_to_boundary(
self,
@ -258,17 +255,15 @@ class FixedSizeChunker(IChunker):
global_sequence = 0
for section_index, section in enumerate(document.sections):
# Split this section's content into segments
segments = self._split_into_segments(section.content, strategy)
# Split this section's content into chunks
chunk_texts = self._split_into_segments(section.content, strategy)
# Create chunks for this section
for text, start_char, end_char in segments:
for text in chunk_texts:
chunk = Chunk(
document_id=document.id,
content=text,
sequence_number=global_sequence,
start_char=start_char,
end_char=end_char,
section_title=section.title,
section_index=section_index,
)
@ -282,16 +277,16 @@ class FixedSizeChunker(IChunker):
def _create_chunks(
self,
segments: List[tuple[str, int, int]],
chunk_texts: List[str],
document_id,
section_title: Optional[str] = None,
section_index: Optional[int] = None,
) -> List[Chunk]:
"""
Create Chunk entities from text segments.
Create Chunk entities from text strings.
Args:
segments: List of (text, start_pos, end_pos) tuples
chunk_texts: List of chunk text strings
document_id: ID of parent document
section_title: Optional section title
section_index: Optional section index
@ -301,13 +296,11 @@ class FixedSizeChunker(IChunker):
"""
chunks = []
for sequence_number, (text, start_char, end_char) in enumerate(segments):
for sequence_number, text in enumerate(chunk_texts):
chunk = Chunk(
document_id=document_id,
content=text,
sequence_number=sequence_number,
start_char=start_char,
end_char=end_char,
section_title=section_title,
section_index=section_index,
)

View File

@ -70,8 +70,8 @@ class ParagraphChunker(IChunker):
chunks = self._chunk_by_sections(document, strategy)
else:
# Standard chunking: process entire raw_markdown
segments = self._split_and_group_paragraphs(document.raw_markdown, strategy)
chunks = self._create_chunks(segments, document.id)
chunk_texts = self._split_and_group_paragraphs(document.raw_markdown, strategy)
chunks = self._create_chunks(chunk_texts, document.id)
logger.info(f"Created {len(chunks)} paragraph-based chunks")
return chunks
@ -136,7 +136,7 @@ class ParagraphChunker(IChunker):
self,
text: str,
strategy: ChunkingStrategy,
) -> List[tuple[str, int, int]]:
) -> List[str]:
"""
Split text into paragraphs and group them into chunks.
@ -145,14 +145,14 @@ class ParagraphChunker(IChunker):
strategy: Chunking strategy configuration
Returns:
List of (chunk_text, start_position, end_position) tuples
List of chunk text strings
"""
# Split into paragraphs
paragraphs = logic_utils.split_into_paragraphs(text)
if not paragraphs:
# No paragraphs found, return whole text as single chunk
return [(text, 0, len(text))]
return [text]
# Group paragraphs into chunks
return self._group_paragraphs(paragraphs, strategy)
@ -161,7 +161,7 @@ class ParagraphChunker(IChunker):
self,
paragraphs: List[str],
strategy: ChunkingStrategy,
) -> List[tuple[str, int, int]]:
) -> List[str]:
"""
Group paragraphs into chunks based on target size.
@ -170,12 +170,11 @@ class ParagraphChunker(IChunker):
strategy: Chunking strategy
Returns:
List of (chunk_text, start_pos, end_pos) tuples
List of chunk text strings
"""
segments = []
current_paragraphs = []
current_size = 0
current_start = 0
for paragraph in paragraphs:
para_size = len(paragraph)
@ -185,13 +184,11 @@ class ParagraphChunker(IChunker):
current_size, para_size, strategy.chunk_size, current_paragraphs
):
# Create chunk from accumulated paragraphs
segment = self._create_segment(
current_paragraphs, current_start
)
segment = self._create_segment(current_paragraphs)
segments.append(segment)
# Handle overlap
current_paragraphs, current_start, current_size = (
current_paragraphs, current_size = (
self._handle_overlap(
segment, paragraph, para_size, strategy.overlap_size
)
@ -203,7 +200,7 @@ class ParagraphChunker(IChunker):
# Add final chunk
if current_paragraphs:
segment = self._create_segment(current_paragraphs, current_start)
segment = self._create_segment(current_paragraphs)
segments.append(segment)
logger.debug(
@ -237,56 +234,49 @@ class ParagraphChunker(IChunker):
def _create_segment(
self,
paragraphs: List[str],
start_pos: int,
) -> tuple[str, int, int]:
) -> str:
"""
Create a segment from paragraphs.
Args:
paragraphs: List of paragraph strings
start_pos: Starting position
Returns:
Tuple of (chunk_text, start_pos, end_pos)
Chunk text string
"""
chunk_text = "\n\n".join(paragraphs)
end_pos = start_pos + len(chunk_text)
return (chunk_text, start_pos, end_pos)
return "\n\n".join(paragraphs)
def _handle_overlap(
self,
previous_segment: tuple[str, int, int],
previous_segment: str,
new_paragraph: str,
new_para_size: int,
overlap_size: int,
) -> tuple[List[str], int, int]:
) -> tuple[List[str], int]:
"""
Handle overlap between chunks.
Args:
previous_segment: Previous chunk segment
previous_segment: Previous chunk text
new_paragraph: New paragraph to start with
new_para_size: Size of new paragraph
overlap_size: Desired overlap size
Returns:
Tuple of (new_paragraphs, new_start, new_size)
Tuple of (new_paragraphs, new_size)
"""
if overlap_size > 0:
prev_text, _, prev_end = previous_segment
overlap_text = logic_utils.calculate_overlap_text(
text=prev_text,
text=previous_segment,
overlap_size=overlap_size,
from_start=False,
)
return (
[overlap_text, new_paragraph],
prev_end - len(overlap_text),
len(overlap_text) + new_para_size,
)
else:
_, _, prev_end = previous_segment
return ([new_paragraph], prev_end, new_para_size)
return ([new_paragraph], new_para_size)
def _chunk_by_sections(
self,
@ -313,11 +303,11 @@ class ParagraphChunker(IChunker):
document_title = document.metadata.display_name
for section_index, section in enumerate(document.sections):
# Split this section's content into paragraph-based segments
segments = self._split_and_group_paragraphs(section.content, strategy)
# Split this section's content into paragraph-based chunks
chunk_texts = self._split_and_group_paragraphs(section.content, strategy)
# Create chunks for this section with title prefix
for text, start_char, end_char in segments:
for text in chunk_texts:
# Prepend document title and section title to chunk content
prefixed_content = f"{document_title}\n{section.title}\n{text}"
@ -325,8 +315,6 @@ class ParagraphChunker(IChunker):
document_id=document.id,
content=prefixed_content,
sequence_number=global_sequence,
start_char=start_char,
end_char=end_char,
section_title=section.title,
section_index=section_index,
)
@ -340,16 +328,16 @@ class ParagraphChunker(IChunker):
def _create_chunks(
self,
segments: List[tuple[str, int, int]],
chunk_texts: List[str],
document_id,
section_title: Optional[str] = None,
section_index: Optional[int] = None,
) -> List[Chunk]:
"""
Create Chunk entities from text segments.
Create Chunk entities from text strings.
Args:
segments: List of (text, start_pos, end_pos) tuples
chunk_texts: List of chunk text strings
document_id: ID of parent document
section_title: Optional section title
section_index: Optional section index
@ -359,13 +347,11 @@ class ParagraphChunker(IChunker):
"""
chunks = []
for sequence_number, (text, start_char, end_char) in enumerate(segments):
for sequence_number, text in enumerate(chunk_texts):
chunk = Chunk(
document_id=document_id,
content=text,
sequence_number=sequence_number,
start_char=start_char,
end_char=end_char,
section_title=section_title,
section_index=section_index,
)

View File

@ -360,8 +360,6 @@ class Chunk(BaseModel):
document_id: ID of the parent document
content: Text content of the chunk
sequence_number: Order of this chunk in the document
start_char: Starting character position in original document
end_char: Ending character position in original document
section_title: Title of the section this chunk belongs to
section_index: Index of the section in document.sections
metadata: Optional metadata specific to this chunk
@ -370,8 +368,6 @@ class Chunk(BaseModel):
document_id: UUID = Field(..., description="Parent document ID")
content: str = Field(..., min_length=1, description="Chunk text content")
sequence_number: int = Field(..., ge=0, description="Chunk order in document")
start_char: int = Field(..., ge=0, description="Start position in document")
end_char: int = Field(..., gt=0, description="End position in document")
section_title: Optional[str] = Field(None, description="Section title")
section_index: Optional[int] = Field(None, ge=0, description="Section index")
metadata: Dict[str, str] = Field(default_factory=dict)
@ -380,27 +376,6 @@ class Chunk(BaseModel):
"frozen": True, # Chunks are immutable
}
@model_validator(mode='after')
def validate_position_consistency(self) -> 'Chunk':
"""Ensure end position is after start position."""
if self.end_char <= self.start_char:
raise ValueError(
f"end_char ({self.end_char}) must be greater than "
f"start_char ({self.start_char})"
)
# Validate content length matches position range
content_length = len(self.content)
position_range = self.end_char - self.start_char
if abs(content_length - position_range) > 10: # Allow small variance
raise ValueError(
f"Content length ({content_length}) doesn't match "
f"position range ({position_range})"
)
return self
def get_length(self) -> int:
"""Get the length of the chunk content."""
return len(self.content)