add document title and section title to the beginning of each chunk in paragraph chunker

m.dabbagh 2026-01-25 11:32:35 +03:30
parent cda128e438
commit 9e1e49bc59
5 changed files with 43 additions and 93 deletions
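
The net effect: chunks no longer carry start_char/end_char offsets, and the section-aware path of ParagraphChunker now prepends the document and section titles to each chunk's content. A minimal sketch of the new prefix format, using hypothetical title and body values (the f-string itself is taken verbatim from the ParagraphChunker hunk below):

# Sketch of the prefixing this commit introduces; the title and body
# strings here are hypothetical example values.
document_title = "Employee Handbook"  # from document.metadata.display_name
section_title = "Remote Work Policy"  # from section.title
text = "Employees may work remotely up to three days per week."

prefixed_content = f"{document_title}\n{section_title}\n{text}"
print(prefixed_content)
# Employee Handbook
# Remote Work Policy
# Employees may work remotely up to three days per week.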

View File

@@ -216,8 +216,6 @@ def to_chunk_responses(chunks: List[Chunk]) -> List[ChunkResponse]:
             document_id=str(chunk.document_id),
             content=chunk.content,
             sequence_number=chunk.sequence_number,
-            start_char=chunk.start_char,
-            end_char=chunk.end_char,
             length=chunk.get_length(),
         )
         for chunk in chunks

View File

@@ -101,8 +101,6 @@ class ChunkResponse(BaseModel):
     document_id: str
     content: str
     sequence_number: int
-    start_char: int
-    end_char: int
     length: int
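
After this commit the response schema reduces to the fields left in the hunk above. A sketch of how the model now reads (fields outside the hunk, such as an id, are not shown and are assumed unchanged):

from pydantic import BaseModel

class ChunkResponse(BaseModel):
    document_id: str
    content: str
    sequence_number: int
    length: int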

View File

@@ -70,8 +70,8 @@ class FixedSizeChunker(IChunker):
             chunks = self._chunk_by_sections(document, strategy)
         else:
             # Standard chunking: process entire raw_markdown
-            segments = self._split_into_segments(document.raw_markdown, strategy)
-            chunks = self._create_chunks(segments, document.id)
+            chunk_texts = self._split_into_segments(document.raw_markdown, strategy)
+            chunks = self._create_chunks(chunk_texts, document.id)
         logger.info(f"Created {len(chunks)} fixed-size chunks")
         return chunks
@@ -136,7 +136,7 @@ class FixedSizeChunker(IChunker):
         self,
         text: str,
         strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
         """
         Split text into fixed-size segments.
@@ -145,7 +145,7 @@ class FixedSizeChunker(IChunker):
             strategy: Chunking strategy configuration
         Returns:
-            List of (chunk_text, start_position, end_position) tuples
+            List of chunk text strings
         """
         segments = []
         text_length = len(text)
@@ -155,7 +155,7 @@ class FixedSizeChunker(IChunker):
         position = 0
         while position < text_length:
-            segment = self._extract_segment(
+            chunk_text = self._extract_segment(
                 text=text,
                 position=position,
                 chunk_size=chunk_size,
@@ -163,10 +163,8 @@ class FixedSizeChunker(IChunker):
                 respect_boundaries=strategy.respect_boundaries,
             )
-            if segment:
-                chunk_text, start_pos, end_pos = segment
-                if chunk_text.strip():
-                    segments.append((chunk_text, start_pos, end_pos))
+            if chunk_text and chunk_text.strip():
+                segments.append(chunk_text)
             position += step_size
@@ -183,7 +181,7 @@ class FixedSizeChunker(IChunker):
         chunk_size: int,
         text_length: int,
         respect_boundaries: bool,
-    ) -> tuple[str, int, int] | None:
+    ) -> str:
         """
         Extract a single segment from text.
@@ -195,16 +193,15 @@ class FixedSizeChunker(IChunker):
             respect_boundaries: Whether to respect boundaries
         Returns:
-            Tuple of (chunk_text, start_pos, end_pos) or None
+            Chunk text string
         """
         end_pos = min(position + chunk_size, text_length)
         chunk_text = text[position:end_pos]
         if respect_boundaries and end_pos < text_length:
             chunk_text = self._adjust_to_boundary(text, position, end_pos)
-            end_pos = position + len(chunk_text)
-        return (chunk_text, position, end_pos)
+        return chunk_text
     def _adjust_to_boundary(
         self,
@@ -258,17 +255,15 @@ class FixedSizeChunker(IChunker):
         global_sequence = 0
         for section_index, section in enumerate(document.sections):
-            # Split this section's content into segments
-            segments = self._split_into_segments(section.content, strategy)
+            # Split this section's content into chunks
+            chunk_texts = self._split_into_segments(section.content, strategy)
             # Create chunks for this section
-            for text, start_char, end_char in segments:
+            for text in chunk_texts:
                 chunk = Chunk(
                     document_id=document.id,
                     content=text,
                     sequence_number=global_sequence,
-                    start_char=start_char,
-                    end_char=end_char,
                     section_title=section.title,
                     section_index=section_index,
                 )
@@ -282,16 +277,16 @@ class FixedSizeChunker(IChunker):
     def _create_chunks(
         self,
-        segments: List[tuple[str, int, int]],
+        chunk_texts: List[str],
         document_id,
         section_title: Optional[str] = None,
         section_index: Optional[int] = None,
     ) -> List[Chunk]:
         """
-        Create Chunk entities from text segments.
+        Create Chunk entities from text strings.
         Args:
-            segments: List of (text, start_pos, end_pos) tuples
+            chunk_texts: List of chunk text strings
             document_id: ID of parent document
             section_title: Optional section title
             section_index: Optional section index
@@ -301,13 +296,11 @@ class FixedSizeChunker(IChunker):
         """
         chunks = []
-        for sequence_number, (text, start_char, end_char) in enumerate(segments):
+        for sequence_number, text in enumerate(chunk_texts):
             chunk = Chunk(
                 document_id=document_id,
                 content=text,
                 sequence_number=sequence_number,
-                start_char=start_char,
-                end_char=end_char,
                 section_title=section_title,
                 section_index=section_index,
             )
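
Assembled from the + lines above, the fixed-size path now works purely on strings. A self-contained sketch of the post-commit loop, with boundary adjustment omitted and the step size assumed to be chunk_size minus overlap (the hunks show position += step_size but not how step_size is derived):

from typing import List

def split_into_segments(text: str, chunk_size: int, overlap: int) -> List[str]:
    """Post-commit behavior in miniature: plain chunk strings, no positions."""
    segments: List[str] = []
    # Assumed relation; the derivation of step_size is not shown in the diff.
    step_size = max(1, chunk_size - overlap)
    position = 0
    while position < len(text):
        # Mirrors _extract_segment without the respect_boundaries branch
        chunk_text = text[position:min(position + chunk_size, len(text))]
        if chunk_text and chunk_text.strip():
            segments.append(chunk_text)
        position += step_size
    return segments

# split_into_segments("a" * 10, chunk_size=4, overlap=1)
# -> ['aaaa', 'aaaa', 'aaaa', 'a']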

View File

@@ -70,8 +70,8 @@ class ParagraphChunker(IChunker):
             chunks = self._chunk_by_sections(document, strategy)
         else:
             # Standard chunking: process entire raw_markdown
-            segments = self._split_and_group_paragraphs(document.raw_markdown, strategy)
-            chunks = self._create_chunks(segments, document.id)
+            chunk_texts = self._split_and_group_paragraphs(document.raw_markdown, strategy)
+            chunks = self._create_chunks(chunk_texts, document.id)
         logger.info(f"Created {len(chunks)} paragraph-based chunks")
         return chunks
@@ -136,7 +136,7 @@ class ParagraphChunker(IChunker):
         self,
         text: str,
         strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
         """
         Split text into paragraphs and group them into chunks.
@@ -145,14 +145,14 @@ class ParagraphChunker(IChunker):
             strategy: Chunking strategy configuration
         Returns:
-            List of (chunk_text, start_position, end_position) tuples
+            List of chunk text strings
         """
         # Split into paragraphs
         paragraphs = logic_utils.split_into_paragraphs(text)
         if not paragraphs:
             # No paragraphs found, return whole text as single chunk
-            return [(text, 0, len(text))]
+            return [text]
         # Group paragraphs into chunks
         return self._group_paragraphs(paragraphs, strategy)
@@ -161,7 +161,7 @@ class ParagraphChunker(IChunker):
         self,
         paragraphs: List[str],
         strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
         """
         Group paragraphs into chunks based on target size.
@@ -170,12 +170,11 @@ class ParagraphChunker(IChunker):
             strategy: Chunking strategy
         Returns:
-            List of (chunk_text, start_pos, end_pos) tuples
+            List of chunk text strings
         """
         segments = []
         current_paragraphs = []
         current_size = 0
-        current_start = 0
         for paragraph in paragraphs:
             para_size = len(paragraph)
@@ -185,13 +184,11 @@ class ParagraphChunker(IChunker):
                 current_size, para_size, strategy.chunk_size, current_paragraphs
             ):
                 # Create chunk from accumulated paragraphs
-                segment = self._create_segment(
-                    current_paragraphs, current_start
-                )
+                segment = self._create_segment(current_paragraphs)
                 segments.append(segment)
                 # Handle overlap
-                current_paragraphs, current_start, current_size = (
+                current_paragraphs, current_size = (
                     self._handle_overlap(
                         segment, paragraph, para_size, strategy.overlap_size
                     )
@@ -203,7 +200,7 @@ class ParagraphChunker(IChunker):
         # Add final chunk
         if current_paragraphs:
-            segment = self._create_segment(current_paragraphs, current_start)
+            segment = self._create_segment(current_paragraphs)
             segments.append(segment)
         logger.debug(
@@ -237,56 +234,49 @@ class ParagraphChunker(IChunker):
     def _create_segment(
         self,
         paragraphs: List[str],
-        start_pos: int,
-    ) -> tuple[str, int, int]:
+    ) -> str:
         """
         Create a segment from paragraphs.
         Args:
             paragraphs: List of paragraph strings
-            start_pos: Starting position
         Returns:
-            Tuple of (chunk_text, start_pos, end_pos)
+            Chunk text string
         """
-        chunk_text = "\n\n".join(paragraphs)
-        end_pos = start_pos + len(chunk_text)
-        return (chunk_text, start_pos, end_pos)
+        return "\n\n".join(paragraphs)
     def _handle_overlap(
         self,
-        previous_segment: tuple[str, int, int],
+        previous_segment: str,
         new_paragraph: str,
         new_para_size: int,
         overlap_size: int,
-    ) -> tuple[List[str], int, int]:
+    ) -> tuple[List[str], int]:
         """
         Handle overlap between chunks.
         Args:
-            previous_segment: Previous chunk segment
+            previous_segment: Previous chunk text
             new_paragraph: New paragraph to start with
             new_para_size: Size of new paragraph
             overlap_size: Desired overlap size
         Returns:
-            Tuple of (new_paragraphs, new_start, new_size)
+            Tuple of (new_paragraphs, new_size)
         """
         if overlap_size > 0:
-            prev_text, _, prev_end = previous_segment
             overlap_text = logic_utils.calculate_overlap_text(
-                text=prev_text,
+                text=previous_segment,
                 overlap_size=overlap_size,
                 from_start=False,
             )
             return (
                 [overlap_text, new_paragraph],
-                prev_end - len(overlap_text),
                 len(overlap_text) + new_para_size,
             )
         else:
-            _, _, prev_end = previous_segment
-            return ([new_paragraph], prev_end, new_para_size)
+            return ([new_paragraph], new_para_size)
     def _chunk_by_sections(
         self,
@@ -313,11 +303,11 @@ class ParagraphChunker(IChunker):
         document_title = document.metadata.display_name
         for section_index, section in enumerate(document.sections):
-            # Split this section's content into paragraph-based segments
-            segments = self._split_and_group_paragraphs(section.content, strategy)
+            # Split this section's content into paragraph-based chunks
+            chunk_texts = self._split_and_group_paragraphs(section.content, strategy)
             # Create chunks for this section with title prefix
-            for text, start_char, end_char in segments:
+            for text in chunk_texts:
                 # Prepend document title and section title to chunk content
                 prefixed_content = f"{document_title}\n{section.title}\n{text}"
@@ -325,8 +315,6 @@
                 document_id=document.id,
                 content=prefixed_content,
                 sequence_number=global_sequence,
-                start_char=start_char,
-                end_char=end_char,
                 section_title=section.title,
                 section_index=section_index,
             )
@@ -340,16 +328,16 @@ class ParagraphChunker(IChunker):
     def _create_chunks(
         self,
-        segments: List[tuple[str, int, int]],
+        chunk_texts: List[str],
         document_id,
         section_title: Optional[str] = None,
         section_index: Optional[int] = None,
    ) -> List[Chunk]:
         """
-        Create Chunk entities from text segments.
+        Create Chunk entities from text strings.
         Args:
-            segments: List of (text, start_pos, end_pos) tuples
+            chunk_texts: List of chunk text strings
             document_id: ID of parent document
             section_title: Optional section title
             section_index: Optional section index
@@ -359,13 +347,11 @@ class ParagraphChunker(IChunker):
         """
         chunks = []
-        for sequence_number, (text, start_char, end_char) in enumerate(segments):
+        for sequence_number, text in enumerate(chunk_texts):
             chunk = Chunk(
                 document_id=document_id,
                 content=text,
                 sequence_number=sequence_number,
-                start_char=start_char,
-                end_char=end_char,
                 section_title=section_title,
                 section_index=section_index,
             )
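
The overlap bookkeeping likewise shrinks from (paragraphs, start, size) to (paragraphs, size). A runnable sketch of the post-commit _handle_overlap, with a stand-in for logic_utils.calculate_overlap_text (only its call signature is visible in the diff; its internals here are assumed):

from typing import List, Tuple

def calculate_overlap_text(text: str, overlap_size: int, from_start: bool) -> str:
    # Stand-in: take overlap_size characters from one end of the text.
    return text[:overlap_size] if from_start else text[-overlap_size:]

def handle_overlap(
    previous_segment: str,
    new_paragraph: str,
    new_para_size: int,
    overlap_size: int,
) -> Tuple[List[str], int]:
    # Post-commit shape: no start position, just (paragraphs, running size).
    if overlap_size > 0:
        overlap_text = calculate_overlap_text(
            previous_segment, overlap_size, from_start=False
        )
        return ([overlap_text, new_paragraph], len(overlap_text) + new_para_size)
    return ([new_paragraph], new_para_size)

# handle_overlap("first chunk text", "Next paragraph.", 15, overlap_size=5)
# -> ([' text', 'Next paragraph.'], 20)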

View File

@@ -360,8 +360,6 @@ class Chunk(BaseModel):
         document_id: ID of the parent document
         content: Text content of the chunk
         sequence_number: Order of this chunk in the document
-        start_char: Starting character position in original document
-        end_char: Ending character position in original document
         section_title: Title of the section this chunk belongs to
         section_index: Index of the section in document.sections
         metadata: Optional metadata specific to this chunk
@@ -370,8 +368,6 @@ class Chunk(BaseModel):
     document_id: UUID = Field(..., description="Parent document ID")
     content: str = Field(..., min_length=1, description="Chunk text content")
     sequence_number: int = Field(..., ge=0, description="Chunk order in document")
-    start_char: int = Field(..., ge=0, description="Start position in document")
-    end_char: int = Field(..., gt=0, description="End position in document")
     section_title: Optional[str] = Field(None, description="Section title")
     section_index: Optional[int] = Field(None, ge=0, description="Section index")
     metadata: Dict[str, str] = Field(default_factory=dict)
@@ -380,27 +376,6 @@
         "frozen": True,  # Chunks are immutable
     }
-    @model_validator(mode='after')
-    def validate_position_consistency(self) -> 'Chunk':
-        """Ensure end position is after start position."""
-        if self.end_char <= self.start_char:
-            raise ValueError(
-                f"end_char ({self.end_char}) must be greater than "
-                f"start_char ({self.start_char})"
-            )
-        # Validate content length matches position range
-        content_length = len(self.content)
-        position_range = self.end_char - self.start_char
-        if abs(content_length - position_range) > 10:  # Allow small variance
-            raise ValueError(
-                f"Content length ({content_length}) doesn't match "
-                f"position range ({position_range})"
-            )
-        return self
     def get_length(self) -> int:
         """Get the length of the chunk content."""
         return len(self.content)
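
With start_char and end_char gone, validate_position_consistency has nothing left to check, so the whole validator is dropped. The entity as it reads after this commit, assembled from the surviving lines of the hunks above (fields outside the hunks are assumed unchanged):

from typing import Dict, Optional
from uuid import UUID
from pydantic import BaseModel, Field

class Chunk(BaseModel):
    document_id: UUID = Field(..., description="Parent document ID")
    content: str = Field(..., min_length=1, description="Chunk text content")
    sequence_number: int = Field(..., ge=0, description="Chunk order in document")
    section_title: Optional[str] = Field(None, description="Section title")
    section_index: Optional[int] = Field(None, ge=0, description="Section index")
    metadata: Dict[str, str] = Field(default_factory=dict)

    model_config = {
        "frozen": True,  # Chunks are immutable
    }

    def get_length(self) -> int:
        """Get the length of the chunk content."""
        return len(self.content)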