Add the document title and section title to the beginning of each chunk in the paragraph chunker
This commit is contained in:
parent
cda128e438
commit
9e1e49bc59
@ -216,8 +216,6 @@ def to_chunk_responses(chunks: List[Chunk]) -> List[ChunkResponse]:
|
||||
document_id=str(chunk.document_id),
|
||||
content=chunk.content,
|
||||
sequence_number=chunk.sequence_number,
|
||||
start_char=chunk.start_char,
|
||||
end_char=chunk.end_char,
|
||||
length=chunk.get_length(),
|
||||
)
|
||||
for chunk in chunks
|
||||
|
||||
@ -101,8 +101,6 @@ class ChunkResponse(BaseModel):
|
||||
document_id: str
|
||||
content: str
|
||||
sequence_number: int
|
||||
start_char: int
|
||||
end_char: int
|
||||
length: int
|
||||
|
||||
|
||||
|
||||
@ -70,8 +70,8 @@ class FixedSizeChunker(IChunker):
|
||||
chunks = self._chunk_by_sections(document, strategy)
|
||||
else:
|
||||
# Standard chunking: process entire raw_markdown
|
||||
segments = self._split_into_segments(document.raw_markdown, strategy)
|
||||
chunks = self._create_chunks(segments, document.id)
|
||||
chunk_texts = self._split_into_segments(document.raw_markdown, strategy)
|
||||
chunks = self._create_chunks(chunk_texts, document.id)
|
||||
|
||||
logger.info(f"Created {len(chunks)} fixed-size chunks")
|
||||
return chunks
|
||||
@ -136,7 +136,7 @@ class FixedSizeChunker(IChunker):
|
||||
self,
|
||||
text: str,
|
||||
strategy: ChunkingStrategy,
|
||||
) -> List[tuple[str, int, int]]:
|
||||
) -> List[str]:
|
||||
"""
|
||||
Split text into fixed-size segments.
|
||||
|
||||
@ -145,7 +145,7 @@ class FixedSizeChunker(IChunker):
|
||||
strategy: Chunking strategy configuration
|
||||
|
||||
Returns:
|
||||
List of (chunk_text, start_position, end_position) tuples
|
||||
List of chunk text strings
|
||||
"""
|
||||
segments = []
|
||||
text_length = len(text)
|
||||
@ -155,7 +155,7 @@ class FixedSizeChunker(IChunker):
|
||||
position = 0
|
||||
|
||||
while position < text_length:
|
||||
segment = self._extract_segment(
|
||||
chunk_text = self._extract_segment(
|
||||
text=text,
|
||||
position=position,
|
||||
chunk_size=chunk_size,
|
||||
@ -163,10 +163,8 @@ class FixedSizeChunker(IChunker):
|
||||
respect_boundaries=strategy.respect_boundaries,
|
||||
)
|
||||
|
||||
if segment:
|
||||
chunk_text, start_pos, end_pos = segment
|
||||
if chunk_text.strip():
|
||||
segments.append((chunk_text, start_pos, end_pos))
|
||||
if chunk_text and chunk_text.strip():
|
||||
segments.append(chunk_text)
|
||||
|
||||
position += step_size
|
||||
|
||||
@ -183,7 +181,7 @@ class FixedSizeChunker(IChunker):
|
||||
chunk_size: int,
|
||||
text_length: int,
|
||||
respect_boundaries: bool,
|
||||
) -> tuple[str, int, int] | None:
|
||||
) -> str:
|
||||
"""
|
||||
Extract a single segment from text.
|
||||
|
||||
@ -195,16 +193,15 @@ class FixedSizeChunker(IChunker):
|
||||
respect_boundaries: Whether to respect boundaries
|
||||
|
||||
Returns:
|
||||
Tuple of (chunk_text, start_pos, end_pos) or None
|
||||
Chunk text string
|
||||
"""
|
||||
end_pos = min(position + chunk_size, text_length)
|
||||
chunk_text = text[position:end_pos]
|
||||
|
||||
if respect_boundaries and end_pos < text_length:
|
||||
chunk_text = self._adjust_to_boundary(text, position, end_pos)
|
||||
end_pos = position + len(chunk_text)
|
||||
|
||||
return (chunk_text, position, end_pos)
|
||||
return chunk_text
|
||||
|
||||
def _adjust_to_boundary(
|
||||
self,
|
||||
@ -258,17 +255,15 @@ class FixedSizeChunker(IChunker):
|
||||
global_sequence = 0
|
||||
|
||||
for section_index, section in enumerate(document.sections):
|
||||
# Split this section's content into segments
|
||||
segments = self._split_into_segments(section.content, strategy)
|
||||
# Split this section's content into chunks
|
||||
chunk_texts = self._split_into_segments(section.content, strategy)
|
||||
|
||||
# Create chunks for this section
|
||||
for text, start_char, end_char in segments:
|
||||
for text in chunk_texts:
|
||||
chunk = Chunk(
|
||||
document_id=document.id,
|
||||
content=text,
|
||||
sequence_number=global_sequence,
|
||||
start_char=start_char,
|
||||
end_char=end_char,
|
||||
section_title=section.title,
|
||||
section_index=section_index,
|
||||
)
|
||||
@ -282,16 +277,16 @@ class FixedSizeChunker(IChunker):
|
||||
|
||||
def _create_chunks(
|
||||
self,
|
||||
segments: List[tuple[str, int, int]],
|
||||
chunk_texts: List[str],
|
||||
document_id,
|
||||
section_title: Optional[str] = None,
|
||||
section_index: Optional[int] = None,
|
||||
) -> List[Chunk]:
|
||||
"""
|
||||
Create Chunk entities from text segments.
|
||||
Create Chunk entities from text strings.
|
||||
|
||||
Args:
|
||||
segments: List of (text, start_pos, end_pos) tuples
|
||||
chunk_texts: List of chunk text strings
|
||||
document_id: ID of parent document
|
||||
section_title: Optional section title
|
||||
section_index: Optional section index
|
||||
@ -301,13 +296,11 @@ class FixedSizeChunker(IChunker):
|
||||
"""
|
||||
chunks = []
|
||||
|
||||
for sequence_number, (text, start_char, end_char) in enumerate(segments):
|
||||
for sequence_number, text in enumerate(chunk_texts):
|
||||
chunk = Chunk(
|
||||
document_id=document_id,
|
||||
content=text,
|
||||
sequence_number=sequence_number,
|
||||
start_char=start_char,
|
||||
end_char=end_char,
|
||||
section_title=section_title,
|
||||
section_index=section_index,
|
||||
)
|
||||
|
||||
@ -70,8 +70,8 @@ class ParagraphChunker(IChunker):
|
||||
chunks = self._chunk_by_sections(document, strategy)
|
||||
else:
|
||||
# Standard chunking: process entire raw_markdown
|
||||
segments = self._split_and_group_paragraphs(document.raw_markdown, strategy)
|
||||
chunks = self._create_chunks(segments, document.id)
|
||||
chunk_texts = self._split_and_group_paragraphs(document.raw_markdown, strategy)
|
||||
chunks = self._create_chunks(chunk_texts, document.id)
|
||||
|
||||
logger.info(f"Created {len(chunks)} paragraph-based chunks")
|
||||
return chunks
|
||||
@ -136,7 +136,7 @@ class ParagraphChunker(IChunker):
|
||||
self,
|
||||
text: str,
|
||||
strategy: ChunkingStrategy,
|
||||
) -> List[tuple[str, int, int]]:
|
||||
) -> List[str]:
|
||||
"""
|
||||
Split text into paragraphs and group them into chunks.
|
||||
|
||||
@ -145,14 +145,14 @@ class ParagraphChunker(IChunker):
|
||||
strategy: Chunking strategy configuration
|
||||
|
||||
Returns:
|
||||
List of (chunk_text, start_position, end_position) tuples
|
||||
List of chunk text strings
|
||||
"""
|
||||
# Split into paragraphs
|
||||
paragraphs = logic_utils.split_into_paragraphs(text)
|
||||
|
||||
if not paragraphs:
|
||||
# No paragraphs found, return whole text as single chunk
|
||||
return [(text, 0, len(text))]
|
||||
return [text]
|
||||
|
||||
# Group paragraphs into chunks
|
||||
return self._group_paragraphs(paragraphs, strategy)
|
||||
@ -161,7 +161,7 @@ class ParagraphChunker(IChunker):
|
||||
self,
|
||||
paragraphs: List[str],
|
||||
strategy: ChunkingStrategy,
|
||||
) -> List[tuple[str, int, int]]:
|
||||
) -> List[str]:
|
||||
"""
|
||||
Group paragraphs into chunks based on target size.
|
||||
|
||||
@ -170,12 +170,11 @@ class ParagraphChunker(IChunker):
|
||||
strategy: Chunking strategy
|
||||
|
||||
Returns:
|
||||
List of (chunk_text, start_pos, end_pos) tuples
|
||||
List of chunk text strings
|
||||
"""
|
||||
segments = []
|
||||
current_paragraphs = []
|
||||
current_size = 0
|
||||
current_start = 0
|
||||
|
||||
for paragraph in paragraphs:
|
||||
para_size = len(paragraph)
|
||||
@ -185,13 +184,11 @@ class ParagraphChunker(IChunker):
|
||||
current_size, para_size, strategy.chunk_size, current_paragraphs
|
||||
):
|
||||
# Create chunk from accumulated paragraphs
|
||||
segment = self._create_segment(
|
||||
current_paragraphs, current_start
|
||||
)
|
||||
segment = self._create_segment(current_paragraphs)
|
||||
segments.append(segment)
|
||||
|
||||
# Handle overlap
|
||||
current_paragraphs, current_start, current_size = (
|
||||
current_paragraphs, current_size = (
|
||||
self._handle_overlap(
|
||||
segment, paragraph, para_size, strategy.overlap_size
|
||||
)
|
||||
@ -203,7 +200,7 @@ class ParagraphChunker(IChunker):
|
||||
|
||||
# Add final chunk
|
||||
if current_paragraphs:
|
||||
segment = self._create_segment(current_paragraphs, current_start)
|
||||
segment = self._create_segment(current_paragraphs)
|
||||
segments.append(segment)
|
||||
|
||||
logger.debug(
|
||||
@ -237,56 +234,49 @@ class ParagraphChunker(IChunker):
|
||||
def _create_segment(
|
||||
self,
|
||||
paragraphs: List[str],
|
||||
start_pos: int,
|
||||
) -> tuple[str, int, int]:
|
||||
) -> str:
|
||||
"""
|
||||
Create a segment from paragraphs.
|
||||
|
||||
Args:
|
||||
paragraphs: List of paragraph strings
|
||||
start_pos: Starting position
|
||||
|
||||
Returns:
|
||||
Tuple of (chunk_text, start_pos, end_pos)
|
||||
Chunk text string
|
||||
"""
|
||||
chunk_text = "\n\n".join(paragraphs)
|
||||
end_pos = start_pos + len(chunk_text)
|
||||
return (chunk_text, start_pos, end_pos)
|
||||
return "\n\n".join(paragraphs)
|
||||
|
||||
def _handle_overlap(
|
||||
self,
|
||||
previous_segment: tuple[str, int, int],
|
||||
previous_segment: str,
|
||||
new_paragraph: str,
|
||||
new_para_size: int,
|
||||
overlap_size: int,
|
||||
) -> tuple[List[str], int, int]:
|
||||
) -> tuple[List[str], int]:
|
||||
"""
|
||||
Handle overlap between chunks.
|
||||
|
||||
Args:
|
||||
previous_segment: Previous chunk segment
|
||||
previous_segment: Previous chunk text
|
||||
new_paragraph: New paragraph to start with
|
||||
new_para_size: Size of new paragraph
|
||||
overlap_size: Desired overlap size
|
||||
|
||||
Returns:
|
||||
Tuple of (new_paragraphs, new_start, new_size)
|
||||
Tuple of (new_paragraphs, new_size)
|
||||
"""
|
||||
if overlap_size > 0:
|
||||
prev_text, _, prev_end = previous_segment
|
||||
overlap_text = logic_utils.calculate_overlap_text(
|
||||
text=prev_text,
|
||||
text=previous_segment,
|
||||
overlap_size=overlap_size,
|
||||
from_start=False,
|
||||
)
|
||||
return (
|
||||
[overlap_text, new_paragraph],
|
||||
prev_end - len(overlap_text),
|
||||
len(overlap_text) + new_para_size,
|
||||
)
|
||||
else:
|
||||
_, _, prev_end = previous_segment
|
||||
return ([new_paragraph], prev_end, new_para_size)
|
||||
return ([new_paragraph], new_para_size)
|
||||
|
||||
def _chunk_by_sections(
|
||||
self,
|
||||
@ -313,11 +303,11 @@ class ParagraphChunker(IChunker):
|
||||
document_title = document.metadata.display_name
|
||||
|
||||
for section_index, section in enumerate(document.sections):
|
||||
# Split this section's content into paragraph-based segments
|
||||
segments = self._split_and_group_paragraphs(section.content, strategy)
|
||||
# Split this section's content into paragraph-based chunks
|
||||
chunk_texts = self._split_and_group_paragraphs(section.content, strategy)
|
||||
|
||||
# Create chunks for this section with title prefix
|
||||
for text, start_char, end_char in segments:
|
||||
for text in chunk_texts:
|
||||
# Prepend document title and section title to chunk content
|
||||
prefixed_content = f"{document_title}\n{section.title}\n{text}"
|
||||
|
||||
@ -325,8 +315,6 @@ class ParagraphChunker(IChunker):
|
||||
document_id=document.id,
|
||||
content=prefixed_content,
|
||||
sequence_number=global_sequence,
|
||||
start_char=start_char,
|
||||
end_char=end_char,
|
||||
section_title=section.title,
|
||||
section_index=section_index,
|
||||
)
|
||||
@ -340,16 +328,16 @@ class ParagraphChunker(IChunker):
|
||||
|
||||
def _create_chunks(
|
||||
self,
|
||||
segments: List[tuple[str, int, int]],
|
||||
chunk_texts: List[str],
|
||||
document_id,
|
||||
section_title: Optional[str] = None,
|
||||
section_index: Optional[int] = None,
|
||||
) -> List[Chunk]:
|
||||
"""
|
||||
Create Chunk entities from text segments.
|
||||
Create Chunk entities from text strings.
|
||||
|
||||
Args:
|
||||
segments: List of (text, start_pos, end_pos) tuples
|
||||
chunk_texts: List of chunk text strings
|
||||
document_id: ID of parent document
|
||||
section_title: Optional section title
|
||||
section_index: Optional section index
|
||||
@ -359,13 +347,11 @@ class ParagraphChunker(IChunker):
|
||||
"""
|
||||
chunks = []
|
||||
|
||||
for sequence_number, (text, start_char, end_char) in enumerate(segments):
|
||||
for sequence_number, text in enumerate(chunk_texts):
|
||||
chunk = Chunk(
|
||||
document_id=document_id,
|
||||
content=text,
|
||||
sequence_number=sequence_number,
|
||||
start_char=start_char,
|
||||
end_char=end_char,
|
||||
section_title=section_title,
|
||||
section_index=section_index,
|
||||
)
|
||||
|
||||
@ -360,8 +360,6 @@ class Chunk(BaseModel):
|
||||
document_id: ID of the parent document
|
||||
content: Text content of the chunk
|
||||
sequence_number: Order of this chunk in the document
|
||||
start_char: Starting character position in original document
|
||||
end_char: Ending character position in original document
|
||||
section_title: Title of the section this chunk belongs to
|
||||
section_index: Index of the section in document.sections
|
||||
metadata: Optional metadata specific to this chunk
|
||||
@ -370,8 +368,6 @@ class Chunk(BaseModel):
|
||||
document_id: UUID = Field(..., description="Parent document ID")
|
||||
content: str = Field(..., min_length=1, description="Chunk text content")
|
||||
sequence_number: int = Field(..., ge=0, description="Chunk order in document")
|
||||
start_char: int = Field(..., ge=0, description="Start position in document")
|
||||
end_char: int = Field(..., gt=0, description="End position in document")
|
||||
section_title: Optional[str] = Field(None, description="Section title")
|
||||
section_index: Optional[int] = Field(None, ge=0, description="Section index")
|
||||
metadata: Dict[str, str] = Field(default_factory=dict)
|
||||
@ -380,27 +376,6 @@ class Chunk(BaseModel):
|
||||
"frozen": True, # Chunks are immutable
|
||||
}
|
||||
|
||||
@model_validator(mode='after')
|
||||
def validate_position_consistency(self) -> 'Chunk':
|
||||
"""Ensure end position is after start position."""
|
||||
if self.end_char <= self.start_char:
|
||||
raise ValueError(
|
||||
f"end_char ({self.end_char}) must be greater than "
|
||||
f"start_char ({self.start_char})"
|
||||
)
|
||||
|
||||
# Validate content length matches position range
|
||||
content_length = len(self.content)
|
||||
position_range = self.end_char - self.start_char
|
||||
|
||||
if abs(content_length - position_range) > 10: # Allow small variance
|
||||
raise ValueError(
|
||||
f"Content length ({content_length}) doesn't match "
|
||||
f"position range ({position_range})"
|
||||
)
|
||||
|
||||
return self
|
||||
|
||||
def get_length(self) -> int:
|
||||
"""Get the length of the chunk content."""
|
||||
return len(self.content)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user