add document title and section title to the beginning of each chunk in paragraph chunker
This commit is contained in:
parent
cda128e438
commit
9e1e49bc59
@ -216,8 +216,6 @@ def to_chunk_responses(chunks: List[Chunk]) -> List[ChunkResponse]:
|
|||||||
document_id=str(chunk.document_id),
|
document_id=str(chunk.document_id),
|
||||||
content=chunk.content,
|
content=chunk.content,
|
||||||
sequence_number=chunk.sequence_number,
|
sequence_number=chunk.sequence_number,
|
||||||
start_char=chunk.start_char,
|
|
||||||
end_char=chunk.end_char,
|
|
||||||
length=chunk.get_length(),
|
length=chunk.get_length(),
|
||||||
)
|
)
|
||||||
for chunk in chunks
|
for chunk in chunks
|
||||||
|
|||||||
@ -101,8 +101,6 @@ class ChunkResponse(BaseModel):
|
|||||||
document_id: str
|
document_id: str
|
||||||
content: str
|
content: str
|
||||||
sequence_number: int
|
sequence_number: int
|
||||||
start_char: int
|
|
||||||
end_char: int
|
|
||||||
length: int
|
length: int
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -70,8 +70,8 @@ class FixedSizeChunker(IChunker):
|
|||||||
chunks = self._chunk_by_sections(document, strategy)
|
chunks = self._chunk_by_sections(document, strategy)
|
||||||
else:
|
else:
|
||||||
# Standard chunking: process entire raw_markdown
|
# Standard chunking: process entire raw_markdown
|
||||||
segments = self._split_into_segments(document.raw_markdown, strategy)
|
chunk_texts = self._split_into_segments(document.raw_markdown, strategy)
|
||||||
chunks = self._create_chunks(segments, document.id)
|
chunks = self._create_chunks(chunk_texts, document.id)
|
||||||
|
|
||||||
logger.info(f"Created {len(chunks)} fixed-size chunks")
|
logger.info(f"Created {len(chunks)} fixed-size chunks")
|
||||||
return chunks
|
return chunks
|
||||||
@ -136,7 +136,7 @@ class FixedSizeChunker(IChunker):
|
|||||||
self,
|
self,
|
||||||
text: str,
|
text: str,
|
||||||
strategy: ChunkingStrategy,
|
strategy: ChunkingStrategy,
|
||||||
) -> List[tuple[str, int, int]]:
|
) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Split text into fixed-size segments.
|
Split text into fixed-size segments.
|
||||||
|
|
||||||
@ -145,7 +145,7 @@ class FixedSizeChunker(IChunker):
|
|||||||
strategy: Chunking strategy configuration
|
strategy: Chunking strategy configuration
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of (chunk_text, start_position, end_position) tuples
|
List of chunk text strings
|
||||||
"""
|
"""
|
||||||
segments = []
|
segments = []
|
||||||
text_length = len(text)
|
text_length = len(text)
|
||||||
@ -155,7 +155,7 @@ class FixedSizeChunker(IChunker):
|
|||||||
position = 0
|
position = 0
|
||||||
|
|
||||||
while position < text_length:
|
while position < text_length:
|
||||||
segment = self._extract_segment(
|
chunk_text = self._extract_segment(
|
||||||
text=text,
|
text=text,
|
||||||
position=position,
|
position=position,
|
||||||
chunk_size=chunk_size,
|
chunk_size=chunk_size,
|
||||||
@ -163,10 +163,8 @@ class FixedSizeChunker(IChunker):
|
|||||||
respect_boundaries=strategy.respect_boundaries,
|
respect_boundaries=strategy.respect_boundaries,
|
||||||
)
|
)
|
||||||
|
|
||||||
if segment:
|
if chunk_text and chunk_text.strip():
|
||||||
chunk_text, start_pos, end_pos = segment
|
segments.append(chunk_text)
|
||||||
if chunk_text.strip():
|
|
||||||
segments.append((chunk_text, start_pos, end_pos))
|
|
||||||
|
|
||||||
position += step_size
|
position += step_size
|
||||||
|
|
||||||
@ -183,7 +181,7 @@ class FixedSizeChunker(IChunker):
|
|||||||
chunk_size: int,
|
chunk_size: int,
|
||||||
text_length: int,
|
text_length: int,
|
||||||
respect_boundaries: bool,
|
respect_boundaries: bool,
|
||||||
) -> tuple[str, int, int] | None:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
Extract a single segment from text.
|
Extract a single segment from text.
|
||||||
|
|
||||||
@ -195,16 +193,15 @@ class FixedSizeChunker(IChunker):
|
|||||||
respect_boundaries: Whether to respect boundaries
|
respect_boundaries: Whether to respect boundaries
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (chunk_text, start_pos, end_pos) or None
|
Chunk text string
|
||||||
"""
|
"""
|
||||||
end_pos = min(position + chunk_size, text_length)
|
end_pos = min(position + chunk_size, text_length)
|
||||||
chunk_text = text[position:end_pos]
|
chunk_text = text[position:end_pos]
|
||||||
|
|
||||||
if respect_boundaries and end_pos < text_length:
|
if respect_boundaries and end_pos < text_length:
|
||||||
chunk_text = self._adjust_to_boundary(text, position, end_pos)
|
chunk_text = self._adjust_to_boundary(text, position, end_pos)
|
||||||
end_pos = position + len(chunk_text)
|
|
||||||
|
|
||||||
return (chunk_text, position, end_pos)
|
return chunk_text
|
||||||
|
|
||||||
def _adjust_to_boundary(
|
def _adjust_to_boundary(
|
||||||
self,
|
self,
|
||||||
@ -258,17 +255,15 @@ class FixedSizeChunker(IChunker):
|
|||||||
global_sequence = 0
|
global_sequence = 0
|
||||||
|
|
||||||
for section_index, section in enumerate(document.sections):
|
for section_index, section in enumerate(document.sections):
|
||||||
# Split this section's content into segments
|
# Split this section's content into chunks
|
||||||
segments = self._split_into_segments(section.content, strategy)
|
chunk_texts = self._split_into_segments(section.content, strategy)
|
||||||
|
|
||||||
# Create chunks for this section
|
# Create chunks for this section
|
||||||
for text, start_char, end_char in segments:
|
for text in chunk_texts:
|
||||||
chunk = Chunk(
|
chunk = Chunk(
|
||||||
document_id=document.id,
|
document_id=document.id,
|
||||||
content=text,
|
content=text,
|
||||||
sequence_number=global_sequence,
|
sequence_number=global_sequence,
|
||||||
start_char=start_char,
|
|
||||||
end_char=end_char,
|
|
||||||
section_title=section.title,
|
section_title=section.title,
|
||||||
section_index=section_index,
|
section_index=section_index,
|
||||||
)
|
)
|
||||||
@ -282,16 +277,16 @@ class FixedSizeChunker(IChunker):
|
|||||||
|
|
||||||
def _create_chunks(
|
def _create_chunks(
|
||||||
self,
|
self,
|
||||||
segments: List[tuple[str, int, int]],
|
chunk_texts: List[str],
|
||||||
document_id,
|
document_id,
|
||||||
section_title: Optional[str] = None,
|
section_title: Optional[str] = None,
|
||||||
section_index: Optional[int] = None,
|
section_index: Optional[int] = None,
|
||||||
) -> List[Chunk]:
|
) -> List[Chunk]:
|
||||||
"""
|
"""
|
||||||
Create Chunk entities from text segments.
|
Create Chunk entities from text strings.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
segments: List of (text, start_pos, end_pos) tuples
|
chunk_texts: List of chunk text strings
|
||||||
document_id: ID of parent document
|
document_id: ID of parent document
|
||||||
section_title: Optional section title
|
section_title: Optional section title
|
||||||
section_index: Optional section index
|
section_index: Optional section index
|
||||||
@ -301,13 +296,11 @@ class FixedSizeChunker(IChunker):
|
|||||||
"""
|
"""
|
||||||
chunks = []
|
chunks = []
|
||||||
|
|
||||||
for sequence_number, (text, start_char, end_char) in enumerate(segments):
|
for sequence_number, text in enumerate(chunk_texts):
|
||||||
chunk = Chunk(
|
chunk = Chunk(
|
||||||
document_id=document_id,
|
document_id=document_id,
|
||||||
content=text,
|
content=text,
|
||||||
sequence_number=sequence_number,
|
sequence_number=sequence_number,
|
||||||
start_char=start_char,
|
|
||||||
end_char=end_char,
|
|
||||||
section_title=section_title,
|
section_title=section_title,
|
||||||
section_index=section_index,
|
section_index=section_index,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -70,8 +70,8 @@ class ParagraphChunker(IChunker):
|
|||||||
chunks = self._chunk_by_sections(document, strategy)
|
chunks = self._chunk_by_sections(document, strategy)
|
||||||
else:
|
else:
|
||||||
# Standard chunking: process entire raw_markdown
|
# Standard chunking: process entire raw_markdown
|
||||||
segments = self._split_and_group_paragraphs(document.raw_markdown, strategy)
|
chunk_texts = self._split_and_group_paragraphs(document.raw_markdown, strategy)
|
||||||
chunks = self._create_chunks(segments, document.id)
|
chunks = self._create_chunks(chunk_texts, document.id)
|
||||||
|
|
||||||
logger.info(f"Created {len(chunks)} paragraph-based chunks")
|
logger.info(f"Created {len(chunks)} paragraph-based chunks")
|
||||||
return chunks
|
return chunks
|
||||||
@ -136,7 +136,7 @@ class ParagraphChunker(IChunker):
|
|||||||
self,
|
self,
|
||||||
text: str,
|
text: str,
|
||||||
strategy: ChunkingStrategy,
|
strategy: ChunkingStrategy,
|
||||||
) -> List[tuple[str, int, int]]:
|
) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Split text into paragraphs and group them into chunks.
|
Split text into paragraphs and group them into chunks.
|
||||||
|
|
||||||
@ -145,14 +145,14 @@ class ParagraphChunker(IChunker):
|
|||||||
strategy: Chunking strategy configuration
|
strategy: Chunking strategy configuration
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of (chunk_text, start_position, end_position) tuples
|
List of chunk text strings
|
||||||
"""
|
"""
|
||||||
# Split into paragraphs
|
# Split into paragraphs
|
||||||
paragraphs = logic_utils.split_into_paragraphs(text)
|
paragraphs = logic_utils.split_into_paragraphs(text)
|
||||||
|
|
||||||
if not paragraphs:
|
if not paragraphs:
|
||||||
# No paragraphs found, return whole text as single chunk
|
# No paragraphs found, return whole text as single chunk
|
||||||
return [(text, 0, len(text))]
|
return [text]
|
||||||
|
|
||||||
# Group paragraphs into chunks
|
# Group paragraphs into chunks
|
||||||
return self._group_paragraphs(paragraphs, strategy)
|
return self._group_paragraphs(paragraphs, strategy)
|
||||||
@ -161,7 +161,7 @@ class ParagraphChunker(IChunker):
|
|||||||
self,
|
self,
|
||||||
paragraphs: List[str],
|
paragraphs: List[str],
|
||||||
strategy: ChunkingStrategy,
|
strategy: ChunkingStrategy,
|
||||||
) -> List[tuple[str, int, int]]:
|
) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Group paragraphs into chunks based on target size.
|
Group paragraphs into chunks based on target size.
|
||||||
|
|
||||||
@ -170,12 +170,11 @@ class ParagraphChunker(IChunker):
|
|||||||
strategy: Chunking strategy
|
strategy: Chunking strategy
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of (chunk_text, start_pos, end_pos) tuples
|
List of chunk text strings
|
||||||
"""
|
"""
|
||||||
segments = []
|
segments = []
|
||||||
current_paragraphs = []
|
current_paragraphs = []
|
||||||
current_size = 0
|
current_size = 0
|
||||||
current_start = 0
|
|
||||||
|
|
||||||
for paragraph in paragraphs:
|
for paragraph in paragraphs:
|
||||||
para_size = len(paragraph)
|
para_size = len(paragraph)
|
||||||
@ -185,13 +184,11 @@ class ParagraphChunker(IChunker):
|
|||||||
current_size, para_size, strategy.chunk_size, current_paragraphs
|
current_size, para_size, strategy.chunk_size, current_paragraphs
|
||||||
):
|
):
|
||||||
# Create chunk from accumulated paragraphs
|
# Create chunk from accumulated paragraphs
|
||||||
segment = self._create_segment(
|
segment = self._create_segment(current_paragraphs)
|
||||||
current_paragraphs, current_start
|
|
||||||
)
|
|
||||||
segments.append(segment)
|
segments.append(segment)
|
||||||
|
|
||||||
# Handle overlap
|
# Handle overlap
|
||||||
current_paragraphs, current_start, current_size = (
|
current_paragraphs, current_size = (
|
||||||
self._handle_overlap(
|
self._handle_overlap(
|
||||||
segment, paragraph, para_size, strategy.overlap_size
|
segment, paragraph, para_size, strategy.overlap_size
|
||||||
)
|
)
|
||||||
@ -203,7 +200,7 @@ class ParagraphChunker(IChunker):
|
|||||||
|
|
||||||
# Add final chunk
|
# Add final chunk
|
||||||
if current_paragraphs:
|
if current_paragraphs:
|
||||||
segment = self._create_segment(current_paragraphs, current_start)
|
segment = self._create_segment(current_paragraphs)
|
||||||
segments.append(segment)
|
segments.append(segment)
|
||||||
|
|
||||||
logger.debug(
|
logger.debug(
|
||||||
@ -237,56 +234,49 @@ class ParagraphChunker(IChunker):
|
|||||||
def _create_segment(
|
def _create_segment(
|
||||||
self,
|
self,
|
||||||
paragraphs: List[str],
|
paragraphs: List[str],
|
||||||
start_pos: int,
|
) -> str:
|
||||||
) -> tuple[str, int, int]:
|
|
||||||
"""
|
"""
|
||||||
Create a segment from paragraphs.
|
Create a segment from paragraphs.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
paragraphs: List of paragraph strings
|
paragraphs: List of paragraph strings
|
||||||
start_pos: Starting position
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (chunk_text, start_pos, end_pos)
|
Chunk text string
|
||||||
"""
|
"""
|
||||||
chunk_text = "\n\n".join(paragraphs)
|
return "\n\n".join(paragraphs)
|
||||||
end_pos = start_pos + len(chunk_text)
|
|
||||||
return (chunk_text, start_pos, end_pos)
|
|
||||||
|
|
||||||
def _handle_overlap(
|
def _handle_overlap(
|
||||||
self,
|
self,
|
||||||
previous_segment: tuple[str, int, int],
|
previous_segment: str,
|
||||||
new_paragraph: str,
|
new_paragraph: str,
|
||||||
new_para_size: int,
|
new_para_size: int,
|
||||||
overlap_size: int,
|
overlap_size: int,
|
||||||
) -> tuple[List[str], int, int]:
|
) -> tuple[List[str], int]:
|
||||||
"""
|
"""
|
||||||
Handle overlap between chunks.
|
Handle overlap between chunks.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
previous_segment: Previous chunk segment
|
previous_segment: Previous chunk text
|
||||||
new_paragraph: New paragraph to start with
|
new_paragraph: New paragraph to start with
|
||||||
new_para_size: Size of new paragraph
|
new_para_size: Size of new paragraph
|
||||||
overlap_size: Desired overlap size
|
overlap_size: Desired overlap size
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (new_paragraphs, new_start, new_size)
|
Tuple of (new_paragraphs, new_size)
|
||||||
"""
|
"""
|
||||||
if overlap_size > 0:
|
if overlap_size > 0:
|
||||||
prev_text, _, prev_end = previous_segment
|
|
||||||
overlap_text = logic_utils.calculate_overlap_text(
|
overlap_text = logic_utils.calculate_overlap_text(
|
||||||
text=prev_text,
|
text=previous_segment,
|
||||||
overlap_size=overlap_size,
|
overlap_size=overlap_size,
|
||||||
from_start=False,
|
from_start=False,
|
||||||
)
|
)
|
||||||
return (
|
return (
|
||||||
[overlap_text, new_paragraph],
|
[overlap_text, new_paragraph],
|
||||||
prev_end - len(overlap_text),
|
|
||||||
len(overlap_text) + new_para_size,
|
len(overlap_text) + new_para_size,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
_, _, prev_end = previous_segment
|
return ([new_paragraph], new_para_size)
|
||||||
return ([new_paragraph], prev_end, new_para_size)
|
|
||||||
|
|
||||||
def _chunk_by_sections(
|
def _chunk_by_sections(
|
||||||
self,
|
self,
|
||||||
@ -313,11 +303,11 @@ class ParagraphChunker(IChunker):
|
|||||||
document_title = document.metadata.display_name
|
document_title = document.metadata.display_name
|
||||||
|
|
||||||
for section_index, section in enumerate(document.sections):
|
for section_index, section in enumerate(document.sections):
|
||||||
# Split this section's content into paragraph-based segments
|
# Split this section's content into paragraph-based chunks
|
||||||
segments = self._split_and_group_paragraphs(section.content, strategy)
|
chunk_texts = self._split_and_group_paragraphs(section.content, strategy)
|
||||||
|
|
||||||
# Create chunks for this section with title prefix
|
# Create chunks for this section with title prefix
|
||||||
for text, start_char, end_char in segments:
|
for text in chunk_texts:
|
||||||
# Prepend document title and section title to chunk content
|
# Prepend document title and section title to chunk content
|
||||||
prefixed_content = f"{document_title}\n{section.title}\n{text}"
|
prefixed_content = f"{document_title}\n{section.title}\n{text}"
|
||||||
|
|
||||||
@ -325,8 +315,6 @@ class ParagraphChunker(IChunker):
|
|||||||
document_id=document.id,
|
document_id=document.id,
|
||||||
content=prefixed_content,
|
content=prefixed_content,
|
||||||
sequence_number=global_sequence,
|
sequence_number=global_sequence,
|
||||||
start_char=start_char,
|
|
||||||
end_char=end_char,
|
|
||||||
section_title=section.title,
|
section_title=section.title,
|
||||||
section_index=section_index,
|
section_index=section_index,
|
||||||
)
|
)
|
||||||
@ -340,16 +328,16 @@ class ParagraphChunker(IChunker):
|
|||||||
|
|
||||||
def _create_chunks(
|
def _create_chunks(
|
||||||
self,
|
self,
|
||||||
segments: List[tuple[str, int, int]],
|
chunk_texts: List[str],
|
||||||
document_id,
|
document_id,
|
||||||
section_title: Optional[str] = None,
|
section_title: Optional[str] = None,
|
||||||
section_index: Optional[int] = None,
|
section_index: Optional[int] = None,
|
||||||
) -> List[Chunk]:
|
) -> List[Chunk]:
|
||||||
"""
|
"""
|
||||||
Create Chunk entities from text segments.
|
Create Chunk entities from text strings.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
segments: List of (text, start_pos, end_pos) tuples
|
chunk_texts: List of chunk text strings
|
||||||
document_id: ID of parent document
|
document_id: ID of parent document
|
||||||
section_title: Optional section title
|
section_title: Optional section title
|
||||||
section_index: Optional section index
|
section_index: Optional section index
|
||||||
@ -359,13 +347,11 @@ class ParagraphChunker(IChunker):
|
|||||||
"""
|
"""
|
||||||
chunks = []
|
chunks = []
|
||||||
|
|
||||||
for sequence_number, (text, start_char, end_char) in enumerate(segments):
|
for sequence_number, text in enumerate(chunk_texts):
|
||||||
chunk = Chunk(
|
chunk = Chunk(
|
||||||
document_id=document_id,
|
document_id=document_id,
|
||||||
content=text,
|
content=text,
|
||||||
sequence_number=sequence_number,
|
sequence_number=sequence_number,
|
||||||
start_char=start_char,
|
|
||||||
end_char=end_char,
|
|
||||||
section_title=section_title,
|
section_title=section_title,
|
||||||
section_index=section_index,
|
section_index=section_index,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -360,8 +360,6 @@ class Chunk(BaseModel):
|
|||||||
document_id: ID of the parent document
|
document_id: ID of the parent document
|
||||||
content: Text content of the chunk
|
content: Text content of the chunk
|
||||||
sequence_number: Order of this chunk in the document
|
sequence_number: Order of this chunk in the document
|
||||||
start_char: Starting character position in original document
|
|
||||||
end_char: Ending character position in original document
|
|
||||||
section_title: Title of the section this chunk belongs to
|
section_title: Title of the section this chunk belongs to
|
||||||
section_index: Index of the section in document.sections
|
section_index: Index of the section in document.sections
|
||||||
metadata: Optional metadata specific to this chunk
|
metadata: Optional metadata specific to this chunk
|
||||||
@ -370,8 +368,6 @@ class Chunk(BaseModel):
|
|||||||
document_id: UUID = Field(..., description="Parent document ID")
|
document_id: UUID = Field(..., description="Parent document ID")
|
||||||
content: str = Field(..., min_length=1, description="Chunk text content")
|
content: str = Field(..., min_length=1, description="Chunk text content")
|
||||||
sequence_number: int = Field(..., ge=0, description="Chunk order in document")
|
sequence_number: int = Field(..., ge=0, description="Chunk order in document")
|
||||||
start_char: int = Field(..., ge=0, description="Start position in document")
|
|
||||||
end_char: int = Field(..., gt=0, description="End position in document")
|
|
||||||
section_title: Optional[str] = Field(None, description="Section title")
|
section_title: Optional[str] = Field(None, description="Section title")
|
||||||
section_index: Optional[int] = Field(None, ge=0, description="Section index")
|
section_index: Optional[int] = Field(None, ge=0, description="Section index")
|
||||||
metadata: Dict[str, str] = Field(default_factory=dict)
|
metadata: Dict[str, str] = Field(default_factory=dict)
|
||||||
@ -380,27 +376,6 @@ class Chunk(BaseModel):
|
|||||||
"frozen": True, # Chunks are immutable
|
"frozen": True, # Chunks are immutable
|
||||||
}
|
}
|
||||||
|
|
||||||
@model_validator(mode='after')
|
|
||||||
def validate_position_consistency(self) -> 'Chunk':
|
|
||||||
"""Ensure end position is after start position."""
|
|
||||||
if self.end_char <= self.start_char:
|
|
||||||
raise ValueError(
|
|
||||||
f"end_char ({self.end_char}) must be greater than "
|
|
||||||
f"start_char ({self.start_char})"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Validate content length matches position range
|
|
||||||
content_length = len(self.content)
|
|
||||||
position_range = self.end_char - self.start_char
|
|
||||||
|
|
||||||
if abs(content_length - position_range) > 10: # Allow small variance
|
|
||||||
raise ValueError(
|
|
||||||
f"Content length ({content_length}) doesn't match "
|
|
||||||
f"position range ({position_range})"
|
|
||||||
)
|
|
||||||
|
|
||||||
return self
|
|
||||||
|
|
||||||
def get_length(self) -> int:
|
def get_length(self) -> int:
|
||||||
"""Get the length of the chunk content."""
|
"""Get the length of the chunk content."""
|
||||||
return len(self.content)
|
return len(self.content)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user