From 9e1e49bc5901c5bdb952a4ece782e59f15857b76 Mon Sep 17 00:00:00 2001 From: "m.dabbagh" Date: Sun, 25 Jan 2026 11:32:35 +0330 Subject: [PATCH] add document title and section title to the beginning of each chunk in paragraph chunker --- src/adapters/incoming/api_routes.py | 2 - src/adapters/incoming/api_schemas.py | 2 - .../outgoing/chunkers/fixed_size_chunker.py | 41 +++++------- .../outgoing/chunkers/paragraph_chunker.py | 66 ++++++++----------- src/core/domain/models.py | 25 ------- 5 files changed, 43 insertions(+), 93 deletions(-) diff --git a/src/adapters/incoming/api_routes.py b/src/adapters/incoming/api_routes.py index b5ee2c5..8367415 100644 --- a/src/adapters/incoming/api_routes.py +++ b/src/adapters/incoming/api_routes.py @@ -216,8 +216,6 @@ def to_chunk_responses(chunks: List[Chunk]) -> List[ChunkResponse]: document_id=str(chunk.document_id), content=chunk.content, sequence_number=chunk.sequence_number, - start_char=chunk.start_char, - end_char=chunk.end_char, length=chunk.get_length(), ) for chunk in chunks diff --git a/src/adapters/incoming/api_schemas.py b/src/adapters/incoming/api_schemas.py index 616fb59..113b237 100644 --- a/src/adapters/incoming/api_schemas.py +++ b/src/adapters/incoming/api_schemas.py @@ -101,8 +101,6 @@ class ChunkResponse(BaseModel): document_id: str content: str sequence_number: int - start_char: int - end_char: int length: int diff --git a/src/adapters/outgoing/chunkers/fixed_size_chunker.py b/src/adapters/outgoing/chunkers/fixed_size_chunker.py index b4812a0..fd687ca 100644 --- a/src/adapters/outgoing/chunkers/fixed_size_chunker.py +++ b/src/adapters/outgoing/chunkers/fixed_size_chunker.py @@ -70,8 +70,8 @@ class FixedSizeChunker(IChunker): chunks = self._chunk_by_sections(document, strategy) else: # Standard chunking: process entire raw_markdown - segments = self._split_into_segments(document.raw_markdown, strategy) - chunks = self._create_chunks(segments, document.id) + chunk_texts = self._split_into_segments(document.raw_markdown, strategy) + chunks = self._create_chunks(chunk_texts, document.id) logger.info(f"Created {len(chunks)} fixed-size chunks") return chunks @@ -136,7 +136,7 @@ class FixedSizeChunker(IChunker): self, text: str, strategy: ChunkingStrategy, - ) -> List[tuple[str, int, int]]: + ) -> List[str]: """ Split text into fixed-size segments. @@ -145,7 +145,7 @@ class FixedSizeChunker(IChunker): strategy: Chunking strategy configuration Returns: - List of (chunk_text, start_position, end_position) tuples + List of chunk text strings """ segments = [] text_length = len(text) @@ -155,7 +155,7 @@ class FixedSizeChunker(IChunker): position = 0 while position < text_length: - segment = self._extract_segment( + chunk_text = self._extract_segment( text=text, position=position, chunk_size=chunk_size, @@ -163,10 +163,8 @@ class FixedSizeChunker(IChunker): respect_boundaries=strategy.respect_boundaries, ) - if segment: - chunk_text, start_pos, end_pos = segment - if chunk_text.strip(): - segments.append((chunk_text, start_pos, end_pos)) + if chunk_text and chunk_text.strip(): + segments.append(chunk_text) position += step_size @@ -183,7 +181,7 @@ class FixedSizeChunker(IChunker): chunk_size: int, text_length: int, respect_boundaries: bool, - ) -> tuple[str, int, int] | None: + ) -> str: """ Extract a single segment from text. @@ -195,16 +193,15 @@ class FixedSizeChunker(IChunker): respect_boundaries: Whether to respect boundaries Returns: - Tuple of (chunk_text, start_pos, end_pos) or None + Chunk text string """ end_pos = min(position + chunk_size, text_length) chunk_text = text[position:end_pos] if respect_boundaries and end_pos < text_length: chunk_text = self._adjust_to_boundary(text, position, end_pos) - end_pos = position + len(chunk_text) - return (chunk_text, position, end_pos) + return chunk_text def _adjust_to_boundary( self, @@ -258,17 +255,15 @@ class FixedSizeChunker(IChunker): global_sequence = 0 for section_index, section in enumerate(document.sections): - # Split this section's content into segments - segments = self._split_into_segments(section.content, strategy) + # Split this section's content into chunks + chunk_texts = self._split_into_segments(section.content, strategy) # Create chunks for this section - for text, start_char, end_char in segments: + for text in chunk_texts: chunk = Chunk( document_id=document.id, content=text, sequence_number=global_sequence, - start_char=start_char, - end_char=end_char, section_title=section.title, section_index=section_index, ) @@ -282,16 +277,16 @@ class FixedSizeChunker(IChunker): def _create_chunks( self, - segments: List[tuple[str, int, int]], + chunk_texts: List[str], document_id, section_title: Optional[str] = None, section_index: Optional[int] = None, ) -> List[Chunk]: """ - Create Chunk entities from text segments. + Create Chunk entities from text strings. Args: - segments: List of (text, start_pos, end_pos) tuples + chunk_texts: List of chunk text strings document_id: ID of parent document section_title: Optional section title section_index: Optional section index @@ -301,13 +296,11 @@ class FixedSizeChunker(IChunker): """ chunks = [] - for sequence_number, (text, start_char, end_char) in enumerate(segments): + for sequence_number, text in enumerate(chunk_texts): chunk = Chunk( document_id=document_id, content=text, sequence_number=sequence_number, - start_char=start_char, - end_char=end_char, section_title=section_title, section_index=section_index, ) diff --git a/src/adapters/outgoing/chunkers/paragraph_chunker.py b/src/adapters/outgoing/chunkers/paragraph_chunker.py index a938f58..105be6d 100644 --- a/src/adapters/outgoing/chunkers/paragraph_chunker.py +++ b/src/adapters/outgoing/chunkers/paragraph_chunker.py @@ -70,8 +70,8 @@ class ParagraphChunker(IChunker): chunks = self._chunk_by_sections(document, strategy) else: # Standard chunking: process entire raw_markdown - segments = self._split_and_group_paragraphs(document.raw_markdown, strategy) - chunks = self._create_chunks(segments, document.id) + chunk_texts = self._split_and_group_paragraphs(document.raw_markdown, strategy) + chunks = self._create_chunks(chunk_texts, document.id) logger.info(f"Created {len(chunks)} paragraph-based chunks") return chunks @@ -136,7 +136,7 @@ class ParagraphChunker(IChunker): self, text: str, strategy: ChunkingStrategy, - ) -> List[tuple[str, int, int]]: + ) -> List[str]: """ Split text into paragraphs and group them into chunks. @@ -145,14 +145,14 @@ class ParagraphChunker(IChunker): strategy: Chunking strategy configuration Returns: - List of (chunk_text, start_position, end_position) tuples + List of chunk text strings """ # Split into paragraphs paragraphs = logic_utils.split_into_paragraphs(text) if not paragraphs: # No paragraphs found, return whole text as single chunk - return [(text, 0, len(text))] + return [text] # Group paragraphs into chunks return self._group_paragraphs(paragraphs, strategy) @@ -161,7 +161,7 @@ class ParagraphChunker(IChunker): self, paragraphs: List[str], strategy: ChunkingStrategy, - ) -> List[tuple[str, int, int]]: + ) -> List[str]: """ Group paragraphs into chunks based on target size. @@ -170,12 +170,11 @@ class ParagraphChunker(IChunker): strategy: Chunking strategy Returns: - List of (chunk_text, start_pos, end_pos) tuples + List of chunk text strings """ segments = [] current_paragraphs = [] current_size = 0 - current_start = 0 for paragraph in paragraphs: para_size = len(paragraph) @@ -185,13 +184,11 @@ class ParagraphChunker(IChunker): current_size, para_size, strategy.chunk_size, current_paragraphs ): # Create chunk from accumulated paragraphs - segment = self._create_segment( - current_paragraphs, current_start - ) + segment = self._create_segment(current_paragraphs) segments.append(segment) # Handle overlap - current_paragraphs, current_start, current_size = ( + current_paragraphs, current_size = ( self._handle_overlap( segment, paragraph, para_size, strategy.overlap_size ) @@ -203,7 +200,7 @@ class ParagraphChunker(IChunker): # Add final chunk if current_paragraphs: - segment = self._create_segment(current_paragraphs, current_start) + segment = self._create_segment(current_paragraphs) segments.append(segment) logger.debug( @@ -237,56 +234,49 @@ class ParagraphChunker(IChunker): def _create_segment( self, paragraphs: List[str], - start_pos: int, - ) -> tuple[str, int, int]: + ) -> str: """ Create a segment from paragraphs. Args: paragraphs: List of paragraph strings - start_pos: Starting position Returns: - Tuple of (chunk_text, start_pos, end_pos) + Chunk text string """ - chunk_text = "\n\n".join(paragraphs) - end_pos = start_pos + len(chunk_text) - return (chunk_text, start_pos, end_pos) + return "\n\n".join(paragraphs) def _handle_overlap( self, - previous_segment: tuple[str, int, int], + previous_segment: str, new_paragraph: str, new_para_size: int, overlap_size: int, - ) -> tuple[List[str], int, int]: + ) -> tuple[List[str], int]: """ Handle overlap between chunks. Args: - previous_segment: Previous chunk segment + previous_segment: Previous chunk text new_paragraph: New paragraph to start with new_para_size: Size of new paragraph overlap_size: Desired overlap size Returns: - Tuple of (new_paragraphs, new_start, new_size) + Tuple of (new_paragraphs, new_size) """ if overlap_size > 0: - prev_text, _, prev_end = previous_segment overlap_text = logic_utils.calculate_overlap_text( - text=prev_text, + text=previous_segment, overlap_size=overlap_size, from_start=False, ) return ( [overlap_text, new_paragraph], - prev_end - len(overlap_text), len(overlap_text) + new_para_size, ) else: - _, _, prev_end = previous_segment - return ([new_paragraph], prev_end, new_para_size) + return ([new_paragraph], new_para_size) def _chunk_by_sections( self, @@ -313,11 +303,11 @@ class ParagraphChunker(IChunker): document_title = document.metadata.display_name for section_index, section in enumerate(document.sections): - # Split this section's content into paragraph-based segments - segments = self._split_and_group_paragraphs(section.content, strategy) + # Split this section's content into paragraph-based chunks + chunk_texts = self._split_and_group_paragraphs(section.content, strategy) # Create chunks for this section with title prefix - for text, start_char, end_char in segments: + for text in chunk_texts: # Prepend document title and section title to chunk content prefixed_content = f"{document_title}\n{section.title}\n{text}" @@ -325,8 +315,6 @@ class ParagraphChunker(IChunker): document_id=document.id, content=prefixed_content, sequence_number=global_sequence, - start_char=start_char, - end_char=end_char, section_title=section.title, section_index=section_index, ) @@ -340,16 +328,16 @@ class ParagraphChunker(IChunker): def _create_chunks( self, - segments: List[tuple[str, int, int]], + chunk_texts: List[str], document_id, section_title: Optional[str] = None, section_index: Optional[int] = None, ) -> List[Chunk]: """ - Create Chunk entities from text segments. + Create Chunk entities from text strings. Args: - segments: List of (text, start_pos, end_pos) tuples + chunk_texts: List of chunk text strings document_id: ID of parent document section_title: Optional section title section_index: Optional section index @@ -359,13 +347,11 @@ class ParagraphChunker(IChunker): """ chunks = [] - for sequence_number, (text, start_char, end_char) in enumerate(segments): + for sequence_number, text in enumerate(chunk_texts): chunk = Chunk( document_id=document_id, content=text, sequence_number=sequence_number, - start_char=start_char, - end_char=end_char, section_title=section_title, section_index=section_index, ) diff --git a/src/core/domain/models.py b/src/core/domain/models.py index 4bd2914..4c37edf 100644 --- a/src/core/domain/models.py +++ b/src/core/domain/models.py @@ -360,8 +360,6 @@ class Chunk(BaseModel): document_id: ID of the parent document content: Text content of the chunk sequence_number: Order of this chunk in the document - start_char: Starting character position in original document - end_char: Ending character position in original document section_title: Title of the section this chunk belongs to section_index: Index of the section in document.sections metadata: Optional metadata specific to this chunk @@ -370,8 +368,6 @@ class Chunk(BaseModel): document_id: UUID = Field(..., description="Parent document ID") content: str = Field(..., min_length=1, description="Chunk text content") sequence_number: int = Field(..., ge=0, description="Chunk order in document") - start_char: int = Field(..., ge=0, description="Start position in document") - end_char: int = Field(..., gt=0, description="End position in document") section_title: Optional[str] = Field(None, description="Section title") section_index: Optional[int] = Field(None, ge=0, description="Section index") metadata: Dict[str, str] = Field(default_factory=dict) @@ -380,27 +376,6 @@ class Chunk(BaseModel): "frozen": True, # Chunks are immutable } - @model_validator(mode='after') - def validate_position_consistency(self) -> 'Chunk': - """Ensure end position is after start position.""" - if self.end_char <= self.start_char: - raise ValueError( - f"end_char ({self.end_char}) must be greater than " - f"start_char ({self.start_char})" - ) - - # Validate content length matches position range - content_length = len(self.content) - position_range = self.end_char - self.start_char - - if abs(content_length - position_range) > 10: # Allow small variance - raise ValueError( - f"Content length ({content_length}) doesn't match " - f"position range ({position_range})" - ) - - return self - def get_length(self) -> int: """Get the length of the chunk content.""" return len(self.content)