From 9e1e49bc5901c5bdb952a4ece782e59f15857b76 Mon Sep 17 00:00:00 2001
From: "m.dabbagh" <mostafadabbagh76@gmail.com>
Date: Sun, 25 Jan 2026 11:32:35 +0330
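Subject: [PATCH] add document title and section title to the beginning of each
 chunk in paragraph chunker

Remove the start_char/end_char position fields from the Chunk model and
the ChunkResponse API schema, and delete the
validate_position_consistency validator that compared content length
against the position range.

FixedSizeChunker and ParagraphChunker now pass plain chunk text strings
between their helpers instead of (text, start, end) tuples.

Note: API chunk responses no longer include start_char and end_char.
---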
 src/adapters/incoming/api_routes.py           |  2 -
 src/adapters/incoming/api_schemas.py          |  2 -
 .../outgoing/chunkers/fixed_size_chunker.py   | 41 +++++-------
 .../outgoing/chunkers/paragraph_chunker.py    | 66 ++++++++-----------
 src/core/domain/models.py                     | 25 -------
 5 files changed, 43 insertions(+), 93 deletions(-)

diff --git a/src/adapters/incoming/api_routes.py b/src/adapters/incoming/api_routes.py
index b5ee2c5..8367415 100644
--- a/src/adapters/incoming/api_routes.py
+++ b/src/adapters/incoming/api_routes.py
@@ -216,8 +216,6 @@ def to_chunk_responses(chunks: List[Chunk]) -> List[ChunkResponse]:
             document_id=str(chunk.document_id),
             content=chunk.content,
             sequence_number=chunk.sequence_number,
-            start_char=chunk.start_char,
-            end_char=chunk.end_char,
             length=chunk.get_length(),
         )
         for chunk in chunks
diff --git a/src/adapters/incoming/api_schemas.py b/src/adapters/incoming/api_schemas.py
index 616fb59..113b237 100644
--- a/src/adapters/incoming/api_schemas.py
+++ b/src/adapters/incoming/api_schemas.py
@@ -101,8 +101,6 @@ class ChunkResponse(BaseModel):
     document_id: str
     content: str
     sequence_number: int
-    start_char: int
-    end_char: int
     length: int
 
 
diff --git a/src/adapters/outgoing/chunkers/fixed_size_chunker.py b/src/adapters/outgoing/chunkers/fixed_size_chunker.py
index b4812a0..fd687ca 100644
--- a/src/adapters/outgoing/chunkers/fixed_size_chunker.py
+++ b/src/adapters/outgoing/chunkers/fixed_size_chunker.py
@@ -70,8 +70,8 @@ class FixedSizeChunker(IChunker):
                 chunks = self._chunk_by_sections(document, strategy)
             else:
                 # Standard chunking: process entire raw_markdown
-                segments = self._split_into_segments(document.raw_markdown, strategy)
-                chunks = self._create_chunks(segments, document.id)
+                chunk_texts = self._split_into_segments(document.raw_markdown, strategy)
+                chunks = self._create_chunks(chunk_texts, document.id)
 
             logger.info(f"Created {len(chunks)} fixed-size chunks")
             return chunks
@@ -136,7 +136,7 @@ class FixedSizeChunker(IChunker):
         self,
         text: str,
         strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
         """
         Split text into fixed-size segments.
 
@@ -145,7 +145,7 @@ class FixedSizeChunker(IChunker):
             strategy: Chunking strategy configuration
 
         Returns:
-            List of (chunk_text, start_position, end_position) tuples
+            List of chunk text strings
         """
         segments = []
         text_length = len(text)
@@ -155,7 +155,7 @@ class FixedSizeChunker(IChunker):
         position = 0
 
         while position < text_length:
-            segment = self._extract_segment(
+            chunk_text = self._extract_segment(
                 text=text,
                 position=position,
                 chunk_size=chunk_size,
@@ -163,10 +163,8 @@ class FixedSizeChunker(IChunker):
                 respect_boundaries=strategy.respect_boundaries,
             )
 
-            if segment:
-                chunk_text, start_pos, end_pos = segment
-                if chunk_text.strip():
-                    segments.append((chunk_text, start_pos, end_pos))
+            if chunk_text and chunk_text.strip():
+                segments.append(chunk_text)
 
             position += step_size
 
@@ -183,7 +181,7 @@ class FixedSizeChunker(IChunker):
         chunk_size: int,
         text_length: int,
         respect_boundaries: bool,
-    ) -> tuple[str, int, int] | None:
+    ) -> str:
         """
         Extract a single segment from text.
 
@@ -195,16 +193,15 @@ class FixedSizeChunker(IChunker):
             respect_boundaries: Whether to respect boundaries
 
         Returns:
-            Tuple of (chunk_text, start_pos, end_pos) or None
+            Chunk text string
         """
         end_pos = min(position + chunk_size, text_length)
         chunk_text = text[position:end_pos]
 
         if respect_boundaries and end_pos < text_length:
             chunk_text = self._adjust_to_boundary(text, position, end_pos)
-            end_pos = position + len(chunk_text)
 
-        return (chunk_text, position, end_pos)
+        return chunk_text
 
     def _adjust_to_boundary(
         self,
@@ -258,17 +255,15 @@ class FixedSizeChunker(IChunker):
         global_sequence = 0
 
         for section_index, section in enumerate(document.sections):
-            # Split this section's content into segments
-            segments = self._split_into_segments(section.content, strategy)
+            # Split this section's content into chunks
+            chunk_texts = self._split_into_segments(section.content, strategy)
 
             # Create chunks for this section
-            for text, start_char, end_char in segments:
+            for text in chunk_texts:
                 chunk = Chunk(
                     document_id=document.id,
                     content=text,
                     sequence_number=global_sequence,
-                    start_char=start_char,
-                    end_char=end_char,
                     section_title=section.title,
                     section_index=section_index,
                 )
@@ -282,16 +277,16 @@ class FixedSizeChunker(IChunker):
 
     def _create_chunks(
         self,
-        segments: List[tuple[str, int, int]],
+        chunk_texts: List[str],
         document_id,
         section_title: Optional[str] = None,
         section_index: Optional[int] = None,
     ) -> List[Chunk]:
         """
-        Create Chunk entities from text segments.
+        Create Chunk entities from text strings.
 
         Args:
-            segments: List of (text, start_pos, end_pos) tuples
+            chunk_texts: List of chunk text strings
             document_id: ID of parent document
             section_title: Optional section title
             section_index: Optional section index
@@ -301,13 +296,11 @@ class FixedSizeChunker(IChunker):
         """
         chunks = []
 
-        for sequence_number, (text, start_char, end_char) in enumerate(segments):
+        for sequence_number, text in enumerate(chunk_texts):
             chunk = Chunk(
                 document_id=document_id,
                 content=text,
                 sequence_number=sequence_number,
-                start_char=start_char,
-                end_char=end_char,
                 section_title=section_title,
                 section_index=section_index,
             )
diff --git a/src/adapters/outgoing/chunkers/paragraph_chunker.py b/src/adapters/outgoing/chunkers/paragraph_chunker.py
index a938f58..105be6d 100644
--- a/src/adapters/outgoing/chunkers/paragraph_chunker.py
+++ b/src/adapters/outgoing/chunkers/paragraph_chunker.py
@@ -70,8 +70,8 @@ class ParagraphChunker(IChunker):
                 chunks = self._chunk_by_sections(document, strategy)
             else:
                 # Standard chunking: process entire raw_markdown
-                segments = self._split_and_group_paragraphs(document.raw_markdown, strategy)
-                chunks = self._create_chunks(segments, document.id)
+                chunk_texts = self._split_and_group_paragraphs(document.raw_markdown, strategy)
+                chunks = self._create_chunks(chunk_texts, document.id)
 
             logger.info(f"Created {len(chunks)} paragraph-based chunks")
             return chunks
@@ -136,7 +136,7 @@ class ParagraphChunker(IChunker):
         self,
         text: str,
         strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
         """
         Split text into paragraphs and group them into chunks.
 
@@ -145,14 +145,14 @@ class ParagraphChunker(IChunker):
             strategy: Chunking strategy configuration
 
         Returns:
-            List of (chunk_text, start_position, end_position) tuples
+            List of chunk text strings
         """
         # Split into paragraphs
         paragraphs = logic_utils.split_into_paragraphs(text)
 
         if not paragraphs:
             # No paragraphs found, return whole text as single chunk
-            return [(text, 0, len(text))]
+            return [text]
 
         # Group paragraphs into chunks
         return self._group_paragraphs(paragraphs, strategy)
@@ -161,7 +161,7 @@ class ParagraphChunker(IChunker):
         self,
         paragraphs: List[str],
         strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
         """
         Group paragraphs into chunks based on target size.
 
@@ -170,12 +170,11 @@ class ParagraphChunker(IChunker):
             strategy: Chunking strategy
 
         Returns:
-            List of (chunk_text, start_pos, end_pos) tuples
+            List of chunk text strings
         """
         segments = []
         current_paragraphs = []
         current_size = 0
-        current_start = 0
 
         for paragraph in paragraphs:
             para_size = len(paragraph)
@@ -185,13 +184,11 @@ class ParagraphChunker(IChunker):
                 current_size, para_size, strategy.chunk_size, current_paragraphs
             ):
                 # Create chunk from accumulated paragraphs
-                segment = self._create_segment(
-                    current_paragraphs, current_start
-                )
+                segment = self._create_segment(current_paragraphs)
                 segments.append(segment)
 
                 # Handle overlap
-                current_paragraphs, current_start, current_size = (
+                current_paragraphs, current_size = (
                     self._handle_overlap(
                         segment, paragraph, para_size, strategy.overlap_size
                     )
@@ -203,7 +200,7 @@ class ParagraphChunker(IChunker):
 
         # Add final chunk
         if current_paragraphs:
-            segment = self._create_segment(current_paragraphs, current_start)
+            segment = self._create_segment(current_paragraphs)
             segments.append(segment)
 
         logger.debug(
@@ -237,56 +234,49 @@ class ParagraphChunker(IChunker):
     def _create_segment(
         self,
         paragraphs: List[str],
-        start_pos: int,
-    ) -> tuple[str, int, int]:
+    ) -> str:
         """
         Create a segment from paragraphs.
 
         Args:
             paragraphs: List of paragraph strings
-            start_pos: Starting position
 
         Returns:
-            Tuple of (chunk_text, start_pos, end_pos)
+            Chunk text string
         """
-        chunk_text = "\n\n".join(paragraphs)
-        end_pos = start_pos + len(chunk_text)
-        return (chunk_text, start_pos, end_pos)
+        return "\n\n".join(paragraphs)
 
     def _handle_overlap(
         self,
-        previous_segment: tuple[str, int, int],
+        previous_segment: str,
         new_paragraph: str,
         new_para_size: int,
         overlap_size: int,
-    ) -> tuple[List[str], int, int]:
+    ) -> tuple[List[str], int]:
         """
         Handle overlap between chunks.
 
         Args:
-            previous_segment: Previous chunk segment
+            previous_segment: Previous chunk text
             new_paragraph: New paragraph to start with
             new_para_size: Size of new paragraph
             overlap_size: Desired overlap size
 
         Returns:
-            Tuple of (new_paragraphs, new_start, new_size)
+            Tuple of (new_paragraphs, new_size)
         """
         if overlap_size > 0:
-            prev_text, _, prev_end = previous_segment
             overlap_text = logic_utils.calculate_overlap_text(
-                text=prev_text,
+                text=previous_segment,
                 overlap_size=overlap_size,
                 from_start=False,
             )
             return (
                 [overlap_text, new_paragraph],
-                prev_end - len(overlap_text),
                 len(overlap_text) + new_para_size,
             )
         else:
-            _, _, prev_end = previous_segment
-            return ([new_paragraph], prev_end, new_para_size)
+            return ([new_paragraph], new_para_size)
 
     def _chunk_by_sections(
         self,
@@ -313,11 +303,11 @@ class ParagraphChunker(IChunker):
         document_title = document.metadata.display_name
 
         for section_index, section in enumerate(document.sections):
-            # Split this section's content into paragraph-based segments
-            segments = self._split_and_group_paragraphs(section.content, strategy)
+            # Split this section's content into paragraph-based chunks
+            chunk_texts = self._split_and_group_paragraphs(section.content, strategy)
 
             # Create chunks for this section with title prefix
-            for text, start_char, end_char in segments:
+            for text in chunk_texts:
                 # Prepend document title and section title to chunk content
                 prefixed_content = f"{document_title}\n{section.title}\n{text}"
 
@@ -325,8 +315,6 @@ class ParagraphChunker(IChunker):
                     document_id=document.id,
                     content=prefixed_content,
                     sequence_number=global_sequence,
-                    start_char=start_char,
-                    end_char=end_char,
                     section_title=section.title,
                     section_index=section_index,
                 )
@@ -340,16 +328,16 @@ class ParagraphChunker(IChunker):
 
     def _create_chunks(
         self,
-        segments: List[tuple[str, int, int]],
+        chunk_texts: List[str],
         document_id,
         section_title: Optional[str] = None,
         section_index: Optional[int] = None,
     ) -> List[Chunk]:
         """
-        Create Chunk entities from text segments.
+        Create Chunk entities from text strings.
 
         Args:
-            segments: List of (text, start_pos, end_pos) tuples
+            chunk_texts: List of chunk text strings
             document_id: ID of parent document
             section_title: Optional section title
             section_index: Optional section index
@@ -359,13 +347,11 @@ class ParagraphChunker(IChunker):
         """
         chunks = []
 
-        for sequence_number, (text, start_char, end_char) in enumerate(segments):
+        for sequence_number, text in enumerate(chunk_texts):
             chunk = Chunk(
                 document_id=document_id,
                 content=text,
                 sequence_number=sequence_number,
-                start_char=start_char,
-                end_char=end_char,
                 section_title=section_title,
                 section_index=section_index,
             )
diff --git a/src/core/domain/models.py b/src/core/domain/models.py
index 4bd2914..4c37edf 100644
--- a/src/core/domain/models.py
+++ b/src/core/domain/models.py
@@ -360,8 +360,6 @@ class Chunk(BaseModel):
         document_id: ID of the parent document
         content: Text content of the chunk
         sequence_number: Order of this chunk in the document
-        start_char: Starting character position in original document
-        end_char: Ending character position in original document
         section_title: Title of the section this chunk belongs to
         section_index: Index of the section in document.sections
         metadata: Optional metadata specific to this chunk
@@ -370,8 +368,6 @@ class Chunk(BaseModel):
     document_id: UUID = Field(..., description="Parent document ID")
     content: str = Field(..., min_length=1, description="Chunk text content")
     sequence_number: int = Field(..., ge=0, description="Chunk order in document")
-    start_char: int = Field(..., ge=0, description="Start position in document")
-    end_char: int = Field(..., gt=0, description="End position in document")
     section_title: Optional[str] = Field(None, description="Section title")
     section_index: Optional[int] = Field(None, ge=0, description="Section index")
     metadata: Dict[str, str] = Field(default_factory=dict)
@@ -380,27 +376,6 @@ class Chunk(BaseModel):
         "frozen": True,  # Chunks are immutable
     }
 
-    @model_validator(mode='after')
-    def validate_position_consistency(self) -> 'Chunk':
-        """Ensure end position is after start position."""
-        if self.end_char <= self.start_char:
-            raise ValueError(
-                f"end_char ({self.end_char}) must be greater than "
-                f"start_char ({self.start_char})"
-            )
-
-        # Validate content length matches position range
-        content_length = len(self.content)
-        position_range = self.end_char - self.start_char
-
-        if abs(content_length - position_range) > 10:  # Allow small variance
-            raise ValueError(
-                f"Content length ({content_length}) doesn't match "
-                f"position range ({position_range})"
-            )
-
-        return self
-
     def get_length(self) -> int:
         """Get the length of the chunk content."""
         return len(self.content)