Add the document title and section title to the beginning of each chunk in the paragraph chunker

2026-01-25 11:32:35 +03:30 · 2026-01-25 11:32:35 +03:30 · 9e1e49bc59
commit 9e1e49bc59
parent cda128e438
5 changed files with 43 additions and 93 deletions
--- a/src/adapters/incoming/api_routes.py
+++ b/src/adapters/incoming/api_routes.py
@ -216,8 +216,6 @@ def to_chunk_responses(chunks: List[Chunk]) -> List[ChunkResponse]:
            document_id=str(chunk.document_id),
            content=chunk.content,
            sequence_number=chunk.sequence_number,
-            start_char=chunk.start_char,
-            end_char=chunk.end_char,
            length=chunk.get_length(),
        )
        for chunk in chunks
--- a/src/adapters/incoming/api_schemas.py
+++ b/src/adapters/incoming/api_schemas.py
@ -101,8 +101,6 @@ class ChunkResponse(BaseModel):
    document_id: str
    content: str
    sequence_number: int
-    start_char: int
-    end_char: int
    length: int


--- a/src/adapters/outgoing/chunkers/fixed_size_chunker.py
+++ b/src/adapters/outgoing/chunkers/fixed_size_chunker.py
@ -70,8 +70,8 @@ class FixedSizeChunker(IChunker):
                chunks = self._chunk_by_sections(document, strategy)
            else:
                # Standard chunking: process entire raw_markdown
-                segments = self._split_into_segments(document.raw_markdown, strategy)
-                chunks = self._create_chunks(segments, document.id)
+                chunk_texts = self._split_into_segments(document.raw_markdown, strategy)
+                chunks = self._create_chunks(chunk_texts, document.id)

            logger.info(f"Created {len(chunks)} fixed-size chunks")
            return chunks
@ -136,7 +136,7 @@ class FixedSizeChunker(IChunker):
        self,
        text: str,
        strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
        """
        Split text into fixed-size segments.

@ -145,7 +145,7 @@ class FixedSizeChunker(IChunker):
            strategy: Chunking strategy configuration

        Returns:
-            List of (chunk_text, start_position, end_position) tuples
+            List of chunk text strings
        """
        segments = []
        text_length = len(text)
@ -155,7 +155,7 @@ class FixedSizeChunker(IChunker):
        position = 0

        while position < text_length:
-            segment = self._extract_segment(
+            chunk_text = self._extract_segment(
                text=text,
                position=position,
                chunk_size=chunk_size,
@ -163,10 +163,8 @@ class FixedSizeChunker(IChunker):
                respect_boundaries=strategy.respect_boundaries,
            )

-            if segment:
-                chunk_text, start_pos, end_pos = segment
-                if chunk_text.strip():
-                    segments.append((chunk_text, start_pos, end_pos))
+            if chunk_text and chunk_text.strip():
+                segments.append(chunk_text)

            position += step_size

@ -183,7 +181,7 @@ class FixedSizeChunker(IChunker):
        chunk_size: int,
        text_length: int,
        respect_boundaries: bool,
-    ) -> tuple[str, int, int] | None:
+    ) -> str:
        """
        Extract a single segment from text.

@ -195,16 +193,15 @@ class FixedSizeChunker(IChunker):
            respect_boundaries: Whether to respect boundaries

        Returns:
-            Tuple of (chunk_text, start_pos, end_pos) or None
+            Chunk text string
        """
        end_pos = min(position + chunk_size, text_length)
        chunk_text = text[position:end_pos]

        if respect_boundaries and end_pos < text_length:
            chunk_text = self._adjust_to_boundary(text, position, end_pos)
-            end_pos = position + len(chunk_text)

-        return (chunk_text, position, end_pos)
+        return chunk_text

    def _adjust_to_boundary(
        self,
@ -258,17 +255,15 @@ class FixedSizeChunker(IChunker):
        global_sequence = 0

        for section_index, section in enumerate(document.sections):
-            # Split this section's content into segments
-            segments = self._split_into_segments(section.content, strategy)
+            # Split this section's content into chunks
+            chunk_texts = self._split_into_segments(section.content, strategy)

            # Create chunks for this section
-            for text, start_char, end_char in segments:
+            for text in chunk_texts:
                chunk = Chunk(
                    document_id=document.id,
                    content=text,
                    sequence_number=global_sequence,
-                    start_char=start_char,
-                    end_char=end_char,
                    section_title=section.title,
                    section_index=section_index,
                )
@ -282,16 +277,16 @@ class FixedSizeChunker(IChunker):

    def _create_chunks(
        self,
-        segments: List[tuple[str, int, int]],
+        chunk_texts: List[str],
        document_id,
        section_title: Optional[str] = None,
        section_index: Optional[int] = None,
    ) -> List[Chunk]:
        """
-        Create Chunk entities from text segments.
+        Create Chunk entities from text strings.

        Args:
-            segments: List of (text, start_pos, end_pos) tuples
+            chunk_texts: List of chunk text strings
            document_id: ID of parent document
            section_title: Optional section title
            section_index: Optional section index
@ -301,13 +296,11 @@ class FixedSizeChunker(IChunker):
        """
        chunks = []

-        for sequence_number, (text, start_char, end_char) in enumerate(segments):
+        for sequence_number, text in enumerate(chunk_texts):
            chunk = Chunk(
                document_id=document_id,
                content=text,
                sequence_number=sequence_number,
-                start_char=start_char,
-                end_char=end_char,
                section_title=section_title,
                section_index=section_index,
            )
--- a/src/adapters/outgoing/chunkers/paragraph_chunker.py
+++ b/src/adapters/outgoing/chunkers/paragraph_chunker.py
@ -70,8 +70,8 @@ class ParagraphChunker(IChunker):
                chunks = self._chunk_by_sections(document, strategy)
            else:
                # Standard chunking: process entire raw_markdown
-                segments = self._split_and_group_paragraphs(document.raw_markdown, strategy)
-                chunks = self._create_chunks(segments, document.id)
+                chunk_texts = self._split_and_group_paragraphs(document.raw_markdown, strategy)
+                chunks = self._create_chunks(chunk_texts, document.id)

            logger.info(f"Created {len(chunks)} paragraph-based chunks")
            return chunks
@ -136,7 +136,7 @@ class ParagraphChunker(IChunker):
        self,
        text: str,
        strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
        """
        Split text into paragraphs and group them into chunks.

@ -145,14 +145,14 @@ class ParagraphChunker(IChunker):
            strategy: Chunking strategy configuration

        Returns:
-            List of (chunk_text, start_position, end_position) tuples
+            List of chunk text strings
        """
        # Split into paragraphs
        paragraphs = logic_utils.split_into_paragraphs(text)

        if not paragraphs:
            # No paragraphs found, return whole text as single chunk
-            return [(text, 0, len(text))]
+            return [text]

        # Group paragraphs into chunks
        return self._group_paragraphs(paragraphs, strategy)
@ -161,7 +161,7 @@ class ParagraphChunker(IChunker):
        self,
        paragraphs: List[str],
        strategy: ChunkingStrategy,
-    ) -> List[tuple[str, int, int]]:
+    ) -> List[str]:
        """
        Group paragraphs into chunks based on target size.

@ -170,12 +170,11 @@ class ParagraphChunker(IChunker):
            strategy: Chunking strategy

        Returns:
-            List of (chunk_text, start_pos, end_pos) tuples
+            List of chunk text strings
        """
        segments = []
        current_paragraphs = []
        current_size = 0
-        current_start = 0

        for paragraph in paragraphs:
            para_size = len(paragraph)
@ -185,13 +184,11 @@ class ParagraphChunker(IChunker):
                current_size, para_size, strategy.chunk_size, current_paragraphs
            ):
                # Create chunk from accumulated paragraphs
-                segment = self._create_segment(
-                    current_paragraphs, current_start
-                )
+                segment = self._create_segment(current_paragraphs)
                segments.append(segment)

                # Handle overlap
-                current_paragraphs, current_start, current_size = (
+                current_paragraphs, current_size = (
                    self._handle_overlap(
                        segment, paragraph, para_size, strategy.overlap_size
                    )
@ -203,7 +200,7 @@ class ParagraphChunker(IChunker):

        # Add final chunk
        if current_paragraphs:
-            segment = self._create_segment(current_paragraphs, current_start)
+            segment = self._create_segment(current_paragraphs)
            segments.append(segment)

        logger.debug(
@ -237,56 +234,49 @@ class ParagraphChunker(IChunker):
    def _create_segment(
        self,
        paragraphs: List[str],
-        start_pos: int,
-    ) -> tuple[str, int, int]:
+    ) -> str:
        """
        Create a segment from paragraphs.

        Args:
            paragraphs: List of paragraph strings
-            start_pos: Starting position

        Returns:
-            Tuple of (chunk_text, start_pos, end_pos)
+            Chunk text string
        """
-        chunk_text = "\n\n".join(paragraphs)
-        end_pos = start_pos + len(chunk_text)
-        return (chunk_text, start_pos, end_pos)
+        return "\n\n".join(paragraphs)

    def _handle_overlap(
        self,
-        previous_segment: tuple[str, int, int],
+        previous_segment: str,
        new_paragraph: str,
        new_para_size: int,
        overlap_size: int,
-    ) -> tuple[List[str], int, int]:
+    ) -> tuple[List[str], int]:
        """
        Handle overlap between chunks.

        Args:
-            previous_segment: Previous chunk segment
+            previous_segment: Previous chunk text
            new_paragraph: New paragraph to start with
            new_para_size: Size of new paragraph
            overlap_size: Desired overlap size

        Returns:
-            Tuple of (new_paragraphs, new_start, new_size)
+            Tuple of (new_paragraphs, new_size)
        """
        if overlap_size > 0:
-            prev_text, _, prev_end = previous_segment
            overlap_text = logic_utils.calculate_overlap_text(
-                text=prev_text,
+                text=previous_segment,
                overlap_size=overlap_size,
                from_start=False,
            )
            return (
                [overlap_text, new_paragraph],
-                prev_end - len(overlap_text),
                len(overlap_text) + new_para_size,
            )
        else:
-            _, _, prev_end = previous_segment
-            return ([new_paragraph], prev_end, new_para_size)
+            return ([new_paragraph], new_para_size)

    def _chunk_by_sections(
        self,
@ -313,11 +303,11 @@ class ParagraphChunker(IChunker):
        document_title = document.metadata.display_name

        for section_index, section in enumerate(document.sections):
-            # Split this section's content into paragraph-based segments
-            segments = self._split_and_group_paragraphs(section.content, strategy)
+            # Split this section's content into paragraph-based chunks
+            chunk_texts = self._split_and_group_paragraphs(section.content, strategy)

            # Create chunks for this section with title prefix
-            for text, start_char, end_char in segments:
+            for text in chunk_texts:
                # Prepend document title and section title to chunk content
                prefixed_content = f"{document_title}\n{section.title}\n{text}"

@ -325,8 +315,6 @@ class ParagraphChunker(IChunker):
                    document_id=document.id,
                    content=prefixed_content,
                    sequence_number=global_sequence,
-                    start_char=start_char,
-                    end_char=end_char,
                    section_title=section.title,
                    section_index=section_index,
                )
@ -340,16 +328,16 @@ class ParagraphChunker(IChunker):

    def _create_chunks(
        self,
-        segments: List[tuple[str, int, int]],
+        chunk_texts: List[str],
        document_id,
        section_title: Optional[str] = None,
        section_index: Optional[int] = None,
    ) -> List[Chunk]:
        """
-        Create Chunk entities from text segments.
+        Create Chunk entities from text strings.

        Args:
-            segments: List of (text, start_pos, end_pos) tuples
+            chunk_texts: List of chunk text strings
            document_id: ID of parent document
            section_title: Optional section title
            section_index: Optional section index
@ -359,13 +347,11 @@ class ParagraphChunker(IChunker):
        """
        chunks = []

-        for sequence_number, (text, start_char, end_char) in enumerate(segments):
+        for sequence_number, text in enumerate(chunk_texts):
            chunk = Chunk(
                document_id=document_id,
                content=text,
                sequence_number=sequence_number,
-                start_char=start_char,
-                end_char=end_char,
                section_title=section_title,
                section_index=section_index,
            )
--- a/src/core/domain/models.py
+++ b/src/core/domain/models.py
@ -360,8 +360,6 @@ class Chunk(BaseModel):
        document_id: ID of the parent document
        content: Text content of the chunk
        sequence_number: Order of this chunk in the document
-        start_char: Starting character position in original document
-        end_char: Ending character position in original document
        section_title: Title of the section this chunk belongs to
        section_index: Index of the section in document.sections
        metadata: Optional metadata specific to this chunk
@ -370,8 +368,6 @@ class Chunk(BaseModel):
    document_id: UUID = Field(..., description="Parent document ID")
    content: str = Field(..., min_length=1, description="Chunk text content")
    sequence_number: int = Field(..., ge=0, description="Chunk order in document")
-    start_char: int = Field(..., ge=0, description="Start position in document")
-    end_char: int = Field(..., gt=0, description="End position in document")
    section_title: Optional[str] = Field(None, description="Section title")
    section_index: Optional[int] = Field(None, ge=0, description="Section index")
    metadata: Dict[str, str] = Field(default_factory=dict)
@ -380,27 +376,6 @@ class Chunk(BaseModel):
        "frozen": True,  # Chunks are immutable
    }

-    @model_validator(mode='after')
-    def validate_position_consistency(self) -> 'Chunk':
-        """Ensure end position is after start position."""
-        if self.end_char <= self.start_char:
-            raise ValueError(
-                f"end_char ({self.end_char}) must be greater than "
-                f"start_char ({self.start_char})"
-            )
-
-        # Validate content length matches position range
-        content_length = len(self.content)
-        position_range = self.end_char - self.start_char
-
-        if abs(content_length - position_range) > 10:  # Allow small variance
-            raise ValueError(
-                f"Content length ({content_length}) doesn't match "
-                f"position range ({position_range})"
-            )
-
-        return self
-
    def get_length(self) -> int:
        """Get the length of the chunk content."""
        return len(self.content)