one paragraph per chunk in paragraph chunking method

2026-01-25 11:03:54 +03:30 · 2026-01-25 11:03:54 +03:30 · cda128e438
commit cda128e438
parent 8ecbd88498
3 changed files with 19 additions and 27 deletions
--- a/src/adapters/outgoing/chunkers/paragraph_chunker.py
+++ b/src/adapters/outgoing/chunkers/paragraph_chunker.py
@@ -297,6 +297,7 @@ class ParagraphChunker(IChunker):
        Chunk document by processing each section independently.

        This prevents chunks from spanning across section boundaries.
+        Each chunk is prefixed with the document title and section title.

        Args:
            document: Document with sections
@@ -308,15 +309,21 @@ class ParagraphChunker(IChunker):
        all_chunks = []
        global_sequence = 0

+        # Get document title from metadata
+        document_title = document.metadata.display_name
+
        for section_index, section in enumerate(document.sections):
            # Split this section's content into paragraph-based segments
            segments = self._split_and_group_paragraphs(section.content, strategy)

-            # Create chunks for this section
+            # Create chunks for this section with title prefix
            for text, start_char, end_char in segments:
+                # Prepend document title and section title to chunk content
+                prefixed_content = f"{document_title}\n{section.title}\n{text}"
+
                chunk = Chunk(
                    document_id=document.id,
-                    content=text,
+                    content=prefixed_content,
                    sequence_number=global_sequence,
                    start_char=start_char,
                    end_char=end_char,
--- a/src/adapters/outgoing/extractors/zip_extractor.py
+++ b/src/adapters/outgoing/extractors/zip_extractor.py
@@ -227,7 +227,7 @@ class ZipExtractor(IExtractor):
                continue

            # Skip files with 'nohf' in their name
-            if 'nohf' in filename.lower():
+            if 'nohf' not in filename.lower():
                logger.debug(f"Skipping 'nohf' file: {filename}")
                continue

--- a/src/core/domain/parsers.py
+++ b/src/core/domain/parsers.py
@@ -50,40 +50,25 @@ def parse_markdown(text: str) -> List[DocumentSection]:
    sections: List[DocumentSection] = []
    current_heading: str | None = None
    current_level: int = 0
-    current_content_parts: List[str] = []
-
-    def finalize_section() -> None:
-        """Helper to finalize and append the current section."""
-        if current_heading is not None or current_content_parts:
-            content = "".join(current_content_parts).strip()
-            if content:  # Only add sections with actual content
-                title = current_heading
-                sections.append(
-                    DocumentSection(
-                        title=title,
-                        level=current_level,
-                        content=content,
-                    )
-                )

    # Walk through all children of the document
    for child in doc.children:
        if isinstance(child, Heading):
-            # Finalize previous section before starting new one
-            finalize_section()
-
-            # Start new section
+            # Update current heading context
            current_heading = _extract_heading_text(child)
            current_level = child.level
-            current_content_parts = []
        else:
            # Render content back to markdown format instead of HTML
            rendered = md_renderer.render(child).strip()
            if rendered:
-                current_content_parts.append(rendered + "\n\n")
-
-    # Finalize the last section
-    finalize_section()
+                # Create a separate section for each paragraph/block
+                sections.append(
+                    DocumentSection(
+                        title=current_heading,
+                        level=current_level,
+                        content=rendered,
+                    )
+                )

    return sections