one paragraph per chunk in paragraph chunking method

2026-01-25 11:03:54 +03:30 · 2026-01-25 11:03:54 +03:30 · cda128e438
commit cda128e438
parent 8ecbd88498
3 changed files with 19 additions and 27 deletions
--- a/src/adapters/outgoing/chunkers/paragraph_chunker.py
+++ b/src/adapters/outgoing/chunkers/paragraph_chunker.py
@ -297,6 +297,7 @@ class ParagraphChunker(IChunker):
        Chunk document by processing each section independently.
        This prevents chunks from spanning across section boundaries.
        Each chunk is prefixed with the document title and section title.
        Args:
            document: Document with sections
@ -308,15 +309,21 @@ class ParagraphChunker(IChunker):
        all_chunks = []
        global_sequence = 0
        # Get document title from metadata
        document_title = document.metadata.display_name
        for section_index, section in enumerate(document.sections):
            # Split this section's content into paragraph-based segments
            segments = self._split_and_group_paragraphs(section.content, strategy)
-            # Create chunks for this section
+            # Create chunks for this section with title prefix
            for text, start_char, end_char in segments:
                # Prepend document title and section title to chunk content
                prefixed_content = f"{document_title}\n{section.title}\n{text}"
                chunk = Chunk(
                    document_id=document.id,
-                    content=text,
+                    content=prefixed_content,
                    sequence_number=global_sequence,
                    start_char=start_char,
                    end_char=end_char,
--- a/src/adapters/outgoing/extractors/zip_extractor.py
+++ b/src/adapters/outgoing/extractors/zip_extractor.py
@ -227,7 +227,7 @@ class ZipExtractor(IExtractor):
                continue
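            # NOTE(review): the check below skips files whose name does NOT contain
            # 'nohf', while the debug message still says "Skipping 'nohf' file" —
            # confirm which behavior is intended and align the condition or the message.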
-            if 'nohf' in filename.lower():
+            if 'nohf' not in filename.lower():
                logger.debug(f"Skipping 'nohf' file: {filename}")
                continue
--- a/src/core/domain/parsers.py
+++ b/src/core/domain/parsers.py
@ -50,40 +50,25 @@ def parse_markdown(text: str) -> List[DocumentSection]:
    sections: List[DocumentSection] = []
    current_heading: str | None = None
    current_level: int = 0
    current_content_parts: List[str] = []
    def finalize_section() -> None:
        """Append the pending section to ``sections`` when it has text.

        Reads ``current_heading``, ``current_level`` and
        ``current_content_parts`` from the enclosing scope. Nothing is
        appended when the joined, stripped content is empty.
        """
        # No heading seen and no content collected: nothing is pending.
        if current_heading is None and not current_content_parts:
            return

        body = "".join(current_content_parts).strip()
        if not body:
            # Sections without actual content are dropped.
            return

        section = DocumentSection(
            title=current_heading,
            level=current_level,
            content=body,
        )
        sections.append(section)
    # Walk through all children of the document
    for child in doc.children:
        if isinstance(child, Heading):
-            # Finalize previous section before starting new one
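+            # Update current heading context.
+            # NOTE(review): nothing in this hunk appends to current_content_parts
+            # anymore, so the finalize_section() call below presumably adds no
+            # section — confirm whether it can be removed.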
            finalize_section()
            # Start new section
            current_heading = _extract_heading_text(child)
            current_level = child.level
            current_content_parts = []
        else:
            # Render content back to markdown format instead of HTML
            rendered = md_renderer.render(child).strip()
            if rendered:
-                current_content_parts.append(rendered + "\n\n")
+                # Create a separate section for each paragraph/block
-
+                sections.append(
-    # Finalize the last section
+                    DocumentSection(
-    finalize_section()
+                        title=current_heading,
                        level=current_level,
                        content=rendered,
                    )
                )
    return sections