From cda128e43819a6dd14cf88c2b8a88c927e1b3be7 Mon Sep 17 00:00:00 2001
From: "m.dabbagh" <mostafadabbagh76@gmail.com>
Date: Sun, 25 Jan 2026 11:03:54 +0330
Subject: [PATCH] one paragraph per chunk in paragraph chunking method

---
 .../outgoing/chunkers/paragraph_chunker.py    | 11 +++++--
 .../outgoing/extractors/zip_extractor.py      |  2 +-
 src/core/domain/parsers.py                    | 33 +++++--------------
 3 files changed, 19 insertions(+), 27 deletions(-)

diff --git a/src/adapters/outgoing/chunkers/paragraph_chunker.py b/src/adapters/outgoing/chunkers/paragraph_chunker.py
index 1150f89..a938f58 100644
--- a/src/adapters/outgoing/chunkers/paragraph_chunker.py
+++ b/src/adapters/outgoing/chunkers/paragraph_chunker.py
@@ -297,6 +297,7 @@ class ParagraphChunker(IChunker):
         Chunk document by processing each section independently.
 
         This prevents chunks from spanning across section boundaries.
+        Each chunk is prefixed with the document title and section title.
 
         Args:
             document: Document with sections
@@ -308,15 +309,21 @@ class ParagraphChunker(IChunker):
         all_chunks = []
         global_sequence = 0
 
+        # Get document title from metadata
+        document_title = document.metadata.display_name
+
         for section_index, section in enumerate(document.sections):
             # Split this section's content into paragraph-based segments
             segments = self._split_and_group_paragraphs(section.content, strategy)
 
-            # Create chunks for this section
+            # Create chunks for this section with title prefix
             for text, start_char, end_char in segments:
+                # Prepend only the titles that exist so a None title is not rendered as "None"
+                titles = [t for t in (document_title, section.title) if t]
+                prefixed_content = "\n".join([*titles, text])
                 chunk = Chunk(
                     document_id=document.id,
-                    content=text,
+                    content=prefixed_content,
                     sequence_number=global_sequence,
                     start_char=start_char,
                     end_char=end_char,
diff --git a/src/adapters/outgoing/extractors/zip_extractor.py b/src/adapters/outgoing/extractors/zip_extractor.py
index c7c4048..93d9913 100644
--- a/src/adapters/outgoing/extractors/zip_extractor.py
+++ b/src/adapters/outgoing/extractors/zip_extractor.py
@@ -227,7 +227,7 @@ class ZipExtractor(IExtractor):
                 continue
 
             # Skip files with 'nohf' in their name
-            if 'nohf' in filename.lower():
+            if 'nohf' not in filename.lower():
                 logger.debug(f"Skipping 'nohf' file: {filename}")
                 continue
 
diff --git a/src/core/domain/parsers.py b/src/core/domain/parsers.py
index 5bafd3b..c1e3ddf 100644
--- a/src/core/domain/parsers.py
+++ b/src/core/domain/parsers.py
@@ -50,40 +50,25 @@ def parse_markdown(text: str) -> List[DocumentSection]:
     sections: List[DocumentSection] = []
     current_heading: str | None = None
     current_level: int = 0
-    current_content_parts: List[str] = []
-
-    def finalize_section() -> None:
-        """Helper to finalize and append the current section."""
-        if current_heading is not None or current_content_parts:
-            content = "".join(current_content_parts).strip()
-            if content:  # Only add sections with actual content
-                title = current_heading
-                sections.append(
-                    DocumentSection(
-                        title=title,
-                        level=current_level,
-                        content=content,
-                    )
-                )
 
     # Walk through all children of the document
     for child in doc.children:
         if isinstance(child, Heading):
-            # Finalize previous section before starting new one
-            finalize_section()
-
-            # Start new section
+            # Update current heading context
             current_heading = _extract_heading_text(child)
             current_level = child.level
-            current_content_parts = []
         else:
             # Render content back to markdown format instead of HTML
             rendered = md_renderer.render(child).strip()
             if rendered:
-                current_content_parts.append(rendered + "\n\n")
-
-    # Finalize the last section
-    finalize_section()
+                # Create a separate section for each paragraph/block
+                sections.append(
+                    DocumentSection(
+                        title=current_heading,
+                        level=current_level,
+                        content=rendered,
+                    )
+                )
 
     return sections