From cda128e43819a6dd14cf88c2b8a88c927e1b3be7 Mon Sep 17 00:00:00 2001 From: "m.dabbagh" Date: Sun, 25 Jan 2026 11:03:54 +0330 Subject: [PATCH] one paragraph per chunk in paragraph chunking method --- .../outgoing/chunkers/paragraph_chunker.py | 11 +++++-- .../outgoing/extractors/zip_extractor.py | 2 +- src/core/domain/parsers.py | 33 +++++-------------- 3 files changed, 19 insertions(+), 27 deletions(-) diff --git a/src/adapters/outgoing/chunkers/paragraph_chunker.py b/src/adapters/outgoing/chunkers/paragraph_chunker.py index 1150f89..a938f58 100644 --- a/src/adapters/outgoing/chunkers/paragraph_chunker.py +++ b/src/adapters/outgoing/chunkers/paragraph_chunker.py @@ -297,6 +297,7 @@ class ParagraphChunker(IChunker): Chunk document by processing each section independently. This prevents chunks from spanning across section boundaries. + Each chunk is prefixed with the document title and section title. Args: document: Document with sections @@ -308,15 +309,21 @@ class ParagraphChunker(IChunker): all_chunks = [] global_sequence = 0 + # Get document title from metadata + document_title = document.metadata.display_name + for section_index, section in enumerate(document.sections): # Split this section's content into paragraph-based segments segments = self._split_and_group_paragraphs(section.content, strategy) - # Create chunks for this section + # Create chunks for this section with title prefix for text, start_char, end_char in segments: + # Prepend document title and section title to chunk content + prefixed_content = f"{document_title}\n{section.title}\n{text}" + chunk = Chunk( document_id=document.id, - content=text, + content=prefixed_content, sequence_number=global_sequence, start_char=start_char, end_char=end_char, diff --git a/src/adapters/outgoing/extractors/zip_extractor.py b/src/adapters/outgoing/extractors/zip_extractor.py index c7c4048..93d9913 100644 --- a/src/adapters/outgoing/extractors/zip_extractor.py +++ b/src/adapters/outgoing/extractors/zip_extractor.py @@ -227,7 +227,7 @@ class ZipExtractor(IExtractor): continue # Skip files with 'nohf' in their name - if 'nohf' in filename.lower(): + if 'nohf' not in filename.lower(): logger.debug(f"Skipping 'nohf' file: {filename}") continue diff --git a/src/core/domain/parsers.py b/src/core/domain/parsers.py index 5bafd3b..c1e3ddf 100644 --- a/src/core/domain/parsers.py +++ b/src/core/domain/parsers.py @@ -50,40 +50,25 @@ def parse_markdown(text: str) -> List[DocumentSection]: sections: List[DocumentSection] = [] current_heading: str | None = None current_level: int = 0 - current_content_parts: List[str] = [] - - def finalize_section() -> None: - """Helper to finalize and append the current section.""" - if current_heading is not None or current_content_parts: - content = "".join(current_content_parts).strip() - if content: # Only add sections with actual content - title = current_heading - sections.append( - DocumentSection( - title=title, - level=current_level, - content=content, - ) - ) # Walk through all children of the document for child in doc.children: if isinstance(child, Heading): - # Finalize previous section before starting new one - finalize_section() - - # Start new section + # Update current heading context current_heading = _extract_heading_text(child) current_level = child.level - current_content_parts = [] else: # Render content back to markdown format instead of HTML rendered = md_renderer.render(child).strip() if rendered: - current_content_parts.append(rendered + "\n\n") - - # Finalize the last section - finalize_section() + # Create a separate section for each paragraph/block + sections.append( + DocumentSection( + title=current_heading, + level=current_level, + content=rendered, + ) + ) return sections