Produce one paragraph per chunk in the paragraph chunking method

This commit is contained in:
m.dabbagh 2026-01-25 11:03:54 +03:30
parent 8ecbd88498
commit cda128e438
3 changed files with 19 additions and 27 deletions

View File

@ -297,6 +297,7 @@ class ParagraphChunker(IChunker):
Chunk document by processing each section independently. Chunk document by processing each section independently.
This prevents chunks from spanning across section boundaries. This prevents chunks from spanning across section boundaries.
Each chunk is prefixed with the document title and section title.
Args: Args:
document: Document with sections document: Document with sections
@ -308,15 +309,21 @@ class ParagraphChunker(IChunker):
all_chunks = [] all_chunks = []
global_sequence = 0 global_sequence = 0
# Get document title from metadata
document_title = document.metadata.display_name
for section_index, section in enumerate(document.sections): for section_index, section in enumerate(document.sections):
# Split this section's content into paragraph-based segments # Split this section's content into paragraph-based segments
segments = self._split_and_group_paragraphs(section.content, strategy) segments = self._split_and_group_paragraphs(section.content, strategy)
# Create chunks for this section # Create chunks for this section with title prefix
for text, start_char, end_char in segments: for text, start_char, end_char in segments:
# Prepend document title and section title to chunk content
prefixed_content = f"{document_title}\n{section.title}\n{text}"
chunk = Chunk( chunk = Chunk(
document_id=document.id, document_id=document.id,
content=text, content=prefixed_content,
sequence_number=global_sequence, sequence_number=global_sequence,
start_char=start_char, start_char=start_char,
end_char=end_char, end_char=end_char,

View File

@ -227,7 +227,7 @@ class ZipExtractor(IExtractor):
continue continue
# Skip files with 'nohf' in their name # Skip files with 'nohf' in their name
if 'nohf' in filename.lower(): if 'nohf' not in filename.lower():
logger.debug(f"Skipping 'nohf' file: {filename}") logger.debug(f"Skipping 'nohf' file: {filename}")
continue continue

View File

@ -50,40 +50,25 @@ def parse_markdown(text: str) -> List[DocumentSection]:
sections: List[DocumentSection] = [] sections: List[DocumentSection] = []
current_heading: str | None = None current_heading: str | None = None
current_level: int = 0 current_level: int = 0
current_content_parts: List[str] = []
def finalize_section() -> None:
"""Helper to finalize and append the current section."""
if current_heading is not None or current_content_parts:
content = "".join(current_content_parts).strip()
if content: # Only add sections with actual content
title = current_heading
sections.append(
DocumentSection(
title=title,
level=current_level,
content=content,
)
)
# Walk through all children of the document # Walk through all children of the document
for child in doc.children: for child in doc.children:
if isinstance(child, Heading): if isinstance(child, Heading):
# Finalize previous section before starting new one # Update current heading context
finalize_section()
# Start new section
current_heading = _extract_heading_text(child) current_heading = _extract_heading_text(child)
current_level = child.level current_level = child.level
current_content_parts = []
else: else:
# Render content back to markdown format instead of HTML # Render content back to markdown format instead of HTML
rendered = md_renderer.render(child).strip() rendered = md_renderer.render(child).strip()
if rendered: if rendered:
current_content_parts.append(rendered + "\n\n") # Create a separate section for each paragraph/block
sections.append(
# Finalize the last section DocumentSection(
finalize_section() title=current_heading,
level=current_level,
content=rendered,
)
)
return sections return sections