Emit one paragraph per chunk in the paragraph chunking method

This commit is contained in:
m.dabbagh 2026-01-25 11:03:54 +03:30
parent 8ecbd88498
commit cda128e438
3 changed files with 19 additions and 27 deletions

View File

@@ -297,6 +297,7 @@ class ParagraphChunker(IChunker):
Chunk document by processing each section independently.
This prevents chunks from spanning across section boundaries.
Each chunk is prefixed with the document title and section title.
Args:
document: Document with sections
@@ -308,15 +309,21 @@ class ParagraphChunker(IChunker):
all_chunks = []
global_sequence = 0
# Get document title from metadata
document_title = document.metadata.display_name
for section_index, section in enumerate(document.sections):
# Split this section's content into paragraph-based segments
segments = self._split_and_group_paragraphs(section.content, strategy)
# Create chunks for this section
# Create chunks for this section with title prefix
for text, start_char, end_char in segments:
# Prepend document title and section title to chunk content
prefixed_content = f"{document_title}\n{section.title}\n{text}"
chunk = Chunk(
document_id=document.id,
content=text,
content=prefixed_content,
sequence_number=global_sequence,
start_char=start_char,
end_char=end_char,

View File

@@ -227,7 +227,7 @@ class ZipExtractor(IExtractor):
continue
# Skip files with 'nohf' in their name
if 'nohf' in filename.lower():
if 'nohf' not in filename.lower():
logger.debug(f"Skipping 'nohf' file: {filename}")
continue

View File

@@ -50,40 +50,25 @@ def parse_markdown(text: str) -> List[DocumentSection]:
sections: List[DocumentSection] = []
current_heading: str | None = None
current_level: int = 0
current_content_parts: List[str] = []
def finalize_section() -> None:
"""Helper to finalize and append the current section."""
if current_heading is not None or current_content_parts:
content = "".join(current_content_parts).strip()
if content: # Only add sections with actual content
title = current_heading
sections.append(
DocumentSection(
title=title,
level=current_level,
content=content,
)
)
# Walk through all children of the document
for child in doc.children:
if isinstance(child, Heading):
# Finalize previous section before starting new one
finalize_section()
# Start new section
# Update current heading context
current_heading = _extract_heading_text(child)
current_level = child.level
current_content_parts = []
else:
# Render content back to markdown format instead of HTML
rendered = md_renderer.render(child).strip()
if rendered:
current_content_parts.append(rendered + "\n\n")
# Finalize the last section
finalize_section()
# Create a separate section for each paragraph/block
sections.append(
DocumentSection(
title=current_heading,
level=current_level,
content=rendered,
)
)
return sections