One paragraph per chunk in the paragraph chunking method
This commit is contained in:
parent
8ecbd88498
commit
cda128e438
@ -297,6 +297,7 @@ class ParagraphChunker(IChunker):
|
|||||||
Chunk document by processing each section independently.
|
Chunk document by processing each section independently.
|
||||||
|
|
||||||
This prevents chunks from spanning across section boundaries.
|
This prevents chunks from spanning across section boundaries.
|
||||||
|
Each chunk is prefixed with the document title and section title.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
document: Document with sections
|
document: Document with sections
|
||||||
@ -308,15 +309,21 @@ class ParagraphChunker(IChunker):
|
|||||||
all_chunks = []
|
all_chunks = []
|
||||||
global_sequence = 0
|
global_sequence = 0
|
||||||
|
|
||||||
|
# Get document title from metadata
|
||||||
|
document_title = document.metadata.display_name
|
||||||
|
|
||||||
for section_index, section in enumerate(document.sections):
|
for section_index, section in enumerate(document.sections):
|
||||||
# Split this section's content into paragraph-based segments
|
# Split this section's content into paragraph-based segments
|
||||||
segments = self._split_and_group_paragraphs(section.content, strategy)
|
segments = self._split_and_group_paragraphs(section.content, strategy)
|
||||||
|
|
||||||
# Create chunks for this section
|
# Create chunks for this section with title prefix
|
||||||
for text, start_char, end_char in segments:
|
for text, start_char, end_char in segments:
|
||||||
|
# Prepend document title and section title to chunk content
|
||||||
|
prefixed_content = f"{document_title}\n{section.title}\n{text}"
|
||||||
|
|
||||||
chunk = Chunk(
|
chunk = Chunk(
|
||||||
document_id=document.id,
|
document_id=document.id,
|
||||||
content=text,
|
content=prefixed_content,
|
||||||
sequence_number=global_sequence,
|
sequence_number=global_sequence,
|
||||||
start_char=start_char,
|
start_char=start_char,
|
||||||
end_char=end_char,
|
end_char=end_char,
|
||||||
|
|||||||
@ -227,7 +227,7 @@ class ZipExtractor(IExtractor):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# Skip files with 'nohf' in their name
|
# Skip files with 'nohf' in their name
|
||||||
if 'nohf' in filename.lower():
|
if 'nohf' not in filename.lower():
|
||||||
logger.debug(f"Skipping 'nohf' file: {filename}")
|
logger.debug(f"Skipping 'nohf' file: {filename}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|||||||
@ -50,40 +50,25 @@ def parse_markdown(text: str) -> List[DocumentSection]:
|
|||||||
sections: List[DocumentSection] = []
|
sections: List[DocumentSection] = []
|
||||||
current_heading: str | None = None
|
current_heading: str | None = None
|
||||||
current_level: int = 0
|
current_level: int = 0
|
||||||
current_content_parts: List[str] = []
|
|
||||||
|
|
||||||
def finalize_section() -> None:
|
|
||||||
"""Helper to finalize and append the current section."""
|
|
||||||
if current_heading is not None or current_content_parts:
|
|
||||||
content = "".join(current_content_parts).strip()
|
|
||||||
if content: # Only add sections with actual content
|
|
||||||
title = current_heading
|
|
||||||
sections.append(
|
|
||||||
DocumentSection(
|
|
||||||
title=title,
|
|
||||||
level=current_level,
|
|
||||||
content=content,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Walk through all children of the document
|
# Walk through all children of the document
|
||||||
for child in doc.children:
|
for child in doc.children:
|
||||||
if isinstance(child, Heading):
|
if isinstance(child, Heading):
|
||||||
# Finalize previous section before starting new one
|
# Update current heading context
|
||||||
finalize_section()
|
|
||||||
|
|
||||||
# Start new section
|
|
||||||
current_heading = _extract_heading_text(child)
|
current_heading = _extract_heading_text(child)
|
||||||
current_level = child.level
|
current_level = child.level
|
||||||
current_content_parts = []
|
|
||||||
else:
|
else:
|
||||||
# Render content back to markdown format instead of HTML
|
# Render content back to markdown format instead of HTML
|
||||||
rendered = md_renderer.render(child).strip()
|
rendered = md_renderer.render(child).strip()
|
||||||
if rendered:
|
if rendered:
|
||||||
current_content_parts.append(rendered + "\n\n")
|
# Create a separate section for each paragraph/block
|
||||||
|
sections.append(
|
||||||
# Finalize the last section
|
DocumentSection(
|
||||||
finalize_section()
|
title=current_heading,
|
||||||
|
level=current_level,
|
||||||
|
content=rendered,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
return sections
|
return sections
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user