Make the domain general and open to adding a crawling system
commit 2c375ce6bd (parent 359026fa98)
@@ -12,7 +12,7 @@ from ....core.domain.exceptions import (
     EmptyContentError,
     ExtractionError,
 )
-from ....core.domain.models import Document, DocumentMetadata
+from ....core.domain.models import Document, DocumentMetadata, SourceType
 from ....core.ports.outgoing.extractor import IExtractor


@@ -209,7 +209,7 @@ class DocxExtractor(IExtractor):

     def _create_metadata(self, file_path: Path) -> DocumentMetadata:
         """
-        Create document metadata from file.
+        Create source-neutral document metadata from file.

         Args:
             file_path: Path to the file
@@ -220,7 +220,8 @@ class DocxExtractor(IExtractor):
         stat = file_path.stat()

         return DocumentMetadata(
-            file_name=file_path.name,
-            file_type=file_path.suffix.lstrip('.').lower(),
-            file_size_bytes=stat.st_size,
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            display_name=file_path.name,
+            size_bytes=stat.st_size,
         )

@@ -12,7 +12,7 @@ from ....core.domain.exceptions import (
     EmptyContentError,
     ExtractionError,
 )
-from ....core.domain.models import Document, DocumentMetadata
+from ....core.domain.models import Document, DocumentMetadata, SourceType
 from ....core.ports.outgoing.extractor import IExtractor


@@ -200,7 +200,7 @@ class PDFExtractor(IExtractor):

     def _create_metadata(self, file_path: Path) -> DocumentMetadata:
         """
-        Create document metadata from file.
+        Create source-neutral document metadata from file.

         Args:
             file_path: Path to the file
@@ -211,7 +211,8 @@ class PDFExtractor(IExtractor):
         stat = file_path.stat()

         return DocumentMetadata(
-            file_name=file_path.name,
-            file_type=file_path.suffix.lstrip('.').lower(),
-            file_size_bytes=stat.st_size,
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            display_name=file_path.name,
+            size_bytes=stat.st_size,
         )

@@ -12,7 +12,7 @@ from ....core.domain.exceptions import (
     EmptyContentError,
     ExtractionError,
 )
-from ....core.domain.models import Document, DocumentMetadata
+from ....core.domain.models import Document, DocumentMetadata, SourceType
 from ....core.ports.outgoing.extractor import IExtractor


@@ -187,7 +187,7 @@ class TxtExtractor(IExtractor):

     def _create_metadata(self, file_path: Path) -> DocumentMetadata:
         """
-        Create document metadata from file.
+        Create source-neutral document metadata from file.

         Args:
             file_path: Path to the file
@@ -198,7 +198,8 @@ class TxtExtractor(IExtractor):
         stat = file_path.stat()

         return DocumentMetadata(
-            file_name=file_path.name,
-            file_type=file_path.suffix.lstrip('.').lower(),
-            file_size_bytes=stat.st_size,
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            display_name=file_path.name,
+            size_bytes=stat.st_size,
         )

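All three extractors (DOCX, PDF, TXT) now emit the same source-neutral DocumentMetadata instead of file-specific fields. A minimal sketch of the new helper's shape, assuming the models are importable from the package's core.domain.models module (the extractors themselves use relative imports):

```python
from pathlib import Path

from core.domain.models import DocumentMetadata, SourceType  # assumed absolute import path


def create_file_metadata(file_path: Path) -> DocumentMetadata:
    """Mirrors the updated _create_metadata helpers in the extractors."""
    stat = file_path.stat()
    return DocumentMetadata(
        source_id=str(file_path.absolute()),  # the path identifies the source
        source_type=SourceType.FILE,          # file-based source
        display_name=file_path.name,          # human-readable name
        size_bytes=stat.st_size,              # size on disk
    )
```
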
@@ -5,6 +5,7 @@ This module contains the domain entities that represent the core business concep
 All models are immutable by default and include comprehensive validation.
 """
 from datetime import datetime
+from enum import Enum
 from pathlib import Path
 from typing import Dict, List, Optional
 from uuid import UUID, uuid4
@@ -12,6 +13,12 @@ from uuid import UUID, uuid4
 from pydantic import BaseModel, Field, field_validator, model_validator


+class SourceType(str, Enum):
+    """Enumeration of supported source types."""
+    FILE = "file"
+    WEB = "web"
+
+
 class SourceFile(BaseModel):
     """
     Represents the raw input file before processing.
@@ -58,6 +65,48 @@ class SourceFile(BaseModel):
         return self.path.stem


+class WebPageSource(BaseModel):
+    """
+    Represents a web page source for document extraction.
+
+    This model encapsulates URL information about the document source.
+    Flow: WebPageSource -> Extraction -> Document
+
+    Attributes:
+        url: URL of the web page
+        display_name: Human-readable name (e.g., 'about_us.html')
+        content_length: Optional content length in bytes
+    """
+    url: str = Field(..., min_length=1, description="Web page URL")
+    display_name: str = Field(..., min_length=1, description="Display name")
+    content_length: Optional[int] = Field(None, ge=0, description="Content length")
+
+    model_config = {
+        "frozen": True,  # WebPageSource is immutable
+    }
+
+    @field_validator('url')
+    @classmethod
+    def validate_url(cls, value: str) -> str:
+        """Validate URL format."""
+        value = value.strip()
+        if not (value.startswith('http://') or value.startswith('https://')):
+            raise ValueError(f"URL must start with http:// or https://: {value}")
+        return value
+
+    @field_validator('display_name')
+    @classmethod
+    def normalize_display_name(cls, value: str) -> str:
+        """Normalize display name."""
+        return value.strip()
+
+    def get_domain(self) -> str:
+        """Extract domain from URL."""
+        from urllib.parse import urlparse
+        parsed = urlparse(self.url)
+        return parsed.netloc
+
+
 class DocumentSection(BaseModel):
     """
     Represents a structured section of a Markdown document.
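
A short usage sketch of the new WebPageSource model; the import path is assumed and the URL is a made-up example:

```python
from core.domain.models import WebPageSource  # assumed absolute import path

page = WebPageSource(
    url="https://example.com/about_us.html",  # hypothetical URL
    display_name="about_us.html",
    content_length=2048,
)

print(page.get_domain())  # -> "example.com"

# The url validator rejects anything that is not http:// or https://,
# e.g. WebPageSource(url="ftp://example.com", display_name="x") raises ValueError.
```
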
@@ -95,30 +144,36 @@ class DocumentSection(BaseModel):

 class DocumentMetadata(BaseModel):
     """
-    Metadata associated with a document.
+    Source-neutral metadata for documents.
+
+    This metadata works for both file-based and web-based sources,
+    enabling a unified processing pipeline.

     Attributes:
-        file_name: Original filename of the document
-        file_type: Type/extension of the file (e.g., 'pdf', 'docx')
-        file_size_bytes: Size of the file in bytes
-        created_at: Timestamp when document was created
+        source_id: Path or URL identifying the source
+        source_type: Type of source (FILE or WEB)
+        display_name: Human-readable name (e.g., 'manual.pdf', 'about_us.html')
+        size_bytes: Size in bytes (file size or content length)
+        created_at: Timestamp when metadata was created
         author: Optional author information
-        page_count: Optional number of pages in document
-        custom_fields: Additional metadata fields
+        extra_metadata: Additional source-specific metadata
     """
-    file_name: str = Field(..., min_length=1, description="Original filename")
-    file_type: str = Field(..., min_length=1, description="File extension")
-    file_size_bytes: int = Field(..., ge=0, description="File size in bytes")
+    source_id: str = Field(..., min_length=1, description="Path or URL")
+    source_type: SourceType = Field(..., description="Source type enum")
+    display_name: str = Field(..., min_length=1, description="Display name")
+    size_bytes: int = Field(..., ge=0, description="Size in bytes")
     created_at: datetime = Field(default_factory=datetime.utcnow)
-    author: Optional[str] = Field(None, description="Document author")
-    page_count: Optional[int] = Field(None, ge=1, description="Number of pages")
-    custom_fields: Dict[str, str] = Field(default_factory=dict)
+    author: Optional[str] = Field(None, description="Author information")
+    extra_metadata: Dict[str, str] = Field(
+        default_factory=dict,
+        description="Additional metadata"
+    )

-    @field_validator('file_type')
+    @field_validator('display_name')
     @classmethod
-    def validate_file_type(cls, value: str) -> str:
-        """Ensure file type is lowercase and stripped."""
-        return value.lower().strip()
+    def normalize_display_name(cls, value: str) -> str:
+        """Normalize display name."""
+        return value.strip()

     def get_summary(self) -> str:
         """
@@ -128,28 +183,33 @@ class DocumentMetadata(BaseModel):
         Formatted string containing key metadata information
         """
         summary_parts = [
-            f"File: {self.file_name}",
-            f"Type: {self.file_type}",
-            f"Size: {self._format_file_size()}",
+            f"Source: {self.display_name}",
+            f"Type: {self.source_type.value}",
+            f"Size: {self._format_size()}",
         ]

         if self.author:
             summary_parts.append(f"Author: {self.author}")

-        if self.page_count:
-            summary_parts.append(f"Pages: {self.page_count}")
-
         return " | ".join(summary_parts)

-    def _format_file_size(self) -> str:
-        """Format file size in human-readable format."""
-        size = self.file_size_bytes
+    def _format_size(self) -> str:
+        """Format size in human-readable format."""
+        size = self.size_bytes
         for unit in ['B', 'KB', 'MB', 'GB']:
             if size < 1024.0:
                 return f"{size:.2f} {unit}"
             size /= 1024.0
         return f"{size:.2f} TB"

+    def is_file_source(self) -> bool:
+        """Check if this is a file-based source."""
+        return self.source_type == SourceType.FILE
+
+    def is_web_source(self) -> bool:
+        """Check if this is a web-based source."""
+        return self.source_type == SourceType.WEB
+

 class Document(BaseModel):
     """
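
With the rename, one metadata shape covers both source kinds. A hedged sketch (import path assumed, values illustrative):

```python
from core.domain.models import DocumentMetadata, SourceType  # assumed absolute import path

file_meta = DocumentMetadata(
    source_id="/data/manual.pdf",
    source_type=SourceType.FILE,
    display_name="manual.pdf",
    size_bytes=1_048_576,
)

web_meta = DocumentMetadata(
    source_id="https://example.com/about_us.html",
    source_type=SourceType.WEB,
    display_name="about_us.html",
    size_bytes=2_048,
    extra_metadata={"crawl_depth": "1"},  # source-specific extras
)

print(file_meta.get_summary())   # "Source: manual.pdf | Type: file | Size: 1.00 MB"
print(web_meta.is_web_source())  # True
```
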
@@ -281,6 +341,8 @@ class Chunk(BaseModel):
     """
     Represents a chunk of text extracted from a document.

+    Enhanced to track section membership for precision chunking.
+
     Attributes:
         id: Unique identifier for the chunk
         document_id: ID of the parent document
@@ -288,6 +350,8 @@ class Chunk(BaseModel):
         sequence_number: Order of this chunk in the document
         start_char: Starting character position in original document
         end_char: Ending character position in original document
+        section_title: Title of the section this chunk belongs to
+        section_index: Index of the section in document.sections
         metadata: Optional metadata specific to this chunk
     """
     id: UUID = Field(default_factory=uuid4, description="Unique chunk ID")
@@ -296,6 +360,8 @@ class Chunk(BaseModel):
     sequence_number: int = Field(..., ge=0, description="Chunk order in document")
     start_char: int = Field(..., ge=0, description="Start position in document")
     end_char: int = Field(..., gt=0, description="End position in document")
+    section_title: Optional[str] = Field(None, description="Section title")
+    section_index: Optional[int] = Field(None, ge=0, description="Section index")
     metadata: Dict[str, str] = Field(default_factory=dict)

     model_config = {
@@ -342,6 +408,21 @@ class Chunk(BaseModel):
         search_text = text if case_sensitive else text.lower()
         return search_text in content

+    def belongs_to_section(self) -> bool:
+        """Check if this chunk belongs to a specific section."""
+        return self.section_title is not None and self.section_index is not None
+
+    def get_section_context(self) -> str:
+        """
+        Get a string describing the section context.
+
+        Returns:
+            Section context description or 'No section'
+        """
+        if self.belongs_to_section():
+            return f"Section {self.section_index}: {self.section_title}"
+        return "No section"
+

 class ChunkingStrategy(BaseModel):
     """

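The new section fields let downstream consumers report where a chunk came from. A hedged construction sketch; only the section-related fields appear in this diff, so the remaining Chunk fields (notably the chunk text) are assumptions:

```python
from uuid import uuid4

from core.domain.models import Chunk  # assumed absolute import path

chunk = Chunk(
    document_id=uuid4(),
    content="Install the package with pip.",  # assumed field name for the chunk text
    sequence_number=0,
    start_char=0,
    end_char=30,
    section_title="Installation",
    section_index=1,
)

print(chunk.belongs_to_section())   # True
print(chunk.get_section_context())  # "Section 1: Installation"
```
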
@@ -1,14 +1,13 @@
 """
 Outgoing Port - Text Chunker Interface.

-This defines the contract for chunking text into smaller pieces.
+This defines the contract for chunking documents into smaller pieces.
 Different strategies can be implemented as adapters.
 """
 from abc import ABC, abstractmethod
 from typing import List
-from uuid import UUID

-from ...domain.models import Chunk, ChunkingStrategy
+from ...domain.models import Chunk, ChunkingStrategy, Document


 class IChunker(ABC):
@@ -16,26 +15,26 @@ class IChunker(ABC):
     Interface for text chunking strategies.

     Implementations of this interface provide different strategies
-    for splitting text into manageable chunks.
+    for splitting documents into manageable chunks with section awareness.
     """

     @abstractmethod
     def chunk(
         self,
-        text: str,
-        document_id: UUID,
+        document: Document,
         strategy: ChunkingStrategy,
     ) -> List[Chunk]:
         """
-        Split text into chunks according to a strategy.
+        Split document into chunks according to a strategy.
+
+        Chunkers can utilize document.sections for section-aware chunking.

         Args:
-            text: Text content to chunk
-            document_id: ID of the parent document
+            document: Full Document entity with raw_markdown and sections
             strategy: Chunking strategy configuration

         Returns:
-            List of Chunk entities
+            List of Chunk entities with section metadata

         Raises:
             ChunkingError: If chunking fails

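A minimal adapter sketch against the new IChunker signature: one chunk per parsed section. Document.sections and document.id appear elsewhere in this diff; the section attribute names (title, content) and the Chunk text field are assumptions:

```python
from typing import List

from core.domain.models import Chunk, ChunkingStrategy, Document  # assumed absolute import path
from core.ports.outgoing.chunker import IChunker                  # assumed absolute import path


class SectionChunker(IChunker):
    """Hypothetical strategy: emit one chunk per DocumentSection."""

    def chunk(self, document: Document, strategy: ChunkingStrategy) -> List[Chunk]:
        chunks: List[Chunk] = []
        cursor = 0
        for index, section in enumerate(document.sections):
            text = section.content  # assumed attribute name
            chunks.append(
                Chunk(
                    document_id=document.id,
                    content=text,  # assumed field name for the chunk text
                    sequence_number=index,
                    start_char=cursor,
                    end_char=cursor + len(text),
                    section_title=section.title,  # assumed attribute name
                    section_index=index,
                )
            )
            cursor += len(text)
        return chunks
```
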
@@ -5,9 +5,8 @@ This defines the contract for managing chunking strategies.
 """
 from abc import ABC, abstractmethod
 from typing import List
-from uuid import UUID

-from ...domain.models import Chunk, ChunkingStrategy
+from ...domain.models import Chunk, ChunkingStrategy, Document
 from .chunker import IChunker


@@ -22,23 +21,21 @@ class IChunkingContext(ABC):
     @abstractmethod
     def execute_chunking(
         self,
-        text: str,
-        document_id: UUID,
+        document: Document,
         strategy: ChunkingStrategy,
     ) -> List[Chunk]:
         """
         Execute chunking using the specified strategy.

-        This method is stateless and thread-safe. It selects the appropriate
-        chunker based on the strategy configuration and executes chunking.
+        This method is stateless and thread-safe. It accepts the full
+        Document object (with sections) to enable section-aware chunking.

         Args:
-            text: Text to chunk
-            document_id: ID of parent document
+            document: Full Document entity with raw_markdown and sections
             strategy: Chunking strategy configuration (includes strategy_name)

         Returns:
-            List of chunks
+            List of chunks with section metadata

         Raises:
             ChunkingError: If strategy is not registered or chunking fails

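Call sites change accordingly: the whole Document replaces the old text plus document_id pair. A sketch, assuming a concrete IChunkingContext and a parsed Document are already in scope:

```python
# Before this commit:
# chunks = chunking_context.execute_chunking(
#     text=document.content,
#     document_id=document.id,
#     strategy=chunking_strategy,
# )

# After this commit:
chunks = chunking_context.execute_chunking(
    document=document,           # full Document with raw_markdown and sections
    strategy=chunking_strategy,  # includes strategy_name
)
```
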
@@ -1,8 +1,8 @@
 """
 Outgoing Port - Text Extractor Interface.

-This defines the contract for extracting text from documents.
-Different adapters can implement this for various file types.
+This defines the contract for extracting content from documents.
+Different adapters can implement this for various file types and sources.
 """
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -16,7 +16,7 @@ class IExtractor(ABC):
     Interface for text extraction from documents.

     Implementations of this interface handle specific file formats
-    (PDF, DOCX, TXT, etc.) and adapt external libraries to the domain.
+    (PDF, DOCX, TXT, etc.) or web sources and return Document entities.
     """

     @abstractmethod
@@ -24,11 +24,14 @@ class IExtractor(ABC):
         """
         Extract text and metadata from a document file.

+        Extractors create Document entities with raw_markdown and metadata.
+        Sections are parsed later in the pipeline.
+
         Args:
             file_path: Path to the document file

         Returns:
-            Document entity with extracted content and metadata
+            Document entity with raw_markdown and metadata populated

         Raises:
             ExtractionError: If extraction fails

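This is the seam the commit title points at: a crawling adapter only has to hand back the same Document shape the file extractors produce. A hypothetical sketch, not part of this commit; the Document constructor fields (raw_markdown, metadata) are taken from the docstrings in this diff and everything else is an assumption:

```python
from core.domain.models import (  # assumed absolute import path
    Document,
    DocumentMetadata,
    SourceType,
    WebPageSource,
)


def document_from_web_page(source: WebPageSource, markdown: str) -> Document:
    """Hypothetical crawler-side helper: wrap fetched Markdown in a Document."""
    metadata = DocumentMetadata(
        source_id=source.url,
        source_type=SourceType.WEB,
        display_name=source.display_name,
        size_bytes=source.content_length or len(markdown.encode("utf-8")),
    )
    return Document(raw_markdown=markdown, metadata=metadata)  # assumed constructor fields
```
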
@@ -59,21 +59,21 @@ class DocumentProcessorService(ITextProcessor):
         chunking_strategy: ChunkingStrategy,
     ) -> Document:
         """
-        Process a document by extracting, parsing, and storing it.
+        Process a document using the stateless pipeline.

-        New Pragmatic Pipeline:
-        1. Extract: Get raw Markdown from SourceFile using extractor
-        2. Parse: Use parse_markdown to create structured sections
-        3. Assemble: Create rich Document with raw_markdown + sections
-        4. Persist: Save to repository
-        5. Finalize: Mark as processed
+        Pipeline Order:
+        1. Extract Document with raw_markdown and metadata (via Adapter)
+        2. Parse Markdown into DocumentSection objects
+        3. Update Document with sections
+        4. Validate and persist Document
+        5. Mark as processed

         Args:
             file_path: Path to the document file
             chunking_strategy: Strategy configuration (for metadata)

         Returns:
-            Processed Document entity with structured sections
+            Fully processed Document entity

         Raises:
             ExtractionError: If text extraction fails
@@ -83,15 +83,14 @@ class DocumentProcessorService(ITextProcessor):
         try:
             logger.info(f"Processing document: {file_path}")

-            # Step 1: Extract raw Markdown from SourceFile
-            source_file = self._create_source_file(file_path)
-            document = self._extract_from_source(source_file)
+            # Step 1: Extract Document with raw_markdown and metadata
+            document = self._extract_document(file_path)

             # Step 2: Parse Markdown into structured sections
             sections = parse_markdown(document.raw_markdown)
             logger.debug(f"Parsed {len(sections)} sections from document")

-            # Step 3: Assemble rich Document model
+            # Step 3: Update Document with sections
             document = document.model_copy(update={"sections": sections})

             # Step 4: Validate document content
@@ -100,7 +99,7 @@ class DocumentProcessorService(ITextProcessor):
             # Step 5: Persist to repository
             saved_document = self._repository.save(document)

-            # Step 6: Finalize - mark as processed
+            # Step 6: Mark as processed
             saved_document.mark_as_processed()
             self._repository.save(saved_document)

@@ -128,17 +127,17 @@ class DocumentProcessorService(ITextProcessor):
         Extract text from document and split into chunks.

         Pipeline:
-        1. Extract raw Markdown from SourceFile
+        1. Extract Document with raw_markdown and metadata
         2. Parse into structured sections
-        3. Apply chunking strategy to raw content
-        4. Return chunks
+        3. Update Document with sections
+        4. Apply chunking strategy with section awareness

         Args:
             file_path: Path to the document file
             chunking_strategy: Strategy configuration for chunking

         Returns:
-            List of text chunks
+            List of chunks with section metadata

         Raises:
             ExtractionError: If text extraction fails
@@ -147,15 +146,16 @@ class DocumentProcessorService(ITextProcessor):
         try:
             logger.info(f"Extracting and chunking: {file_path}")

-            # Extract from source
-            source_file = self._create_source_file(file_path)
-            document = self._extract_from_source(source_file)
+            # Extract Document
+            document = self._extract_document(file_path)

             # Parse sections
             sections = parse_markdown(document.raw_markdown)
+
+            # Update Document with sections
             document = document.model_copy(update={"sections": sections})

-            # Chunk using strategy
+            # Chunk using strategy (section-aware)
             chunks = self._chunk_document(document, chunking_strategy)

             logger.info(f"Created {len(chunks)} chunks from document")
@@ -223,43 +223,24 @@ class DocumentProcessorService(ITextProcessor):

         return self._repository.delete(document_id)

-    def _create_source_file(self, file_path: Path) -> SourceFile:
+    def _extract_document(self, file_path: Path) -> Document:
         """
-        Create a SourceFile model from a file path.
+        Extract Document using appropriate extractor.
+
+        Extractors create Document entities with raw_markdown and metadata.
+        Sections will be parsed later in the pipeline.

         Args:
-            file_path: Path to the source file
+            file_path: Path to document file

         Returns:
-            SourceFile entity
-
-        Raises:
-            ValueError: If file doesn't exist or is invalid
-        """
-        if not file_path.exists():
-            raise ValueError(f"File does not exist: {file_path}")
-
-        return SourceFile(
-            path=file_path,
-            extension=file_path.suffix.lstrip('.'),
-            size_bytes=file_path.stat().st_size,
-        )
-
-    def _extract_from_source(self, source_file: SourceFile) -> Document:
-        """
-        Extract raw Markdown from SourceFile using appropriate extractor.
-
-        Args:
-            source_file: Source file to extract from
-
-        Returns:
-            Document entity with raw_markdown populated
+            Document entity with raw_markdown and metadata (sections empty)

         Raises:
             ExtractionError: If extraction fails
         """
-        extractor = self._extractor_factory.create_extractor(source_file.path)
-        return extractor.extract(source_file.path)
+        extractor = self._extractor_factory.create_extractor(file_path)
+        return extractor.extract(file_path)

     def _chunk_document(
         self,
@@ -267,20 +248,20 @@ class DocumentProcessorService(ITextProcessor):
         strategy: ChunkingStrategy,
     ) -> List[Chunk]:
         """
-        Chunk document using specified strategy.
+        Chunk document using specified strategy with section awareness.

         This method is thread-safe as it delegates to a stateless
-        chunking context that selects the strategy based on configuration.
+        chunking context. The full Document (with sections) is passed
+        to enable section-aware chunking.

         Args:
-            document: Document to chunk
+            document: Full Document entity with sections
             strategy: Chunking strategy configuration

         Returns:
-            List of chunks
+            List of chunks with section metadata
         """
         return self._chunking_context.execute_chunking(
-            text=document.content,
-            document_id=document.id,
+            document=document,
             strategy=strategy,
         )

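Taken together, the service's new flow can be recapped against the ports. A hedged sketch; parse_markdown is the helper the service already uses, but its import path and the exact port wiring are assumptions:

```python
from pathlib import Path
from typing import List

from core.domain.models import Chunk, ChunkingStrategy     # assumed absolute import path
from core.services.markdown_parsing import parse_markdown  # assumed module for parse_markdown


def extract_and_chunk(extractor, chunking_context, file_path: Path,
                      strategy: ChunkingStrategy) -> List[Chunk]:
    """Hedged recap of the new pipeline, expressed against the outgoing ports."""
    document = extractor.extract(file_path)                         # 1. extract raw_markdown + metadata
    sections = parse_markdown(document.raw_markdown)                # 2. parse structured sections
    document = document.model_copy(update={"sections": sections})   # 3. attach sections (immutable update)
    return chunking_context.execute_chunking(                       # 4. section-aware chunking
        document=document,
        strategy=strategy,
    )
```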