make the domain general and open to add crawling system

This commit is contained in:
m.dabbagh 2026-01-08 04:57:35 +03:30
parent 359026fa98
commit 2c375ce6bd
8 changed files with 183 additions and 119 deletions

View File

@@ -12,7 +12,7 @@ from ....core.domain.exceptions import (
     EmptyContentError,
     ExtractionError,
 )
-from ....core.domain.models import Document, DocumentMetadata
+from ....core.domain.models import Document, DocumentMetadata, SourceType
 from ....core.ports.outgoing.extractor import IExtractor
@@ -209,7 +209,7 @@ class DocxExtractor(IExtractor):
     def _create_metadata(self, file_path: Path) -> DocumentMetadata:
         """
-        Create document metadata from file.
+        Create source-neutral document metadata from file.

         Args:
             file_path: Path to the file
@@ -220,7 +220,8 @@ class DocxExtractor(IExtractor):
         stat = file_path.stat()

         return DocumentMetadata(
-            file_name=file_path.name,
-            file_type=file_path.suffix.lstrip('.').lower(),
-            file_size_bytes=stat.st_size,
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            display_name=file_path.name,
+            size_bytes=stat.st_size,
         )

View File

@@ -12,7 +12,7 @@ from ....core.domain.exceptions import (
     EmptyContentError,
     ExtractionError,
 )
-from ....core.domain.models import Document, DocumentMetadata
+from ....core.domain.models import Document, DocumentMetadata, SourceType
 from ....core.ports.outgoing.extractor import IExtractor
@@ -200,7 +200,7 @@ class PDFExtractor(IExtractor):
     def _create_metadata(self, file_path: Path) -> DocumentMetadata:
         """
-        Create document metadata from file.
+        Create source-neutral document metadata from file.

         Args:
             file_path: Path to the file
@@ -211,7 +211,8 @@ class PDFExtractor(IExtractor):
         stat = file_path.stat()

         return DocumentMetadata(
-            file_name=file_path.name,
-            file_type=file_path.suffix.lstrip('.').lower(),
-            file_size_bytes=stat.st_size,
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            display_name=file_path.name,
+            size_bytes=stat.st_size,
         )

View File

@@ -12,7 +12,7 @@ from ....core.domain.exceptions import (
     EmptyContentError,
     ExtractionError,
 )
-from ....core.domain.models import Document, DocumentMetadata
+from ....core.domain.models import Document, DocumentMetadata, SourceType
 from ....core.ports.outgoing.extractor import IExtractor
@@ -187,7 +187,7 @@ class TxtExtractor(IExtractor):
     def _create_metadata(self, file_path: Path) -> DocumentMetadata:
         """
-        Create document metadata from file.
+        Create source-neutral document metadata from file.

         Args:
             file_path: Path to the file
@@ -198,7 +198,8 @@ class TxtExtractor(IExtractor):
         stat = file_path.stat()

         return DocumentMetadata(
-            file_name=file_path.name,
-            file_type=file_path.suffix.lstrip('.').lower(),
-            file_size_bytes=stat.st_size,
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            display_name=file_path.name,
+            size_bytes=stat.st_size,
         )
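All three extractors above now build the same source-neutral DocumentMetadata. For comparison, a crawling adapter added later could fill the same fields from a fetched page. The sketch below is not part of this commit; the import path, helper name, and the content-type entry in extra_metadata are assumptions, only the DocumentMetadata and SourceType fields come from this diff.

# Hypothetical sketch, not from this commit: the web-side counterpart of
# _create_metadata above. Import path and helper name are assumed.
from core.domain.models import DocumentMetadata, SourceType

def create_web_metadata(page_url: str, page_html: str) -> DocumentMetadata:
    """Map a fetched page onto the generalized DocumentMetadata."""
    return DocumentMetadata(
        source_id=page_url,                            # URL instead of an absolute file path
        source_type=SourceType.WEB,                    # the extractors above use SourceType.FILE
        display_name=page_url.rstrip('/').rsplit('/', 1)[-1] or page_url,
        size_bytes=len(page_html.encode("utf-8")),
        extra_metadata={"content_type": "text/html"},  # assumed use of extra_metadata
    )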

View File

@@ -5,6 +5,7 @@ This module contains the domain entities that represent the core business concep
 All models are immutable by default and include comprehensive validation.
 """
 from datetime import datetime
+from enum import Enum
 from pathlib import Path
 from typing import Dict, List, Optional
 from uuid import UUID, uuid4
@@ -12,6 +13,12 @@ from uuid import UUID, uuid4
 from pydantic import BaseModel, Field, field_validator, model_validator


+class SourceType(str, Enum):
+    """Enumeration of supported source types."""
+    FILE = "file"
+    WEB = "web"
+
+
 class SourceFile(BaseModel):
     """
     Represents the raw input file before processing.
@@ -58,6 +65,48 @@ class SourceFile(BaseModel):
         return self.path.stem
+class WebPageSource(BaseModel):
+    """
+    Represents a web page source for document extraction.
+
+    This model encapsulates URL information about the document source.
+    Flow: WebPageSource -> Extraction -> Document
+
+    Attributes:
+        url: URL of the web page
+        display_name: Human-readable name (e.g., 'about_us.html')
+        content_length: Optional content length in bytes
+    """
+    url: str = Field(..., min_length=1, description="Web page URL")
+    display_name: str = Field(..., min_length=1, description="Display name")
+    content_length: Optional[int] = Field(None, ge=0, description="Content length")
+
+    model_config = {
+        "frozen": True,  # WebPageSource is immutable
+    }
+
+    @field_validator('url')
+    @classmethod
+    def validate_url(cls, value: str) -> str:
+        """Validate URL format."""
+        value = value.strip()
+        if not (value.startswith('http://') or value.startswith('https://')):
+            raise ValueError(f"URL must start with http:// or https://: {value}")
+        return value
+
+    @field_validator('display_name')
+    @classmethod
+    def normalize_display_name(cls, value: str) -> str:
+        """Normalize display name."""
+        return value.strip()
+
+    def get_domain(self) -> str:
+        """Extract domain from URL."""
+        from urllib.parse import urlparse
+        parsed = urlparse(self.url)
+        return parsed.netloc
+
+
 class DocumentSection(BaseModel):
     """
     Represents a structured section of a Markdown document.
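A short usage sketch of the WebPageSource model added above; the import path is an assumption, everything else follows the fields and validators in this commit.

from core.domain.models import WebPageSource  # assumed import path

page = WebPageSource(
    url="https://example.com/docs/about_us.html",
    display_name="about_us.html",
    content_length=20480,
)
print(page.get_domain())  # "example.com"

# Non-HTTP(S) schemes fail the url validator, so this raises a validation error:
# WebPageSource(url="ftp://example.com/readme", display_name="readme")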
@@ -95,30 +144,36 @@ class DocumentSection(BaseModel):
 class DocumentMetadata(BaseModel):
     """
-    Metadata associated with a document.
+    Source-neutral metadata for documents.
+
+    This metadata works for both file-based and web-based sources,
+    enabling a unified processing pipeline.

     Attributes:
-        file_name: Original filename of the document
-        file_type: Type/extension of the file (e.g., 'pdf', 'docx')
-        file_size_bytes: Size of the file in bytes
-        created_at: Timestamp when document was created
+        source_id: Path or URL identifying the source
+        source_type: Type of source (FILE or WEB)
+        display_name: Human-readable name (e.g., 'manual.pdf', 'about_us.html')
+        size_bytes: Size in bytes (file size or content length)
+        created_at: Timestamp when metadata was created
         author: Optional author information
-        page_count: Optional number of pages in document
-        custom_fields: Additional metadata fields
+        extra_metadata: Additional source-specific metadata
     """
-    file_name: str = Field(..., min_length=1, description="Original filename")
-    file_type: str = Field(..., min_length=1, description="File extension")
-    file_size_bytes: int = Field(..., ge=0, description="File size in bytes")
+    source_id: str = Field(..., min_length=1, description="Path or URL")
+    source_type: SourceType = Field(..., description="Source type enum")
+    display_name: str = Field(..., min_length=1, description="Display name")
+    size_bytes: int = Field(..., ge=0, description="Size in bytes")
     created_at: datetime = Field(default_factory=datetime.utcnow)
-    author: Optional[str] = Field(None, description="Document author")
-    page_count: Optional[int] = Field(None, ge=1, description="Number of pages")
-    custom_fields: Dict[str, str] = Field(default_factory=dict)
+    author: Optional[str] = Field(None, description="Author information")
+    extra_metadata: Dict[str, str] = Field(
+        default_factory=dict,
+        description="Additional metadata"
+    )

-    @field_validator('file_type')
+    @field_validator('display_name')
     @classmethod
-    def validate_file_type(cls, value: str) -> str:
-        """Ensure file type is lowercase and stripped."""
-        return value.lower().strip()
+    def normalize_display_name(cls, value: str) -> str:
+        """Normalize display name."""
+        return value.strip()

     def get_summary(self) -> str:
         """
@@ -128,28 +183,33 @@ class DocumentMetadata(BaseModel):
             Formatted string containing key metadata information
         """
         summary_parts = [
-            f"File: {self.file_name}",
-            f"Type: {self.file_type}",
-            f"Size: {self._format_file_size()}",
+            f"Source: {self.display_name}",
+            f"Type: {self.source_type.value}",
+            f"Size: {self._format_size()}",
         ]

         if self.author:
             summary_parts.append(f"Author: {self.author}")

-        if self.page_count:
-            summary_parts.append(f"Pages: {self.page_count}")
-
         return " | ".join(summary_parts)

-    def _format_file_size(self) -> str:
-        """Format file size in human-readable format."""
-        size = self.file_size_bytes
+    def _format_size(self) -> str:
+        """Format size in human-readable format."""
+        size = self.size_bytes
         for unit in ['B', 'KB', 'MB', 'GB']:
             if size < 1024.0:
                 return f"{size:.2f} {unit}"
             size /= 1024.0
         return f"{size:.2f} TB"
+
+    def is_file_source(self) -> bool:
+        """Check if this is a file-based source."""
+        return self.source_type == SourceType.FILE
+
+    def is_web_source(self) -> bool:
+        """Check if this is a web-based source."""
+        return self.source_type == SourceType.WEB
 class Document(BaseModel):
     """
@@ -281,6 +341,8 @@ class Chunk(BaseModel):
     """
     Represents a chunk of text extracted from a document.

+    Enhanced to track section membership for precision chunking.
+
     Attributes:
         id: Unique identifier for the chunk
         document_id: ID of the parent document
@@ -288,6 +350,8 @@ class Chunk(BaseModel):
         sequence_number: Order of this chunk in the document
         start_char: Starting character position in original document
         end_char: Ending character position in original document
+        section_title: Title of the section this chunk belongs to
+        section_index: Index of the section in document.sections
         metadata: Optional metadata specific to this chunk
     """
     id: UUID = Field(default_factory=uuid4, description="Unique chunk ID")
@@ -296,6 +360,8 @@ class Chunk(BaseModel):
     sequence_number: int = Field(..., ge=0, description="Chunk order in document")
     start_char: int = Field(..., ge=0, description="Start position in document")
     end_char: int = Field(..., gt=0, description="End position in document")
+    section_title: Optional[str] = Field(None, description="Section title")
+    section_index: Optional[int] = Field(None, ge=0, description="Section index")
     metadata: Dict[str, str] = Field(default_factory=dict)

     model_config = {
@@ -342,6 +408,21 @@ class Chunk(BaseModel):
         search_text = text if case_sensitive else text.lower()
         return search_text in content
+
+    def belongs_to_section(self) -> bool:
+        """Check if this chunk belongs to a specific section."""
+        return self.section_title is not None and self.section_index is not None
+
+    def get_section_context(self) -> str:
+        """
+        Get a string describing the section context.
+
+        Returns:
+            Section context description or 'No section'
+        """
+        if self.belongs_to_section():
+            return f"Section {self.section_index}: {self.section_title}"
+        return "No section"
 class ChunkingStrategy(BaseModel):
     """

View File

@@ -1,14 +1,13 @@
 """
 Outgoing Port - Text Chunker Interface.

-This defines the contract for chunking text into smaller pieces.
+This defines the contract for chunking documents into smaller pieces.
 Different strategies can be implemented as adapters.
 """
 from abc import ABC, abstractmethod
 from typing import List
-from uuid import UUID

-from ...domain.models import Chunk, ChunkingStrategy
+from ...domain.models import Chunk, ChunkingStrategy, Document


 class IChunker(ABC):
@@ -16,26 +15,26 @@ class IChunker(ABC):
     Interface for text chunking strategies.

     Implementations of this interface provide different strategies
-    for splitting text into manageable chunks.
+    for splitting documents into manageable chunks with section awareness.
     """

     @abstractmethod
     def chunk(
         self,
-        text: str,
-        document_id: UUID,
+        document: Document,
         strategy: ChunkingStrategy,
     ) -> List[Chunk]:
         """
-        Split text into chunks according to a strategy.
+        Split document into chunks according to a strategy.
+
+        Chunkers can utilize document.sections for section-aware chunking.

         Args:
-            text: Text content to chunk
-            document_id: ID of the parent document
+            document: Full Document entity with raw_markdown and sections
            strategy: Chunking strategy configuration

         Returns:
-            List of Chunk entities
+            List of Chunk entities with section metadata

         Raises:
             ChunkingError: If chunking fails
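To show how the reworked port is meant to be implemented, here is a minimal section-aware chunker sketch. It is not from this commit: the import paths, the DocumentSection field names (title, content) and the Chunk content field are assumptions, and a real adapter would also respect the ChunkingStrategy limits.

from typing import List

from core.domain.models import Chunk, ChunkingStrategy, Document  # assumed import paths
from core.ports.outgoing.chunker import IChunker


class SectionChunker(IChunker):
    """Sketch: one chunk per parsed section, tagged with its section membership."""

    def chunk(self, document: Document, strategy: ChunkingStrategy) -> List[Chunk]:
        chunks: List[Chunk] = []
        cursor = 0
        for index, section in enumerate(document.sections):
            text = section.content  # DocumentSection field name assumed
            start = document.raw_markdown.find(text, cursor)
            if start < 0:
                start = cursor
            chunks.append(
                Chunk(
                    document_id=document.id,
                    content=text,  # Chunk content field assumed
                    sequence_number=index,
                    start_char=start,
                    end_char=start + len(text),
                    section_title=section.title,  # DocumentSection field name assumed
                    section_index=index,
                )
            )
            cursor = start + len(text)
        return chunks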

View File

@@ -5,9 +5,8 @@ This defines the contract for managing chunking strategies.
 """
 from abc import ABC, abstractmethod
 from typing import List
-from uuid import UUID

-from ...domain.models import Chunk, ChunkingStrategy
+from ...domain.models import Chunk, ChunkingStrategy, Document
 from .chunker import IChunker
@@ -22,23 +21,21 @@ class IChunkingContext(ABC):
     @abstractmethod
     def execute_chunking(
         self,
-        text: str,
-        document_id: UUID,
+        document: Document,
         strategy: ChunkingStrategy,
     ) -> List[Chunk]:
         """
         Execute chunking using the specified strategy.

-        This method is stateless and thread-safe. It selects the appropriate
-        chunker based on the strategy configuration and executes chunking.
+        This method is stateless and thread-safe. It accepts the full
+        Document object (with sections) to enable section-aware chunking.

         Args:
-            text: Text to chunk
-            document_id: ID of parent document
+            document: Full Document entity with raw_markdown and sections
             strategy: Chunking strategy configuration (includes strategy_name)

         Returns:
-            List of chunks
+            List of chunks with section metadata

         Raises:
             ChunkingError: If strategy is not registered or chunking fails

View File

@@ -1,8 +1,8 @@
 """
 Outgoing Port - Text Extractor Interface.

-This defines the contract for extracting text from documents.
-Different adapters can implement this for various file types.
+This defines the contract for extracting content from documents.
+Different adapters can implement this for various file types and sources.
 """
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -16,7 +16,7 @@ class IExtractor(ABC):
     Interface for text extraction from documents.

     Implementations of this interface handle specific file formats
-    (PDF, DOCX, TXT, etc.) and adapt external libraries to the domain.
+    (PDF, DOCX, TXT, etc.) or web sources and return Document entities.
     """

     @abstractmethod
@@ -24,11 +24,14 @@ class IExtractor(ABC):
         """
         Extract text and metadata from a document file.

+        Extractors create Document entities with raw_markdown and metadata.
+        Sections are parsed later in the pipeline.
+
         Args:
             file_path: Path to the document file

         Returns:
-            Document entity with extracted content and metadata
+            Document entity with raw_markdown and metadata populated

         Raises:
             ExtractionError: If extraction fails

View File

@@ -59,21 +59,21 @@ class DocumentProcessorService(ITextProcessor):
         chunking_strategy: ChunkingStrategy,
     ) -> Document:
         """
-        Process a document by extracting, parsing, and storing it.
+        Process a document using the stateless pipeline.

-        New Pragmatic Pipeline:
-        1. Extract: Get raw Markdown from SourceFile using extractor
-        2. Parse: Use parse_markdown to create structured sections
-        3. Assemble: Create rich Document with raw_markdown + sections
-        4. Persist: Save to repository
-        5. Finalize: Mark as processed
+        Pipeline Order:
+        1. Extract Document with raw_markdown and metadata (via Adapter)
+        2. Parse Markdown into DocumentSection objects
+        3. Update Document with sections
+        4. Validate and persist Document
+        5. Mark as processed

         Args:
             file_path: Path to the document file
             chunking_strategy: Strategy configuration (for metadata)

         Returns:
-            Processed Document entity with structured sections
+            Fully processed Document entity

         Raises:
             ExtractionError: If text extraction fails
@@ -83,15 +83,14 @@ class DocumentProcessorService(ITextProcessor):
         try:
             logger.info(f"Processing document: {file_path}")

-            # Step 1: Extract raw Markdown from SourceFile
-            source_file = self._create_source_file(file_path)
-            document = self._extract_from_source(source_file)
+            # Step 1: Extract Document with raw_markdown and metadata
+            document = self._extract_document(file_path)

             # Step 2: Parse Markdown into structured sections
             sections = parse_markdown(document.raw_markdown)
             logger.debug(f"Parsed {len(sections)} sections from document")

-            # Step 3: Assemble rich Document model
+            # Step 3: Update Document with sections
             document = document.model_copy(update={"sections": sections})

             # Step 4: Validate document content
@@ -100,7 +99,7 @@ class DocumentProcessorService(ITextProcessor):
             # Step 5: Persist to repository
             saved_document = self._repository.save(document)

-            # Step 6: Finalize - mark as processed
+            # Step 6: Mark as processed
             saved_document.mark_as_processed()
             self._repository.save(saved_document)
@@ -128,17 +127,17 @@ class DocumentProcessorService(ITextProcessor):
         Extract text from document and split into chunks.

         Pipeline:
-        1. Extract raw Markdown from SourceFile
+        1. Extract Document with raw_markdown and metadata
         2. Parse into structured sections
-        3. Apply chunking strategy to raw content
-        4. Return chunks
+        3. Update Document with sections
+        4. Apply chunking strategy with section awareness

         Args:
             file_path: Path to the document file
             chunking_strategy: Strategy configuration for chunking

         Returns:
-            List of text chunks
+            List of chunks with section metadata

         Raises:
             ExtractionError: If text extraction fails
@@ -147,15 +146,16 @@ class DocumentProcessorService(ITextProcessor):
         try:
             logger.info(f"Extracting and chunking: {file_path}")

-            # Extract from source
-            source_file = self._create_source_file(file_path)
-            document = self._extract_from_source(source_file)
+            # Extract Document
+            document = self._extract_document(file_path)

             # Parse sections
             sections = parse_markdown(document.raw_markdown)
+
+            # Update Document with sections
             document = document.model_copy(update={"sections": sections})

-            # Chunk using strategy
+            # Chunk using strategy (section-aware)
             chunks = self._chunk_document(document, chunking_strategy)
             logger.info(f"Created {len(chunks)} chunks from document")
@@ -223,43 +223,24 @@ class DocumentProcessorService(ITextProcessor):
         return self._repository.delete(document_id)

-    def _create_source_file(self, file_path: Path) -> SourceFile:
+    def _extract_document(self, file_path: Path) -> Document:
         """
-        Create a SourceFile model from a file path.
+        Extract Document using appropriate extractor.
+
+        Extractors create Document entities with raw_markdown and metadata.
+        Sections will be parsed later in the pipeline.

         Args:
-            file_path: Path to the source file
+            file_path: Path to document file

         Returns:
-            SourceFile entity
-
-        Raises:
-            ValueError: If file doesn't exist or is invalid
-        """
-        if not file_path.exists():
-            raise ValueError(f"File does not exist: {file_path}")
-
-        return SourceFile(
-            path=file_path,
-            extension=file_path.suffix.lstrip('.'),
-            size_bytes=file_path.stat().st_size,
-        )
-
-    def _extract_from_source(self, source_file: SourceFile) -> Document:
-        """
-        Extract raw Markdown from SourceFile using appropriate extractor.
-
-        Args:
-            source_file: Source file to extract from
-
-        Returns:
-            Document entity with raw_markdown populated
+            Document entity with raw_markdown and metadata (sections empty)

         Raises:
             ExtractionError: If extraction fails
         """
-        extractor = self._extractor_factory.create_extractor(source_file.path)
-        return extractor.extract(source_file.path)
+        extractor = self._extractor_factory.create_extractor(file_path)
+        return extractor.extract(file_path)

     def _chunk_document(
         self,
@@ -267,20 +248,20 @@ class DocumentProcessorService(ITextProcessor):
         strategy: ChunkingStrategy,
     ) -> List[Chunk]:
         """
-        Chunk document using specified strategy.
+        Chunk document using specified strategy with section awareness.

         This method is thread-safe as it delegates to a stateless
-        chunking context that selects the strategy based on configuration.
+        chunking context. The full Document (with sections) is passed
+        to enable section-aware chunking.

         Args:
-            document: Document to chunk
+            document: Full Document entity with sections
             strategy: Chunking strategy configuration

         Returns:
-            List of chunks
+            List of chunks with section metadata
         """
         return self._chunking_context.execute_chunking(
-            text=document.content,
-            document_id=document.id,
+            document=document,
             strategy=strategy,
         )
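For callers that invoke the chunking context directly, the call-site migration introduced by this commit looks like the following sketch (variable names are illustrative):

# Before: raw text plus the parent document id
chunks = chunking_context.execute_chunking(
    text=document.content,
    document_id=document.id,
    strategy=strategy,
)

# After: the full Document, so chunkers can read document.sections
chunks = chunking_context.execute_chunking(
    document=document,
    strategy=strategy,
)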