""" Core Domain Models - Rich Pydantic v2 Entities with Internal Validation. This module contains the domain entities that represent the core business concepts. All models are immutable by default and include comprehensive validation. """ from datetime import datetime from enum import Enum from pathlib import Path from typing import Dict, List, Optional from uuid import UUID, uuid4 from pydantic import BaseModel, Field, field_validator, model_validator class SourceType(str, Enum): """Enumeration of supported source types.""" FILE = "file" WEB = "web" TEXT = "text" class ChunkingMethod(str, Enum): """Enumeration of supported chunking methods.""" FIXED_SIZE = "fixed_size" PARAGRAPH = "paragraph" class SourceFile(BaseModel): """ Represents the raw input file before processing. This model encapsulates file system information about the document source. Flow: SourceFile -> Extraction -> Document Attributes: path: Absolute path to the source file extension: File extension (e.g., 'md', 'pdf', 'docx') size_bytes: Size of the file in bytes """ path: Path = Field(..., description="Absolute path to source file") extension: str = Field(..., min_length=1, description="File extension") size_bytes: int = Field(..., ge=0, description="File size in bytes") model_config = { "frozen": True, # SourceFile is immutable } @field_validator('extension') @classmethod def normalize_extension(cls, value: str) -> str: """Normalize extension to lowercase without leading dot.""" normalized = value.lower().strip() return normalized.lstrip('.') @field_validator('path') @classmethod def validate_path_exists(cls, value: Path) -> Path: """Validate that the path exists.""" if not value.exists(): raise ValueError(f"Source file does not exist: {value}") if not value.is_file(): raise ValueError(f"Path is not a file: {value}") return value def get_file_name(self) -> str: """Get the filename without path.""" return self.path.name def get_file_stem(self) -> str: """Get the filename without extension.""" return self.path.stem class WebPageSource(BaseModel): """ Represents a web page source for document extraction. This model encapsulates URL information about the document source. Flow: WebPageSource -> Extraction -> Document Attributes: url: URL of the web page display_name: Human-readable name (e.g., 'about_us.html') content_length: Optional content length in bytes """ url: str = Field(..., min_length=1, description="Web page URL") display_name: str = Field(..., min_length=1, description="Display name") content_length: Optional[int] = Field(None, ge=0, description="Content length") model_config = { "frozen": True, # WebPageSource is immutable } @field_validator('url') @classmethod def validate_url(cls, value: str) -> str: """Validate URL format.""" value = value.strip() if not (value.startswith('http://') or value.startswith('https://')): raise ValueError(f"URL must start with http:// or https://: {value}") return value @field_validator('display_name') @classmethod def normalize_display_name(cls, value: str) -> str: """Normalize display name.""" return value.strip() def get_domain(self) -> str: """Extract domain from URL.""" from urllib.parse import urlparse parsed = urlparse(self.url) return parsed.netloc class DocumentSection(BaseModel): """ Represents a structured section of a Markdown document. Sections are created by parsing Markdown headers. Text before the first header is grouped into an "Introduction" section. Attributes: title: Section title (from header or "Introduction") level: Header level (1-6 for h1-h6, 0 for Introduction) content: Section content with preserved Markdown formatting """ title: Optional[str] = Field(None, min_length=1, description="Section title") level: int = Field(..., ge=0, le=6, description="Header level (0=intro)") content: str = Field(..., description="Section content with formatting") model_config = { "frozen": True, # Sections are immutable } @field_validator('title') @classmethod def normalize_title(cls, value: str) -> str: """Normalize title by stripping whitespace.""" if value: return value.strip() return value def is_introduction(self) -> bool: """Check if this is the introduction section.""" return self.level == 0 and self.title == "Introduction" def get_word_count(self) -> int: """Get approximate word count of section content.""" return len(self.content.split()) class DocumentMetadata(BaseModel): """ Source-neutral metadata for documents. This metadata works for both file-based and web-based sources, enabling a unified processing pipeline. Attributes: source_id: Path or URL identifying the source source_type: Type of source (FILE or WEB) size_bytes: Size in bytes (file size or content length) created_at: Timestamp when metadata was created author: Optional author information extra_metadata: Additional source-specific metadata """ source_id: str = Field(..., min_length=1, description="Path or URL") source_type: SourceType = Field(..., description="Source type enum") size_bytes: int = Field(..., ge=0, description="Size in bytes") created_at: datetime = Field(default_factory=datetime.utcnow) author: Optional[str] = Field(None, description="Author information") extra_metadata: Dict[str, str] = Field( default_factory=dict, description="Additional metadata" ) def _format_size(self) -> str: """Format size in human-readable format.""" size = self.size_bytes for unit in ['B', 'KB', 'MB', 'GB']: if size < 1024.0: return f"{size:.2f} {unit}" size /= 1024.0 return f"{size:.2f} TB" def is_file_source(self) -> bool: """Check if this is a file-based source.""" return self.source_type == SourceType.FILE def is_web_source(self) -> bool: """Check if this is a web-based source.""" return self.source_type == SourceType.WEB class Document(BaseModel): """ Core domain entity representing a document with extracted and structured content. This rich model contains both the raw Markdown and parsed sections, enabling flexible querying and processing strategies. Attributes: id: Unique identifier for the document raw_markdown: Raw Markdown text extracted from source sections: Parsed structured sections from Markdown metadata: Associated metadata is_processed: Flag indicating if document has been processed download_url: Optional presigned URL for downloading the markdown file """ id: UUID = Field(default_factory=uuid4, description="Unique document ID") raw_markdown: str = Field(..., description="Raw Markdown content") title: str = Field(..., description="Document title") sections: List[DocumentSection] = Field( default_factory=list, description="Structured document sections" ) metadata: DocumentMetadata = Field(..., description="Document metadata") is_processed: bool = Field(default=False, description="Processing status") download_url: Optional[str] = Field(None, description="Presigned download URL") model_config = { "frozen": False, # Allow mutation for processing status "str_strip_whitespace": True, } @field_validator('raw_markdown') @classmethod def validate_content_not_empty(cls, value: str) -> str: """Ensure content is not empty or just whitespace.""" if not value or not value.strip(): raise ValueError("Document content cannot be empty") return value @property def content(self) -> str: """ Backward compatibility property for raw content access. Returns: Raw markdown content """ return self.raw_markdown def validate_content(self) -> bool: """ Validate that the document content meets quality standards. Returns: True if content is valid, raises ValueError otherwise Raises: ValueError: If content fails validation checks """ # Check minimum length if len(self.raw_markdown.strip()) < 10: raise ValueError("Document content is too short (minimum 10 characters)") # Check for suspicious patterns (e.g., too many special characters) special_char_ratio = sum( not c.isalnum() and not c.isspace() for c in self.raw_markdown ) / len(self.raw_markdown) if special_char_ratio > 0.5: raise ValueError( f"Document content has too many special characters ({special_char_ratio:.2%})" ) return True def mark_as_processed(self) -> None: """Mark the document as processed.""" self.is_processed = True def get_content_preview(self, length: int = 100) -> str: """ Get a preview of the document content. Args: length: Maximum length of preview Returns: Truncated content with ellipsis if needed """ if len(self.raw_markdown) <= length: return self.raw_markdown return f"{self.raw_markdown[:length]}..." def get_section_count(self) -> int: """Get the number of sections in the document.""" return len(self.sections) def get_sections_by_level(self, level: int) -> List[DocumentSection]: """ Get all sections at a specific header level. Args: level: Header level to filter by (0-6) Returns: List of sections at the specified level """ return [section for section in self.sections if section.level == level] def get_section_titles(self) -> List[str]: """ Get all section titles in document order. Returns: List of section titles """ return [section.title for section in self.sections] class Chunk(BaseModel): """ Represents a chunk of text extracted from a document. Enhanced to track section membership for precision chunking. Attributes: id: Unique identifier for the chunk document_id: ID of the parent document content: Text content of the chunk sequence_number: Order of this chunk in the document section_title: Title of the section this chunk belongs to section_index: Index of the section in document.sections metadata: Optional metadata specific to this chunk """ id: UUID = Field(default_factory=uuid4, description="Unique chunk ID") document_id: UUID = Field(..., description="Parent document ID") content: str = Field(..., min_length=1, description="Chunk text content") sequence_number: int = Field(..., ge=0, description="Chunk order in document") section_title: Optional[str] = Field(None, description="Section title") section_index: Optional[int] = Field(None, ge=0, description="Section index") metadata: Dict[str, str] = Field(default_factory=dict) model_config = { "frozen": True, # Chunks are immutable } def get_length(self) -> int: """Get the length of the chunk content.""" return len(self.content) def contains_text(self, text: str, case_sensitive: bool = False) -> bool: """ Check if chunk contains specific text. Args: text: Text to search for case_sensitive: Whether search should be case-sensitive Returns: True if text is found in chunk """ content = self.content if case_sensitive else self.content.lower() search_text = text if case_sensitive else text.lower() return search_text in content def belongs_to_section(self) -> bool: """Check if this chunk belongs to a specific section.""" return self.section_title is not None and self.section_index is not None def get_section_context(self) -> str: """ Get a string describing the section context. Returns: Section context description or 'No section' """ if self.belongs_to_section(): return f"Section {self.section_index}: {self.section_title}" return "No section" class ChunkingStrategy(BaseModel): """ Configuration for a chunking strategy. Attributes: strategy_name: Chunking method (fixed_size or paragraph) chunk_size: Target size for chunks (in characters) overlap_size: Number of characters to overlap between chunks respect_boundaries: Whether to respect sentence/paragraph boundaries """ strategy_name: ChunkingMethod = Field(..., description="Chunking method") chunk_size: int = Field(..., ge=1, le=10000, description="Target chunk size") overlap_size: int = Field(default=0, ge=0, description="Overlap between chunks") respect_boundaries: bool = Field( default=True, description="Respect text boundaries" ) @model_validator(mode='after') def validate_overlap_less_than_size(self) -> 'ChunkingStrategy': """Ensure overlap is less than chunk size.""" if self.overlap_size >= self.chunk_size: raise ValueError( f"overlap_size ({self.overlap_size}) must be less than " f"chunk_size ({self.chunk_size})" ) return self def calculate_effective_step(self) -> int: """ Calculate the effective step size between chunks. Returns: Number of characters to advance for next chunk """ return self.chunk_size - self.overlap_size