"""
|
|
Core Domain Models - Rich Pydantic v2 Entities with Internal Validation.
|
|
|
|
This module contains the domain entities that represent the core business concepts.
|
|
All models are immutable by default and include comprehensive validation.
|
|
"""
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional
|
|
from uuid import UUID, uuid4
|
|
|
|
from pydantic import BaseModel, Field, field_validator, model_validator
|
|
|
|
|
|
class SourceType(str, Enum):
    """Enumeration of supported source types."""

    FILE = "file"
    WEB = "web"
    TEXT = "text"


class ChunkingMethod(str, Enum):
    """Enumeration of supported chunking methods."""

    FIXED_SIZE = "fixed_size"
    PARAGRAPH = "paragraph"


class SourceFile(BaseModel):
    """
    Represents the raw input file before processing.

    This model encapsulates file system information about the document source.
    Flow: SourceFile -> Extraction -> Document

    Attributes:
        path: Absolute path to the source file
        extension: File extension (e.g., 'md', 'pdf', 'docx')
        size_bytes: Size of the file in bytes
    """

    path: Path = Field(..., description="Absolute path to source file")
    extension: str = Field(..., min_length=1, description="File extension")
    size_bytes: int = Field(..., ge=0, description="File size in bytes")

    model_config = {
        "frozen": True,  # SourceFile is immutable
    }

    @field_validator('extension')
    @classmethod
    def normalize_extension(cls, value: str) -> str:
        """Normalize extension to lowercase without leading dot."""
        normalized = value.lower().strip()
        return normalized.lstrip('.')

    @field_validator('path')
    @classmethod
    def validate_path_exists(cls, value: Path) -> Path:
        """Validate that the path exists."""
        if not value.exists():
            raise ValueError(f"Source file does not exist: {value}")
        if not value.is_file():
            raise ValueError(f"Path is not a file: {value}")
        return value

    def get_file_name(self) -> str:
        """Get the filename without path."""
        return self.path.name

    def get_file_stem(self) -> str:
        """Get the filename without extension."""
        return self.path.stem


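# Illustrative usage sketch (for demonstration only): shows how a SourceFile is
# typically built from a path on disk. This module's own file is used here
# simply because the path validator requires an existing regular file; any real
# caller would pass the document it wants to ingest.
def _example_source_file() -> SourceFile:
    """Build a SourceFile for this module itself; values are derived, not assumed."""
    module_path = Path(__file__).resolve()
    return SourceFile(
        path=module_path,
        extension=module_path.suffix,  # '.py' -> normalized to 'py' by the validator
        size_bytes=module_path.stat().st_size,
    )

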
class WebPageSource(BaseModel):
    """
    Represents a web page source for document extraction.

    This model encapsulates URL information about the document source.
    Flow: WebPageSource -> Extraction -> Document

    Attributes:
        url: URL of the web page
        display_name: Human-readable name (e.g., 'about_us.html')
        content_length: Optional content length in bytes
    """

    url: str = Field(..., min_length=1, description="Web page URL")
    display_name: str = Field(..., min_length=1, description="Display name")
    content_length: Optional[int] = Field(None, ge=0, description="Content length")

    model_config = {
        "frozen": True,  # WebPageSource is immutable
    }

    @field_validator('url')
    @classmethod
    def validate_url(cls, value: str) -> str:
        """Validate URL format."""
        value = value.strip()
        if not (value.startswith('http://') or value.startswith('https://')):
            raise ValueError(f"URL must start with http:// or https://: {value}")
        return value

    @field_validator('display_name')
    @classmethod
    def normalize_display_name(cls, value: str) -> str:
        """Normalize display name."""
        return value.strip()

    def get_domain(self) -> str:
        """Extract domain from URL."""
        from urllib.parse import urlparse

        parsed = urlparse(self.url)
        return parsed.netloc


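# Illustrative usage sketch (for demonstration only): the URL, display name,
# and content length below are assumptions chosen for the example.
def _example_web_page_source() -> WebPageSource:
    """Build a WebPageSource and show the derived domain."""
    source = WebPageSource(
        url="https://example.com/about_us.html",
        display_name="about_us.html",
        content_length=2048,
    )
    # source.get_domain() -> "example.com"
    return source

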
class DocumentSection(BaseModel):
    """
    Represents a structured section of a Markdown document.

    Sections are created by parsing Markdown headers. Text before the first
    header is grouped into an "Introduction" section.

    Attributes:
        title: Section title (from header or "Introduction")
        level: Header level (1-6 for h1-h6, 0 for Introduction)
        content: Section content with preserved Markdown formatting
    """

    title: Optional[str] = Field(None, min_length=1, description="Section title")
    level: int = Field(..., ge=0, le=6, description="Header level (0=intro)")
    content: str = Field(..., description="Section content with formatting")

    model_config = {
        "frozen": True,  # Sections are immutable
    }

    @field_validator('title')
    @classmethod
    def normalize_title(cls, value: Optional[str]) -> Optional[str]:
        """Normalize title by stripping whitespace."""
        if value:
            return value.strip()
        return value

    def is_introduction(self) -> bool:
        """Check if this is the introduction section."""
        return self.level == 0 and self.title == "Introduction"

    def get_word_count(self) -> int:
        """Get approximate word count of section content."""
        return len(self.content.split())


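# Illustrative usage sketch (for demonstration only): the two typical section
# shapes, an "Introduction" at level 0 and a section parsed from an '##' header.
# The titles and content are assumptions for the example.
def _example_sections() -> List[DocumentSection]:
    """Build a minimal pair of sections as a Markdown parser might emit them."""
    intro = DocumentSection(
        title="Introduction",
        level=0,
        content="Text that appeared before the first header.",
    )
    overview = DocumentSection(
        title="Overview",
        level=2,
        content="Details about the overview.",
    )
    # intro.is_introduction() -> True; overview.get_word_count() -> 4
    return [intro, overview]

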
class DocumentMetadata(BaseModel):
    """
    Source-neutral metadata for documents.

    This metadata works for both file-based and web-based sources,
    enabling a unified processing pipeline.

    Attributes:
        source_id: Path or URL identifying the source
        source_type: Type of source (FILE, WEB, or TEXT)
        size_bytes: Size in bytes (file size or content length)
        created_at: Timestamp when metadata was created
        author: Optional author information
        extra_metadata: Additional source-specific metadata
    """

    source_id: str = Field(..., min_length=1, description="Path or URL")
    source_type: SourceType = Field(..., description="Source type enum")
    size_bytes: int = Field(..., ge=0, description="Size in bytes")
    created_at: datetime = Field(default_factory=datetime.utcnow)
    author: Optional[str] = Field(None, description="Author information")
    extra_metadata: Dict[str, str] = Field(
        default_factory=dict,
        description="Additional metadata"
    )

    def _format_size(self) -> str:
        """Format size in a human-readable form (B, KB, MB, GB, TB)."""
        size = float(self.size_bytes)
        for unit in ['B', 'KB', 'MB', 'GB']:
            if size < 1024.0:
                return f"{size:.2f} {unit}"
            size /= 1024.0
        return f"{size:.2f} TB"

    def is_file_source(self) -> bool:
        """Check if this is a file-based source."""
        return self.source_type == SourceType.FILE

    def is_web_source(self) -> bool:
        """Check if this is a web-based source."""
        return self.source_type == SourceType.WEB


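# Illustrative usage sketch (for demonstration only): source-neutral metadata
# for a hypothetical web source; all values are assumptions.
def _example_metadata() -> DocumentMetadata:
    """Build DocumentMetadata for a fetched web page."""
    metadata = DocumentMetadata(
        source_id="https://example.com/about_us.html",
        source_type=SourceType.WEB,
        size_bytes=2048,
        extra_metadata={"fetched_by": "example-crawler"},
    )
    # metadata.is_web_source() -> True
    # metadata._format_size() -> "2.00 KB"
    return metadata

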
class Document(BaseModel):
    """
    Core domain entity representing a document with extracted and structured content.

    This rich model contains both the raw Markdown and parsed sections,
    enabling flexible querying and processing strategies.

    Attributes:
        id: Unique identifier for the document
        raw_markdown: Raw Markdown text extracted from source
        title: Document title
        sections: Parsed structured sections from Markdown
        metadata: Associated metadata
        is_processed: Flag indicating if document has been processed
        download_url: Optional presigned URL for downloading the markdown file
    """

    id: UUID = Field(default_factory=uuid4, description="Unique document ID")
    raw_markdown: str = Field(..., description="Raw Markdown content")
    title: str = Field(..., description="Document title")
    sections: List[DocumentSection] = Field(
        default_factory=list,
        description="Structured document sections"
    )
    metadata: DocumentMetadata = Field(..., description="Document metadata")
    is_processed: bool = Field(default=False, description="Processing status")
    download_url: Optional[str] = Field(None, description="Presigned download URL")

    model_config = {
        "frozen": False,  # Allow mutation for processing status
        "str_strip_whitespace": True,
    }

    @field_validator('raw_markdown')
    @classmethod
    def validate_content_not_empty(cls, value: str) -> str:
        """Ensure content is not empty or just whitespace."""
        if not value or not value.strip():
            raise ValueError("Document content cannot be empty")
        return value

    @property
    def content(self) -> str:
        """
        Backward compatibility property for raw content access.

        Returns:
            Raw markdown content
        """
        return self.raw_markdown

    def validate_content(self) -> bool:
        """
        Validate that the document content meets quality standards.

        Returns:
            True if content is valid, raises ValueError otherwise

        Raises:
            ValueError: If content fails validation checks
        """
        # Check minimum length
        if len(self.raw_markdown.strip()) < 10:
            raise ValueError("Document content is too short (minimum 10 characters)")

        # Check for suspicious patterns (e.g., too many special characters)
        special_char_ratio = sum(
            not c.isalnum() and not c.isspace()
            for c in self.raw_markdown
        ) / len(self.raw_markdown)

        if special_char_ratio > 0.5:
            raise ValueError(
                f"Document content has too many special characters ({special_char_ratio:.2%})"
            )

        return True

    def mark_as_processed(self) -> None:
        """Mark the document as processed."""
        self.is_processed = True

    def get_content_preview(self, length: int = 100) -> str:
        """
        Get a preview of the document content.

        Args:
            length: Maximum length of preview

        Returns:
            Truncated content with ellipsis if needed
        """
        if len(self.raw_markdown) <= length:
            return self.raw_markdown
        return f"{self.raw_markdown[:length]}..."

    def get_section_count(self) -> int:
        """Get the number of sections in the document."""
        return len(self.sections)

    def get_sections_by_level(self, level: int) -> List[DocumentSection]:
        """
        Get all sections at a specific header level.

        Args:
            level: Header level to filter by (0-6)

        Returns:
            List of sections at the specified level
        """
        return [section for section in self.sections if section.level == level]

    def get_section_titles(self) -> List[Optional[str]]:
        """
        Get all section titles in document order.

        Returns:
            List of section titles (None for untitled sections)
        """
        return [section.title for section in self.sections]


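# Illustrative usage sketch (for demonstration only): assembles a Document from
# raw Markdown plus the sections a parser would produce for it. The Markdown
# text and metadata values are assumptions for the example.
def _example_document() -> Document:
    """Build a small two-section Document for a hypothetical inline-text source."""
    markdown = "Text before any header.\n\n## Overview\n\nDetails about the overview."
    document = Document(
        raw_markdown=markdown,
        title="Example Document",
        sections=[
            DocumentSection(title="Introduction", level=0, content="Text before any header."),
            DocumentSection(title="Overview", level=2, content="Details about the overview."),
        ],
        metadata=DocumentMetadata(
            source_id="example-inline-text",
            source_type=SourceType.TEXT,
            size_bytes=len(markdown.encode("utf-8")),
        ),
    )
    # document.get_section_count() -> 2
    # document.get_sections_by_level(2) -> [the "Overview" section]
    # document.mark_as_processed() flips is_processed; this model is not frozen.
    return document

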
class Chunk(BaseModel):
    """
    Represents a chunk of text extracted from a document.

    Enhanced to track section membership for precision chunking.

    Attributes:
        id: Unique identifier for the chunk
        document_id: ID of the parent document
        content: Text content of the chunk
        sequence_number: Order of this chunk in the document
        section_title: Title of the section this chunk belongs to
        section_index: Index of the section in document.sections
        metadata: Optional metadata specific to this chunk
    """

    id: UUID = Field(default_factory=uuid4, description="Unique chunk ID")
    document_id: UUID = Field(..., description="Parent document ID")
    content: str = Field(..., min_length=1, description="Chunk text content")
    sequence_number: int = Field(..., ge=0, description="Chunk order in document")
    section_title: Optional[str] = Field(None, description="Section title")
    section_index: Optional[int] = Field(None, ge=0, description="Section index")
    metadata: Dict[str, str] = Field(default_factory=dict)

    model_config = {
        "frozen": True,  # Chunks are immutable
    }

    def get_length(self) -> int:
        """Get the length of the chunk content."""
        return len(self.content)

    def contains_text(self, text: str, case_sensitive: bool = False) -> bool:
        """
        Check if chunk contains specific text.

        Args:
            text: Text to search for
            case_sensitive: Whether search should be case-sensitive

        Returns:
            True if text is found in chunk
        """
        content = self.content if case_sensitive else self.content.lower()
        search_text = text if case_sensitive else text.lower()
        return search_text in content

    def belongs_to_section(self) -> bool:
        """Check if this chunk belongs to a specific section."""
        return self.section_title is not None and self.section_index is not None

    def get_section_context(self) -> str:
        """
        Get a string describing the section context.

        Returns:
            Section context description or 'No section'
        """
        if self.belongs_to_section():
            return f"Section {self.section_index}: {self.section_title}"
        return "No section"


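# Illustrative usage sketch (for demonstration only): a chunk carrying its
# section context; the parent ID, text, and indices are assumptions.
def _example_chunk() -> Chunk:
    """Build a Chunk attributed to the second section of some parent document."""
    chunk = Chunk(
        document_id=uuid4(),
        content="Details about the overview.",
        sequence_number=0,
        section_title="Overview",
        section_index=1,
    )
    # chunk.belongs_to_section() -> True
    # chunk.get_section_context() -> "Section 1: Overview"
    # chunk.contains_text("DETAILS") -> True (search is case-insensitive by default)
    return chunk

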
class ChunkingStrategy(BaseModel):
    """
    Configuration for a chunking strategy.

    Attributes:
        strategy_name: Chunking method (fixed_size or paragraph)
        chunk_size: Target size for chunks (in characters)
        overlap_size: Number of characters to overlap between chunks
        respect_boundaries: Whether to respect sentence/paragraph boundaries
    """

    strategy_name: ChunkingMethod = Field(..., description="Chunking method")
    chunk_size: int = Field(..., ge=1, le=10000, description="Target chunk size")
    overlap_size: int = Field(default=0, ge=0, description="Overlap between chunks")
    respect_boundaries: bool = Field(
        default=True,
        description="Respect text boundaries"
    )

    @model_validator(mode='after')
    def validate_overlap_less_than_size(self) -> 'ChunkingStrategy':
        """Ensure overlap is less than chunk size."""
        if self.overlap_size >= self.chunk_size:
            raise ValueError(
                f"overlap_size ({self.overlap_size}) must be less than "
                f"chunk_size ({self.chunk_size})"
            )
        return self

    def calculate_effective_step(self) -> int:
        """
        Calculate the effective step size between chunks.

        Returns:
            Number of characters to advance for next chunk
        """
        return self.chunk_size - self.overlap_size


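# Illustrative usage sketch (for demonstration only): a fixed-size strategy with
# overlap; the sizes below are assumptions chosen for the example.
def _example_chunking_strategy() -> ChunkingStrategy:
    """Build a 500-character strategy with a 50-character overlap."""
    strategy = ChunkingStrategy(
        strategy_name=ChunkingMethod.FIXED_SIZE,
        chunk_size=500,
        overlap_size=50,
    )
    # strategy.calculate_effective_step() -> 450 characters advanced per chunk.
    # An overlap_size >= chunk_size would be rejected by the model validator.
    return strategy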