make the domain general and open to add crawling system

This commit is contained in:
m.dabbagh 2026-01-08 04:57:35 +03:30
parent 359026fa98
commit 2c375ce6bd
8 changed files with 183 additions and 119 deletions

View File

@@ -12,7 +12,7 @@ from ....core.domain.exceptions import (
     EmptyContentError,
     ExtractionError,
 )
-from ....core.domain.models import Document, DocumentMetadata
+from ....core.domain.models import Document, DocumentMetadata, SourceType
 from ....core.ports.outgoing.extractor import IExtractor
@@ -209,7 +209,7 @@ class DocxExtractor(IExtractor):
     def _create_metadata(self, file_path: Path) -> DocumentMetadata:
         """
-        Create document metadata from file.
+        Create source-neutral document metadata from file.

         Args:
             file_path: Path to the file
@@ -220,7 +220,8 @@ class DocxExtractor(IExtractor):
         stat = file_path.stat()

         return DocumentMetadata(
-            file_name=file_path.name,
-            file_type=file_path.suffix.lstrip('.').lower(),
-            file_size_bytes=stat.st_size,
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            display_name=file_path.name,
+            size_bytes=stat.st_size,
         )

View File

@@ -12,7 +12,7 @@ from ....core.domain.exceptions import (
     EmptyContentError,
     ExtractionError,
 )
-from ....core.domain.models import Document, DocumentMetadata
+from ....core.domain.models import Document, DocumentMetadata, SourceType
 from ....core.ports.outgoing.extractor import IExtractor
@@ -200,7 +200,7 @@ class PDFExtractor(IExtractor):
     def _create_metadata(self, file_path: Path) -> DocumentMetadata:
         """
-        Create document metadata from file.
+        Create source-neutral document metadata from file.

         Args:
             file_path: Path to the file
@@ -211,7 +211,8 @@ class PDFExtractor(IExtractor):
         stat = file_path.stat()

         return DocumentMetadata(
-            file_name=file_path.name,
-            file_type=file_path.suffix.lstrip('.').lower(),
-            file_size_bytes=stat.st_size,
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            display_name=file_path.name,
+            size_bytes=stat.st_size,
         )

View File

@@ -12,7 +12,7 @@ from ....core.domain.exceptions import (
     EmptyContentError,
     ExtractionError,
 )
-from ....core.domain.models import Document, DocumentMetadata
+from ....core.domain.models import Document, DocumentMetadata, SourceType
 from ....core.ports.outgoing.extractor import IExtractor
@@ -187,7 +187,7 @@ class TxtExtractor(IExtractor):
     def _create_metadata(self, file_path: Path) -> DocumentMetadata:
         """
-        Create document metadata from file.
+        Create source-neutral document metadata from file.

         Args:
             file_path: Path to the file
@@ -198,7 +198,8 @@ class TxtExtractor(IExtractor):
         stat = file_path.stat()

         return DocumentMetadata(
-            file_name=file_path.name,
-            file_type=file_path.suffix.lstrip('.').lower(),
-            file_size_bytes=stat.st_size,
+            source_id=str(file_path.absolute()),
+            source_type=SourceType.FILE,
+            display_name=file_path.name,
+            size_bytes=stat.st_size,
         )
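All three extractors above now build the same source-neutral DocumentMetadata. For comparison, a crawling adapter added later could fill the same fields from a fetched page. The sketch below is not part of this commit; the import path, helper name, and the content-type entry in extra_metadata are assumptions, only the DocumentMetadata and SourceType fields come from this diff.

# Hypothetical sketch, not from this commit: the web-side counterpart of
# _create_metadata above. Import path and helper name are assumed.
from core.domain.models import DocumentMetadata, SourceType

def create_web_metadata(page_url: str, page_html: str) -> DocumentMetadata:
    """Map a fetched page onto the generalized DocumentMetadata."""
    return DocumentMetadata(
        source_id=page_url,                            # URL instead of an absolute file path
        source_type=SourceType.WEB,                    # the extractors above use SourceType.FILE
        display_name=page_url.rstrip('/').rsplit('/', 1)[-1] or page_url,
        size_bytes=len(page_html.encode("utf-8")),
        extra_metadata={"content_type": "text/html"},  # assumed use of extra_metadata
    )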

View File

@@ -5,6 +5,7 @@ This module contains the domain entities that represent the core business concep
 All models are immutable by default and include comprehensive validation.
 """
 from datetime import datetime
+from enum import Enum
 from pathlib import Path
 from typing import Dict, List, Optional
 from uuid import UUID, uuid4
@@ -12,6 +13,12 @@ from uuid import UUID, uuid4
 from pydantic import BaseModel, Field, field_validator, model_validator


+class SourceType(str, Enum):
+    """Enumeration of supported source types."""
+    FILE = "file"
+    WEB = "web"
+
+
 class SourceFile(BaseModel):
     """
     Represents the raw input file before processing.
@@ -58,6 +65,48 @@ class SourceFile(BaseModel):
         return self.path.stem
+class WebPageSource(BaseModel):
+    """
+    Represents a web page source for document extraction.
+
+    This model encapsulates URL information about the document source.
+    Flow: WebPageSource -> Extraction -> Document
+
+    Attributes:
+        url: URL of the web page
+        display_name: Human-readable name (e.g., 'about_us.html')
+        content_length: Optional content length in bytes
+    """
+    url: str = Field(..., min_length=1, description="Web page URL")
+    display_name: str = Field(..., min_length=1, description="Display name")
+    content_length: Optional[int] = Field(None, ge=0, description="Content length")
+
+    model_config = {
+        "frozen": True,  # WebPageSource is immutable
+    }
+
+    @field_validator('url')
+    @classmethod
+    def validate_url(cls, value: str) -> str:
+        """Validate URL format."""
+        value = value.strip()
+        if not (value.startswith('http://') or value.startswith('https://')):
+            raise ValueError(f"URL must start with http:// or https://: {value}")
+        return value
+
+    @field_validator('display_name')
+    @classmethod
+    def normalize_display_name(cls, value: str) -> str:
+        """Normalize display name."""
+        return value.strip()
+
+    def get_domain(self) -> str:
+        """Extract domain from URL."""
+        from urllib.parse import urlparse
+        parsed = urlparse(self.url)
+        return parsed.netloc
+
+
 class DocumentSection(BaseModel):
     """
     Represents a structured section of a Markdown document.
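A short usage sketch of the WebPageSource model added above; the import path is an assumption, everything else follows the fields and validators in this commit.

from core.domain.models import WebPageSource  # assumed import path

page = WebPageSource(
    url="https://example.com/docs/about_us.html",
    display_name="about_us.html",
    content_length=20480,
)
print(page.get_domain())  # "example.com"

# Non-HTTP(S) schemes fail the url validator, so this raises a validation error:
# WebPageSource(url="ftp://example.com/readme", display_name="readme")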
@@ -95,30 +144,36 @@ class DocumentSection(BaseModel):
 class DocumentMetadata(BaseModel):
     """
-    Metadata associated with a document.
+    Source-neutral metadata for documents.
+
+    This metadata works for both file-based and web-based sources,
+    enabling a unified processing pipeline.

     Attributes:
-        file_name: Original filename of the document
-        file_type: Type/extension of the file (e.g., 'pdf', 'docx')
-        file_size_bytes: Size of the file in bytes
-        created_at: Timestamp when document was created
+        source_id: Path or URL identifying the source
+        source_type: Type of source (FILE or WEB)
+        display_name: Human-readable name (e.g., 'manual.pdf', 'about_us.html')
+        size_bytes: Size in bytes (file size or content length)
+        created_at: Timestamp when metadata was created
         author: Optional author information
-        page_count: Optional number of pages in document
-        custom_fields: Additional metadata fields
+        extra_metadata: Additional source-specific metadata
     """
-    file_name: str = Field(..., min_length=1, description="Original filename")
-    file_type: str = Field(..., min_length=1, description="File extension")
-    file_size_bytes: int = Field(..., ge=0, description="File size in bytes")
+    source_id: str = Field(..., min_length=1, description="Path or URL")
+    source_type: SourceType = Field(..., description="Source type enum")
+    display_name: str = Field(..., min_length=1, description="Display name")
+    size_bytes: int = Field(..., ge=0, description="Size in bytes")
     created_at: datetime = Field(default_factory=datetime.utcnow)
-    author: Optional[str] = Field(None, description="Document author")
-    page_count: Optional[int] = Field(None, ge=1, description="Number of pages")
-    custom_fields: Dict[str, str] = Field(default_factory=dict)
+    author: Optional[str] = Field(None, description="Author information")
+    extra_metadata: Dict[str, str] = Field(
+        default_factory=dict,
+        description="Additional metadata"
+    )

-    @field_validator('file_type')
+    @field_validator('display_name')
     @classmethod
-    def validate_file_type(cls, value: str) -> str:
-        """Ensure file type is lowercase and stripped."""
-        return value.lower().strip()
+    def normalize_display_name(cls, value: str) -> str:
+        """Normalize display name."""
+        return value.strip()

     def get_summary(self) -> str:
         """
@@ -128,28 +183,33 @@ class DocumentMetadata(BaseModel):
             Formatted string containing key metadata information
         """
         summary_parts = [
-            f"File: {self.file_name}",
-            f"Type: {self.file_type}",
-            f"Size: {self._format_file_size()}",
+            f"Source: {self.display_name}",
+            f"Type: {self.source_type.value}",
+            f"Size: {self._format_size()}",
         ]

         if self.author:
             summary_parts.append(f"Author: {self.author}")

-        if self.page_count:
-            summary_parts.append(f"Pages: {self.page_count}")
-
         return " | ".join(summary_parts)

-    def _format_file_size(self) -> str:
-        """Format file size in human-readable format."""
-        size = self.file_size_bytes
+    def _format_size(self) -> str:
+        """Format size in human-readable format."""
+        size = self.size_bytes
         for unit in ['B', 'KB', 'MB', 'GB']:
             if size < 1024.0:
                 return f"{size:.2f} {unit}"
             size /= 1024.0
         return f"{size:.2f} TB"
+
+    def is_file_source(self) -> bool:
+        """Check if this is a file-based source."""
+        return self.source_type == SourceType.FILE
+
+    def is_web_source(self) -> bool:
+        """Check if this is a web-based source."""
+        return self.source_type == SourceType.WEB
 class Document(BaseModel):
     """
@@ -281,6 +341,8 @@ class Chunk(BaseModel):
     """
     Represents a chunk of text extracted from a document.

+    Enhanced to track section membership for precision chunking.
+
     Attributes:
         id: Unique identifier for the chunk
         document_id: ID of the parent document
@@ -288,6 +350,8 @@ class Chunk(BaseModel):
         sequence_number: Order of this chunk in the document
         start_char: Starting character position in original document
         end_char: Ending character position in original document
+        section_title: Title of the section this chunk belongs to
+        section_index: Index of the section in document.sections
         metadata: Optional metadata specific to this chunk
     """
     id: UUID = Field(default_factory=uuid4, description="Unique chunk ID")
@@ -296,6 +360,8 @@ class Chunk(BaseModel):
     sequence_number: int = Field(..., ge=0, description="Chunk order in document")
     start_char: int = Field(..., ge=0, description="Start position in document")
     end_char: int = Field(..., gt=0, description="End position in document")
+    section_title: Optional[str] = Field(None, description="Section title")
+    section_index: Optional[int] = Field(None, ge=0, description="Section index")
     metadata: Dict[str, str] = Field(default_factory=dict)

     model_config = {
@@ -342,6 +408,21 @@ class Chunk(BaseModel):
         search_text = text if case_sensitive else text.lower()
         return search_text in content
+
+    def belongs_to_section(self) -> bool:
+        """Check if this chunk belongs to a specific section."""
+        return self.section_title is not None and self.section_index is not None
+
+    def get_section_context(self) -> str:
+        """
+        Get a string describing the section context.
+
+        Returns:
+            Section context description or 'No section'
+        """
+        if self.belongs_to_section():
+            return f"Section {self.section_index}: {self.section_title}"
+        return "No section"
 class ChunkingStrategy(BaseModel):
     """

View File

@@ -1,14 +1,13 @@
 """
 Outgoing Port - Text Chunker Interface.

-This defines the contract for chunking text into smaller pieces.
+This defines the contract for chunking documents into smaller pieces.
 Different strategies can be implemented as adapters.
 """
 from abc import ABC, abstractmethod
 from typing import List
-from uuid import UUID

-from ...domain.models import Chunk, ChunkingStrategy
+from ...domain.models import Chunk, ChunkingStrategy, Document


 class IChunker(ABC):
@@ -16,26 +15,26 @@ class IChunker(ABC):
     Interface for text chunking strategies.

     Implementations of this interface provide different strategies
-    for splitting text into manageable chunks.
+    for splitting documents into manageable chunks with section awareness.
     """

     @abstractmethod
     def chunk(
         self,
-        text: str,
-        document_id: UUID,
+        document: Document,
         strategy: ChunkingStrategy,
     ) -> List[Chunk]:
         """
-        Split text into chunks according to a strategy.
+        Split document into chunks according to a strategy.
+
+        Chunkers can utilize document.sections for section-aware chunking.

         Args:
-            text: Text content to chunk
-            document_id: ID of the parent document
+            document: Full Document entity with raw_markdown and sections
            strategy: Chunking strategy configuration

         Returns:
-            List of Chunk entities
+            List of Chunk entities with section metadata

         Raises:
             ChunkingError: If chunking fails
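To show how the reworked port is meant to be implemented, here is a minimal section-aware chunker sketch. It is not from this commit: the import paths, the DocumentSection field names (title, content) and the Chunk content field are assumptions, and a real adapter would also respect the ChunkingStrategy limits.

from typing import List

from core.domain.models import Chunk, ChunkingStrategy, Document  # assumed import paths
from core.ports.outgoing.chunker import IChunker


class SectionChunker(IChunker):
    """Sketch: one chunk per parsed section, tagged with its section membership."""

    def chunk(self, document: Document, strategy: ChunkingStrategy) -> List[Chunk]:
        chunks: List[Chunk] = []
        cursor = 0
        for index, section in enumerate(document.sections):
            text = section.content  # DocumentSection field name assumed
            start = document.raw_markdown.find(text, cursor)
            if start < 0:
                start = cursor
            chunks.append(
                Chunk(
                    document_id=document.id,
                    content=text,  # Chunk content field assumed
                    sequence_number=index,
                    start_char=start,
                    end_char=start + len(text),
                    section_title=section.title,  # DocumentSection field name assumed
                    section_index=index,
                )
            )
            cursor = start + len(text)
        return chunks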

View File

@@ -5,9 +5,8 @@ This defines the contract for managing chunking strategies.
 """
 from abc import ABC, abstractmethod
 from typing import List
-from uuid import UUID

-from ...domain.models import Chunk, ChunkingStrategy
+from ...domain.models import Chunk, ChunkingStrategy, Document
 from .chunker import IChunker
@@ -22,23 +21,21 @@ class IChunkingContext(ABC):
     @abstractmethod
     def execute_chunking(
         self,
-        text: str,
-        document_id: UUID,
+        document: Document,
         strategy: ChunkingStrategy,
     ) -> List[Chunk]:
         """
         Execute chunking using the specified strategy.

-        This method is stateless and thread-safe. It selects the appropriate
-        chunker based on the strategy configuration and executes chunking.
+        This method is stateless and thread-safe. It accepts the full
+        Document object (with sections) to enable section-aware chunking.

         Args:
-            text: Text to chunk
-            document_id: ID of parent document
+            document: Full Document entity with raw_markdown and sections
             strategy: Chunking strategy configuration (includes strategy_name)

         Returns:
-            List of chunks
+            List of chunks with section metadata

         Raises:
             ChunkingError: If strategy is not registered or chunking fails

View File

@@ -1,8 +1,8 @@
 """
 Outgoing Port - Text Extractor Interface.

-This defines the contract for extracting text from documents.
-Different adapters can implement this for various file types.
+This defines the contract for extracting content from documents.
+Different adapters can implement this for various file types and sources.
 """
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -16,7 +16,7 @@ class IExtractor(ABC):
     Interface for text extraction from documents.

     Implementations of this interface handle specific file formats
-    (PDF, DOCX, TXT, etc.) and adapt external libraries to the domain.
+    (PDF, DOCX, TXT, etc.) or web sources and return Document entities.
     """

     @abstractmethod
@@ -24,11 +24,14 @@ class IExtractor(ABC):
         """
         Extract text and metadata from a document file.

+        Extractors create Document entities with raw_markdown and metadata.
+        Sections are parsed later in the pipeline.
+
         Args:
             file_path: Path to the document file

         Returns:
-            Document entity with extracted content and metadata
+            Document entity with raw_markdown and metadata populated

         Raises:
             ExtractionError: If extraction fails

View File

@@ -59,21 +59,21 @@ class DocumentProcessorService(ITextProcessor):
         chunking_strategy: ChunkingStrategy,
     ) -> Document:
         """
-        Process a document by extracting, parsing, and storing it.
+        Process a document using the stateless pipeline.

-        New Pragmatic Pipeline:
-        1. Extract: Get raw Markdown from SourceFile using extractor
-        2. Parse: Use parse_markdown to create structured sections
-        3. Assemble: Create rich Document with raw_markdown + sections
-        4. Persist: Save to repository
-        5. Finalize: Mark as processed
+        Pipeline Order:
+        1. Extract Document with raw_markdown and metadata (via Adapter)
+        2. Parse Markdown into DocumentSection objects
+        3. Update Document with sections
+        4. Validate and persist Document
+        5. Mark as processed

         Args:
             file_path: Path to the document file
             chunking_strategy: Strategy configuration (for metadata)

         Returns:
-            Processed Document entity with structured sections
+            Fully processed Document entity

         Raises:
             ExtractionError: If text extraction fails
@@ -83,15 +83,14 @@ class DocumentProcessorService(ITextProcessor):
         try:
             logger.info(f"Processing document: {file_path}")

-            # Step 1: Extract raw Markdown from SourceFile
-            source_file = self._create_source_file(file_path)
-            document = self._extract_from_source(source_file)
+            # Step 1: Extract Document with raw_markdown and metadata
+            document = self._extract_document(file_path)

             # Step 2: Parse Markdown into structured sections
             sections = parse_markdown(document.raw_markdown)
             logger.debug(f"Parsed {len(sections)} sections from document")

-            # Step 3: Assemble rich Document model
+            # Step 3: Update Document with sections
             document = document.model_copy(update={"sections": sections})

             # Step 4: Validate document content
@@ -100,7 +99,7 @@ class DocumentProcessorService(ITextProcessor):
             # Step 5: Persist to repository
             saved_document = self._repository.save(document)

-            # Step 6: Finalize - mark as processed
+            # Step 6: Mark as processed
             saved_document.mark_as_processed()
             self._repository.save(saved_document)
@@ -128,17 +127,17 @@ class DocumentProcessorService(ITextProcessor):
         Extract text from document and split into chunks.

         Pipeline:
-        1. Extract raw Markdown from SourceFile
+        1. Extract Document with raw_markdown and metadata
         2. Parse into structured sections
-        3. Apply chunking strategy to raw content
-        4. Return chunks
+        3. Update Document with sections
+        4. Apply chunking strategy with section awareness

         Args:
             file_path: Path to the document file
             chunking_strategy: Strategy configuration for chunking

         Returns:
-            List of text chunks
+            List of chunks with section metadata

         Raises:
             ExtractionError: If text extraction fails
@@ -147,15 +146,16 @@ class DocumentProcessorService(ITextProcessor):
         try:
             logger.info(f"Extracting and chunking: {file_path}")

-            # Extract from source
-            source_file = self._create_source_file(file_path)
-            document = self._extract_from_source(source_file)
+            # Extract Document
+            document = self._extract_document(file_path)

             # Parse sections
             sections = parse_markdown(document.raw_markdown)
+
+            # Update Document with sections
             document = document.model_copy(update={"sections": sections})

-            # Chunk using strategy
+            # Chunk using strategy (section-aware)
             chunks = self._chunk_document(document, chunking_strategy)
             logger.info(f"Created {len(chunks)} chunks from document")
@@ -223,43 +223,24 @@ class DocumentProcessorService(ITextProcessor):
         return self._repository.delete(document_id)

-    def _create_source_file(self, file_path: Path) -> SourceFile:
+    def _extract_document(self, file_path: Path) -> Document:
         """
-        Create a SourceFile model from a file path.
+        Extract Document using appropriate extractor.
+
+        Extractors create Document entities with raw_markdown and metadata.
+        Sections will be parsed later in the pipeline.

         Args:
-            file_path: Path to the source file
+            file_path: Path to document file

         Returns:
-            SourceFile entity
-
-        Raises:
-            ValueError: If file doesn't exist or is invalid
-        """
-        if not file_path.exists():
-            raise ValueError(f"File does not exist: {file_path}")
-
-        return SourceFile(
-            path=file_path,
-            extension=file_path.suffix.lstrip('.'),
-            size_bytes=file_path.stat().st_size,
-        )
-
-    def _extract_from_source(self, source_file: SourceFile) -> Document:
-        """
-        Extract raw Markdown from SourceFile using appropriate extractor.
-
-        Args:
-            source_file: Source file to extract from
-
-        Returns:
-            Document entity with raw_markdown populated
+            Document entity with raw_markdown and metadata (sections empty)

         Raises:
             ExtractionError: If extraction fails
         """
-        extractor = self._extractor_factory.create_extractor(source_file.path)
-        return extractor.extract(source_file.path)
+        extractor = self._extractor_factory.create_extractor(file_path)
+        return extractor.extract(file_path)

     def _chunk_document(
         self,
@@ -267,20 +248,20 @@ class DocumentProcessorService(ITextProcessor):
         strategy: ChunkingStrategy,
     ) -> List[Chunk]:
         """
-        Chunk document using specified strategy.
+        Chunk document using specified strategy with section awareness.

         This method is thread-safe as it delegates to a stateless
-        chunking context that selects the strategy based on configuration.
+        chunking context. The full Document (with sections) is passed
+        to enable section-aware chunking.

         Args:
-            document: Document to chunk
+            document: Full Document entity with sections
             strategy: Chunking strategy configuration

         Returns:
-            List of chunks
+            List of chunks with section metadata
         """
         return self._chunking_context.execute_chunking(
-            text=document.content,
-            document_id=document.id,
+            document=document,
             strategy=strategy,
         )
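For callers that invoke the chunking context directly, the call-site migration introduced by this commit looks like the following sketch (variable names are illustrative):

# Before: raw text plus the parent document id
chunks = chunking_context.execute_chunking(
    text=document.content,
    document_id=document.id,
    strategy=strategy,
)

# After: the full Document, so chunkers can read document.sections
chunks = chunking_context.execute_chunking(
    document=document,
    strategy=strategy,
)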