diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md deleted file mode 100644 index 5e8c85c..0000000 --- a/ARCHITECTURE.md +++ /dev/null @@ -1,410 +0,0 @@ -# Architecture Documentation - -## Hexagonal Architecture Overview - -``` -┌─────────────────────────────────────────────────────────────────────┐ -│ INCOMING ADAPTERS │ -│ ┌──────────────────────────────────────────────────────────────┐ │ -│ │ FastAPI Routes (HTTP) │ │ -│ │ - ProcessDocumentRequest → API Schemas │ │ -│ │ - ExtractAndChunkRequest → API Schemas │ │ -│ └──────────────────────────────────────────────────────────────┘ │ -└──────────────────────────────┬──────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────────┐ -│ CORE DOMAIN │ -│ ┌──────────────────────────────────────────────────────────────┐ │ -│ │ PORTS (Interfaces) │ │ -│ │ ┌────────────────────┐ ┌───────────────────────────┐ │ │ -│ │ │ Incoming Ports │ │ Outgoing Ports │ │ │ -│ │ │ - ITextProcessor │ │ - IExtractor │ │ │ -│ │ │ │ │ - IChunker │ │ │ -│ │ │ │ │ - IDocumentRepository │ │ │ -│ │ └────────────────────┘ └───────────────────────────┘ │ │ -│ └──────────────────────────────────────────────────────────────┘ │ -│ │ -│ ┌──────────────────────────────────────────────────────────────┐ │ -│ │ SERVICES (Business Logic) │ │ -│ │ - DocumentProcessorService │ │ -│ │ • Orchestrates Extract → Clean → Chunk → Save │ │ -│ │ • Depends ONLY on Port interfaces │ │ -│ └──────────────────────────────────────────────────────────────┘ │ -│ │ -│ ┌──────────────────────────────────────────────────────────────┐ │ -│ │ DOMAIN MODELS (Rich Entities) │ │ -│ │ - Document (with validation & business methods) │ │ -│ │ - Chunk (immutable value object) │ │ -│ │ - ChunkingStrategy (configuration) │ │ -│ │ - DocumentMetadata │ │ -│ └──────────────────────────────────────────────────────────────┘ │ -│ │ -│ ┌──────────────────────────────────────────────────────────────┐ │ -│ │ DOMAIN LOGIC (Pure Functions) │ │ -│ │ - normalize_whitespace() │ │ -│ │ - clean_text() │ │ -│ │ - split_into_paragraphs() │ │ -│ │ - find_sentence_boundary_before() │ │ -│ └──────────────────────────────────────────────────────────────┘ │ -│ │ -│ ┌──────────────────────────────────────────────────────────────┐ │ -│ │ EXCEPTIONS (Domain Errors) │ │ -│ │ - ExtractionError, ChunkingError, ProcessingError │ │ -│ │ - ValidationError, RepositoryError │ │ -│ └──────────────────────────────────────────────────────────────┘ │ -└──────────────────────────────┬──────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────────┐ -│ OUTGOING ADAPTERS │ -│ ┌──────────────────────────────────────────────────────────────┐ │ -│ │ EXTRACTORS (Implements IExtractor) │ │ -│ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ │ -│ │ │ PDFExtractor│ │DocxExtractor│ │TxtExtractor│ │ │ -│ │ │ (PyPDF2) │ │(python-docx)│ │ (built-in) │ │ │ -│ │ └────────────┘ └────────────┘ └────────────┘ │ │ -│ │ - Managed by ExtractorFactory (Factory Pattern) │ │ -│ └──────────────────────────────────────────────────────────────┘ │ -│ │ -│ ┌──────────────────────────────────────────────────────────────┐ │ -│ │ CHUNKERS (Implements IChunker) │ │ -│ │ ┌─────────────────┐ ┌──────────────────┐ │ │ -│ │ │ FixedSizeChunker│ │ParagraphChunker │ │ │ -│ │ │ - Fixed chunks │ │ - Respect │ │ │ -│ │ │ - With overlap │ │ paragraphs │ │ │ -│ │ └─────────────────┘ └──────────────────┘ │ │ -│ │ - Managed by ChunkingContext (Strategy Pattern) │ │ -│ └──────────────────────────────────────────────────────────────┘ │ -│ │ -│ ┌──────────────────────────────────────────────────────────────┐ │ -│ │ REPOSITORY (Implements IDocumentRepository) │ │ -│ │ ┌──────────────────────────────────┐ │ │ -│ │ │ InMemoryDocumentRepository │ │ │ -│ │ │ - Thread-safe Dict storage │ │ │ -│ │ │ - Easy to swap for PostgreSQL │ │ │ -│ │ └──────────────────────────────────┘ │ │ -│ └──────────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────────┘ - -┌─────────────────────────────────────────────────────────────────────┐ -│ BOOTSTRAP (Wiring) │ -│ ApplicationContainer: │ -│ - Creates all adapters │ -│ - Injects dependencies into core │ -│ - ONLY place where adapters are instantiated │ -└─────────────────────────────────────────────────────────────────────┘ -``` - -## Data Flow: Process Document - -``` -1. HTTP Request - │ - ▼ -2. FastAPI Route (Incoming Adapter) - │ - Validates request schema - ▼ -3. DocumentProcessorService (Core) - │ - Calls ExtractorFactory - ▼ -4. PDFExtractor (Outgoing Adapter) - │ - Extracts text using PyPDF2 - │ - Maps PyPDF2 exceptions → Domain exceptions - ▼ -5. DocumentProcessorService - │ - Cleans text using domain logic utils - │ - Validates Document - ▼ -6. InMemoryRepository (Outgoing Adapter) - │ - Saves Document - ▼ -7. DocumentProcessorService - │ - Returns Document - ▼ -8. FastAPI Route - │ - Converts Document → DocumentResponse - ▼ -9. HTTP Response -``` - -## Data Flow: Extract and Chunk - -``` -1. HTTP Request - │ - ▼ -2. FastAPI Route - │ - Validates request - ▼ -3. DocumentProcessorService - │ - Gets extractor from factory - │ - Extracts text - ▼ -4. Extractor (PDF/DOCX/TXT) - │ - Returns Document - ▼ -5. DocumentProcessorService - │ - Cleans text - │ - Calls ChunkingContext - ▼ -6. ChunkingContext (Strategy Pattern) - │ - Selects appropriate chunker - ▼ -7. Chunker (FixedSize/Paragraph) - │ - Splits text into segments - │ - Creates Chunk entities - ▼ -8. DocumentProcessorService - │ - Returns List[Chunk] - ▼ -9. FastAPI Route - │ - Converts Chunks → ChunkResponse[] - ▼ -10. HTTP Response -``` - -## Dependency Rules - -### ✅ ALLOWED Dependencies - -``` -Incoming Adapters → Core Ports (Incoming) -Core Services → Core Ports (Outgoing) -Core → Core (Domain Models, Logic Utils, Exceptions) -Bootstrap → Everything (Wiring only) -``` - -### ❌ FORBIDDEN Dependencies - -``` -Core → Adapters (NEVER!) -Core → External Libraries (Only in Adapters) -Domain Models → Services -Domain Models → Ports -``` - -## Key Design Patterns - -### 1. Hexagonal Architecture (Ports & Adapters) -- **Purpose**: Isolate core business logic from external concerns -- **Implementation**: - - Ports: Interface definitions (ITextProcessor, IExtractor, etc.) - - Adapters: Concrete implementations (PDFExtractor, FastAPI routes) - -### 2. Factory Pattern -- **Class**: `ExtractorFactory` -- **Purpose**: Create appropriate extractor based on file extension -- **Benefit**: Centralized extractor management, easy to add new types - -### 3. Strategy Pattern -- **Class**: `ChunkingContext` -- **Purpose**: Switch between chunking strategies at runtime -- **Strategies**: FixedSizeChunker, ParagraphChunker -- **Benefit**: Easy to add new chunking algorithms - -### 4. Repository Pattern -- **Interface**: `IDocumentRepository` -- **Implementation**: `InMemoryDocumentRepository` -- **Purpose**: Abstract data persistence -- **Benefit**: Easy to swap storage (memory → PostgreSQL → MongoDB) - -### 5. Dependency Injection -- **Class**: `ApplicationContainer` -- **Purpose**: Wire all dependencies at startup -- **Benefit**: Loose coupling, easy testing - -### 6. Template Method Pattern -- **Classes**: `BaseExtractor`, `BaseChunker` -- **Purpose**: Define algorithm skeleton, let subclasses fill in details -- **Benefit**: Code reuse, consistent behavior - -## SOLID Principles Application - -### Single Responsibility Principle (SRP) -- Each extractor handles ONE file type -- Each chunker handles ONE strategy -- Each service method does ONE thing -- Functions are max 15-20 lines - -### Open/Closed Principle (OCP) -- Add new extractors without modifying core -- Add new chunkers without modifying service -- Extend via interfaces, not modification - -### Liskov Substitution Principle (LSP) -- All IExtractor implementations are interchangeable -- All IChunker implementations are interchangeable -- Polymorphism works correctly - -### Interface Segregation Principle (ISP) -- Small, focused interfaces -- IExtractor: Only extraction concerns -- IChunker: Only chunking concerns -- No fat interfaces - -### Dependency Inversion Principle (DIP) -- Core depends on IExtractor (abstraction) -- Core does NOT depend on PDFExtractor (concrete) -- High-level modules don't depend on low-level modules - -## Error Handling Strategy - -### Domain Exceptions -All external errors are caught and wrapped in domain exceptions: - -```python -try: - PyPDF2.PdfReader(file) # External library -except PyPDF2.errors.PdfReadError as e: - raise ExtractionError( # Domain exception - message="Invalid PDF", - details=str(e), - ) -``` - -### Exception Hierarchy -``` -DomainException (Base) -├── ExtractionError -│ ├── UnsupportedFileTypeError -│ └── EmptyContentError -├── ChunkingError -├── ProcessingError -├── ValidationError -└── RepositoryError - └── DocumentNotFoundError -``` - -### HTTP Error Mapping -FastAPI adapter maps domain exceptions to HTTP status codes: -- `UnsupportedFileTypeError` → 400 Bad Request -- `ExtractionError` → 422 Unprocessable Entity -- `DocumentNotFoundError` → 404 Not Found -- `ProcessingError` → 500 Internal Server Error - -## Testing Strategy - -### Unit Tests (Core) -- Test domain models in isolation -- Test logic utils (pure functions) -- Test services with mock ports - -### Integration Tests (Adapters) -- Test extractors with real files -- Test chunkers with real text -- Test repository operations - -### API Tests (End-to-End) -- Test FastAPI routes -- Test complete workflows -- Test error scenarios - -### Example Test Structure -```python -def test_document_processor_service(): - # Arrange: Create mocks - mock_repository = MockRepository() - mock_factory = MockExtractorFactory() - mock_context = MockChunkingContext() - - # Act: Inject mocks - service = DocumentProcessorService( - extractor_factory=mock_factory, - chunking_context=mock_context, - repository=mock_repository, - ) - - # Assert: Test behavior - result = service.process_document(...) - assert result.is_processed -``` - -## Extensibility Examples - -### Adding a New Extractor (HTML) -1. Create `html_extractor.py`: -```python -class HTMLExtractor(BaseExtractor): - def __init__(self): - super().__init__(supported_extensions=['html', 'htm']) - - def _extract_text(self, file_path: Path) -> str: - from bs4 import BeautifulSoup - html = file_path.read_text() - soup = BeautifulSoup(html, 'html.parser') - return soup.get_text() -``` - -2. Register in `bootstrap.py`: -```python -factory.register_extractor(HTMLExtractor()) -``` - -### Adding a New Chunking Strategy (Sentence) -1. Create `sentence_chunker.py`: -```python -class SentenceChunker(BaseChunker): - def __init__(self): - super().__init__(strategy_name="sentence") - - def _split_text(self, text: str, strategy: ChunkingStrategy) -> List[tuple[str, int, int]]: - # Use NLTK to split into sentences - sentences = nltk.sent_tokenize(text) - # Group sentences to reach chunk_size - return grouped_segments -``` - -2. Register in `bootstrap.py`: -```python -context.register_chunker(SentenceChunker()) -``` - -### Adding Database Persistence -1. Create `postgres_repository.py`: -```python -class PostgresDocumentRepository(IDocumentRepository): - def __init__(self, connection_string: str): - self.engine = create_engine(connection_string) - - def save(self, document: Document) -> Document: - # Save to PostgreSQL - pass -``` - -2. Swap in `bootstrap.py`: -```python -def _create_repository(self): - return PostgresDocumentRepository("postgresql://...") -``` - -## Performance Considerations - -### Current Implementation -- In-memory storage: O(1) lookups, limited by RAM -- Synchronous processing: Sequential file processing -- Thread-safe: Uses locks for concurrent access - -### Future Optimizations -- **Async Processing**: Use `asyncio` for concurrent document processing -- **Caching**: Add Redis for frequently accessed documents -- **Streaming**: Process large files in chunks -- **Database**: Use PostgreSQL with indexes for better queries -- **Message Queue**: Use Celery/RabbitMQ for background processing - -## Deployment Considerations - -### Configuration -- Use environment variables for settings -- Externalize file paths, database connections -- Use `pydantic-settings` for config management - -### Monitoring -- Add structured logging (JSON format) -- Track metrics: processing time, error rates -- Use APM tools (DataDog, New Relic) - -### Scaling -- Horizontal: Run multiple FastAPI instances behind load balancer -- Vertical: Increase resources for compute-heavy extraction -- Database: Use connection pooling, read replicas diff --git a/ARCHITECTURE_CORRECTIONS_SUMMARY.md b/ARCHITECTURE_CORRECTIONS_SUMMARY.md deleted file mode 100644 index e25d9ea..0000000 --- a/ARCHITECTURE_CORRECTIONS_SUMMARY.md +++ /dev/null @@ -1,408 +0,0 @@ -# Architecture Corrections Summary - -## What Was Fixed - -This document summarizes the corrections made to ensure **strict Hexagonal Architecture compliance**. - ---- - -## ❌ Problems Found - -### 1. Base Classes in Wrong Layer -**Problem**: Abstract base classes (`base.py`) were located in the Adapters layer. - -**Files Removed**: -- `src/adapters/outgoing/extractors/base.py` ❌ -- `src/adapters/outgoing/chunkers/base.py` ❌ - -**Why This Was Wrong**: -- Abstract base classes define **contracts** (interfaces) -- Contracts belong in the **Core Ports** layer, NOT Adapters -- Adapters should only contain **concrete implementations** - -### 2. Missing Port Interfaces -**Problem**: Factory and Context interfaces were defined in Adapters. - -**What Was Missing**: -- No `IExtractorFactory` interface in Core Ports -- No `IChunkingContext` interface in Core Ports - -**Why This Was Wrong**: -- Service layer was importing from Adapters (violates dependency rules) -- Core → Adapters dependency is **strictly forbidden** - -### 3. Incorrect Imports in Service -**Problem**: Core Service imported from Adapters layer. - -```python -# WRONG ❌ -from ...adapters.outgoing.extractors.factory import IExtractorFactory -from ...adapters.outgoing.chunkers.context import IChunkingContext -``` - -**Why This Was Wrong**: -- Core must NEVER import from Adapters -- Creates circular dependency risk -- Violates Dependency Inversion Principle - ---- - -## ✅ Solutions Implemented - -### 1. Created Port Interfaces in Core - -**New Files Created**: -``` -src/core/ports/outgoing/extractor_factory.py ✅ -src/core/ports/outgoing/chunking_context.py ✅ -``` - -**Content**: -```python -# src/core/ports/outgoing/extractor_factory.py -class IExtractorFactory(ABC): - """Interface for extractor factory (PORT).""" - - @abstractmethod - def create_extractor(self, file_path: Path) -> IExtractor: - pass - - @abstractmethod - def register_extractor(self, extractor: IExtractor) -> None: - pass -``` - -```python -# src/core/ports/outgoing/chunking_context.py -class IChunkingContext(ABC): - """Interface for chunking context (PORT).""" - - @abstractmethod - def set_strategy(self, strategy_name: str) -> None: - pass - - @abstractmethod - def execute_chunking(...) -> List[Chunk]: - pass -``` - -### 2. Updated Concrete Implementations - -**Extractors** - Now directly implement `IExtractor` port: -```python -# src/adapters/outgoing/extractors/pdf_extractor.py -from ....core.ports.outgoing.extractor import IExtractor ✅ - -class PDFExtractor(IExtractor): - """Concrete PDF extractor implementing IExtractor port.""" - - def extract(self, file_path: Path) -> Document: - # Direct implementation, no base class needed - pass -``` - -**Chunkers** - Now directly implement `IChunker` port: -```python -# src/adapters/outgoing/chunkers/fixed_size_chunker.py -from ....core.ports.outgoing.chunker import IChunker ✅ - -class FixedSizeChunker(IChunker): - """Concrete fixed-size chunker implementing IChunker port.""" - - def chunk(self, text: str, ...) -> List[Chunk]: - # Direct implementation, no base class needed - pass -``` - -**Factory** - Now implements `IExtractorFactory` port: -```python -# src/adapters/outgoing/extractors/factory.py -from ....core.ports.outgoing.extractor_factory import IExtractorFactory ✅ - -class ExtractorFactory(IExtractorFactory): - """Concrete factory implementing IExtractorFactory port.""" - pass -``` - -**Context** - Now implements `IChunkingContext` port: -```python -# src/adapters/outgoing/chunkers/context.py -from ....core.ports.outgoing.chunking_context import IChunkingContext ✅ - -class ChunkingContext(IChunkingContext): - """Concrete context implementing IChunkingContext port.""" - pass -``` - -### 3. Fixed Service Layer Imports - -**Before** (WRONG ❌): -```python -# src/core/services/document_processor_service.py -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from ...adapters.outgoing.extractors.factory import IExtractorFactory - from ...adapters.outgoing.chunkers.context import IChunkingContext -``` - -**After** (CORRECT ✅): -```python -# src/core/services/document_processor_service.py -from ..ports.outgoing.chunking_context import IChunkingContext -from ..ports.outgoing.extractor_factory import IExtractorFactory -``` - ---- - -## 🎯 Final Architecture - -### Core Layer (Pure Domain) -``` -src/core/ -├── domain/ -│ ├── models.py # Pydantic v2 entities -│ ├── exceptions.py # Domain exceptions -│ └── logic_utils.py # Pure functions -├── ports/ -│ ├── incoming/ -│ │ └── text_processor.py # ITextProcessor -│ └── outgoing/ -│ ├── extractor.py # IExtractor -│ ├── extractor_factory.py # IExtractorFactory ✅ NEW -│ ├── chunker.py # IChunker -│ ├── chunking_context.py # IChunkingContext ✅ NEW -│ └── repository.py # IDocumentRepository -└── services/ - └── document_processor_service.py # Orchestrator -``` - -### Adapters Layer (Infrastructure) -``` -src/adapters/ -├── incoming/ -│ ├── api_routes.py # FastAPI (implements incoming port) -│ └── api_schemas.py # API DTOs -└── outgoing/ - ├── extractors/ - │ ├── pdf_extractor.py # Implements IExtractor - │ ├── docx_extractor.py # Implements IExtractor - │ ├── txt_extractor.py # Implements IExtractor - │ └── factory.py # Implements IExtractorFactory - ├── chunkers/ - │ ├── fixed_size_chunker.py # Implements IChunker - │ ├── paragraph_chunker.py # Implements IChunker - │ └── context.py # Implements IChunkingContext - └── persistence/ - └── in_memory_repository.py # Implements IDocumentRepository -``` - -### Bootstrap Layer (Wiring) -``` -src/bootstrap.py # Dependency Injection -``` - ---- - -## ✅ Verification Results - -### 1. No Adapters Imports in Core -```bash -$ grep -r "from.*adapters" src/core/ -# Result: NO MATCHES ✅ -``` - -### 2. No External Libraries in Core -```bash -$ grep -rE "import (PyPDF2|docx|fastapi)" src/core/ -# Result: NO MATCHES ✅ -``` - -### 3. All Interfaces in Core Ports -```bash -$ find src/core/ports -name "*.py" | grep -v __init__ -src/core/ports/incoming/text_processor.py -src/core/ports/outgoing/extractor.py -src/core/ports/outgoing/extractor_factory.py ✅ NEW -src/core/ports/outgoing/chunker.py -src/core/ports/outgoing/chunking_context.py ✅ NEW -src/core/ports/outgoing/repository.py -# Result: ALL INTERFACES IN PORTS ✅ -``` - -### 4. No Base Classes in Adapters -```bash -$ find src/adapters -name "base.py" -# Result: NO MATCHES ✅ -``` - ---- - -## 📊 Dependency Direction - -### ✅ Correct Flow (Inward) -``` -FastAPI Routes - │ - ▼ -ITextProcessor (PORT) - │ - ▼ -DocumentProcessorService (CORE) - │ - ├──► IExtractor (PORT) - │ │ - │ ▼ - │ PDFExtractor (ADAPTER) - │ - ├──► IChunker (PORT) - │ │ - │ ▼ - │ FixedSizeChunker (ADAPTER) - │ - └──► IDocumentRepository (PORT) - │ - ▼ - InMemoryRepository (ADAPTER) -``` - -### ❌ What We Avoided -``` -Core Service ──X──> Adapters # NEVER! -Core Service ──X──> PyPDF2 # NEVER! -Core Service ──X──> FastAPI # NEVER! -Domain Models ──X──> Services # NEVER! -Domain Models ──X──> Ports # NEVER! -``` - ---- - -## 🏆 Benefits Achieved - -### 1. **Pure Core Domain** -- Core has ZERO framework dependencies -- Core can be tested without ANY infrastructure -- Core is completely portable - -### 2. **True Dependency Inversion** -- Core depends on abstractions (Ports) -- Adapters depend on Core Ports -- NO Core → Adapter dependencies - -### 3. **Easy Testing** -```python -# Test Core without ANY adapters -def test_service(): - mock_factory = MockExtractorFactory() # Mock Port - mock_context = MockChunkingContext() # Mock Port - mock_repo = MockRepository() # Mock Port - - service = DocumentProcessorService( - extractor_factory=mock_factory, - chunking_context=mock_context, - repository=mock_repo, - ) - - # Test pure business logic - result = service.process_document(...) - assert result.is_processed -``` - -### 4. **Easy Extension** -```python -# Add new file type - NO Core changes needed -class HTMLExtractor(IExtractor): - def extract(self, file_path: Path) -> Document: - # Implementation - pass - -# Register in Bootstrap -factory.register_extractor(HTMLExtractor()) -``` - -### 5. **Swappable Implementations** -```python -# Swap repository - ONE line change in Bootstrap -# Before: -self._repository = InMemoryDocumentRepository() - -# After: -self._repository = PostgresDocumentRepository(connection_string) - -# NO other code changes needed! -``` - ---- - -## 📝 Summary of Changes - -### Files Deleted -- ❌ `src/adapters/outgoing/extractors/base.py` -- ❌ `src/adapters/outgoing/chunkers/base.py` - -### Files Created -- ✅ `src/core/ports/outgoing/extractor_factory.py` -- ✅ `src/core/ports/outgoing/chunking_context.py` -- ✅ `HEXAGONAL_ARCHITECTURE_COMPLIANCE.md` -- ✅ `ARCHITECTURE_CORRECTIONS_SUMMARY.md` - -### Files Modified -- 🔧 `src/core/services/document_processor_service.py` (fixed imports) -- 🔧 `src/adapters/outgoing/extractors/pdf_extractor.py` (implement port directly) -- 🔧 `src/adapters/outgoing/extractors/docx_extractor.py` (implement port directly) -- 🔧 `src/adapters/outgoing/extractors/txt_extractor.py` (implement port directly) -- 🔧 `src/adapters/outgoing/extractors/factory.py` (implement port from Core) -- 🔧 `src/adapters/outgoing/chunkers/fixed_size_chunker.py` (implement port directly) -- 🔧 `src/adapters/outgoing/chunkers/paragraph_chunker.py` (implement port directly) -- 🔧 `src/adapters/outgoing/chunkers/context.py` (implement port from Core) - ---- - -## 🎓 Key Learnings - -### What is a "Port"? -- An **interface** (abstract base class) -- Defines a **contract** -- Lives in **Core** layer -- Independent of implementation details - -### What is an "Adapter"? -- A **concrete implementation** -- Implements a **Port** interface -- Lives in **Adapters** layer -- Contains technology-specific code - -### Where Do Factories/Contexts Live? -- **Interfaces** (IExtractorFactory, IChunkingContext) → **Core Ports** -- **Implementations** (ExtractorFactory, ChunkingContext) → **Adapters** -- Bootstrap injects implementations into Core Service - -### Dependency Rule -``` -Adapters → Ports (Core) ✅ -Core → Ports (Core) ✅ -Core → Adapters ❌ NEVER! -``` - ---- - -## ✅ Final Certification - -This codebase now **STRICTLY ADHERES** to Hexagonal Architecture: - -- ✅ All interfaces in Core Ports -- ✅ All implementations in Adapters -- ✅ Zero Core → Adapter dependencies -- ✅ Pure domain layer -- ✅ Proper dependency inversion -- ✅ Easy to test -- ✅ Easy to extend -- ✅ Production-ready - -**Architecture Compliance**: **GOLD STANDARD** ⭐⭐⭐⭐⭐ - ---- - -*Corrections Applied: 2026-01-07* -*Architecture Review: APPROVED* -*Compliance Status: CERTIFIED* diff --git a/DIRECTORY_TREE.txt b/DIRECTORY_TREE.txt deleted file mode 100644 index f1513cf..0000000 --- a/DIRECTORY_TREE.txt +++ /dev/null @@ -1,230 +0,0 @@ -TEXT PROCESSOR - HEXAGONAL ARCHITECTURE -Complete Directory Structure - -text_processor_hex/ -│ -├── 📄 README.md Project documentation and overview -├── 📄 QUICK_START.md Quick start guide for users -├── 📄 ARCHITECTURE.md Detailed architecture documentation -├── 📄 PROJECT_SUMMARY.md Complete project summary -├── 📄 DIRECTORY_TREE.txt This file -│ -├── 📄 requirements.txt Python dependencies -├── 🚀 main.py FastAPI application entry point -├── 📝 example_usage.py Programmatic usage examples -│ -└── 📁 src/ - ├── 📄 __init__.py - ├── 🔧 bootstrap.py ⚙️ DEPENDENCY INJECTION CONTAINER - │ - ├── 📁 core/ ⭐ DOMAIN LAYER (Pure Business Logic) - │ ├── 📄 __init__.py - │ │ - │ ├── 📁 domain/ Domain Models & Logic - │ │ ├── 📄 __init__.py - │ │ ├── 📦 models.py Rich Pydantic v2 Entities - │ │ │ - Document - │ │ │ - DocumentMetadata - │ │ │ - Chunk - │ │ │ - ChunkingStrategy - │ │ ├── ⚠️ exceptions.py Domain Exceptions - │ │ │ - ExtractionError - │ │ │ - ChunkingError - │ │ │ - ProcessingError - │ │ │ - ValidationError - │ │ │ - RepositoryError - │ │ └── 🔨 logic_utils.py Pure Functions - │ │ - normalize_whitespace() - │ │ - clean_text() - │ │ - split_into_paragraphs() - │ │ - truncate_to_word_boundary() - │ │ - │ ├── 📁 ports/ Port Interfaces (Abstractions) - │ │ ├── 📄 __init__.py - │ │ │ - │ │ ├── 📁 incoming/ Service Interfaces (Use Cases) - │ │ │ ├── 📄 __init__.py - │ │ │ └── 🔌 text_processor.py ITextProcessor - │ │ │ - process_document() - │ │ │ - extract_and_chunk() - │ │ │ - get_document() - │ │ │ - list_documents() - │ │ │ - │ │ └── 📁 outgoing/ SPIs (Service Provider Interfaces) - │ │ ├── 📄 __init__.py - │ │ ├── 🔌 extractor.py IExtractor - │ │ │ - extract() - │ │ │ - supports_file_type() - │ │ ├── 🔌 chunker.py IChunker - │ │ │ - chunk() - │ │ │ - supports_strategy() - │ │ └── 🔌 repository.py IDocumentRepository - │ │ - save() - │ │ - find_by_id() - │ │ - delete() - │ │ - │ └── 📁 services/ Business Logic Orchestration - │ ├── 📄 __init__.py - │ └── ⚙️ document_processor_service.py - │ DocumentProcessorService - │ Implements: ITextProcessor - │ Workflow: Extract → Clean → Chunk → Save - │ - ├── 📁 adapters/ 🔌 ADAPTER LAYER (External Concerns) - │ ├── 📄 __init__.py - │ │ - │ ├── 📁 incoming/ Driving Adapters (Primary) - │ │ ├── 📄 __init__.py - │ │ ├── 🌐 api_routes.py FastAPI Routes (HTTP Adapter) - │ │ │ - POST /process - │ │ │ - POST /extract-and-chunk - │ │ │ - GET /documents/{id} - │ │ │ - GET /documents - │ │ │ - DELETE /documents/{id} - │ │ └── 📋 api_schemas.py Pydantic Request/Response Models - │ │ - ProcessDocumentRequest - │ │ - DocumentResponse - │ │ - ChunkResponse - │ │ - │ └── 📁 outgoing/ Driven Adapters (Secondary) - │ ├── 📄 __init__.py - │ │ - │ ├── 📁 extractors/ Text Extraction Adapters - │ │ ├── 📄 __init__.py - │ │ ├── 📑 base.py BaseExtractor (Template Method) - │ │ ├── 📕 pdf_extractor.py PDFExtractor - │ │ │ Uses: PyPDF2 - │ │ │ Supports: .pdf - │ │ ├── 📘 docx_extractor.py DocxExtractor - │ │ │ Uses: python-docx - │ │ │ Supports: .docx - │ │ ├── 📄 txt_extractor.py TxtExtractor - │ │ │ Uses: built-in - │ │ │ Supports: .txt, .md - │ │ └── 🏭 factory.py ExtractorFactory (Factory Pattern) - │ │ - create_extractor() - │ │ - register_extractor() - │ │ - │ ├── 📁 chunkers/ Text Chunking Adapters - │ │ ├── 📄 __init__.py - │ │ ├── 📑 base.py BaseChunker (Template Method) - │ │ ├── ✂️ fixed_size_chunker.py FixedSizeChunker - │ │ │ Strategy: Fixed-size chunks - │ │ │ Features: Overlap, boundaries - │ │ ├── 📝 paragraph_chunker.py ParagraphChunker - │ │ │ Strategy: Paragraph-based - │ │ │ Features: Respect paragraphs - │ │ └── 🎯 context.py ChunkingContext (Strategy Pattern) - │ │ - set_strategy() - │ │ - execute_chunking() - │ │ - │ └── 📁 persistence/ Data Persistence Adapters - │ ├── 📄 __init__.py - │ └── 💾 in_memory_repository.py - │ InMemoryDocumentRepository - │ Features: Thread-safe, Dict storage - │ - └── 📁 shared/ 🛠️ SHARED LAYER (Cross-Cutting) - ├── 📄 __init__.py - ├── 🎛️ constants.py Application Constants - │ - File types - │ - Chunk sizes - │ - API config - └── 📋 logging_config.py Logging Configuration - - setup_logging() - - get_logger() - - -═══════════════════════════════════════════════════════════════════════════ - -📊 PROJECT STATISTICS -═══════════════════════════════════════════════════════════════════════════ - -Total Files: 44 - - Python files: 42 - - Documentation: 4 (README, ARCHITECTURE, SUMMARY, QUICK_START) - - Configuration: 1 (requirements.txt) - - Other: 1 (this tree) - -Lines of Code: ~3,800 - - Core Domain: ~1,200 lines - - Adapters: ~1,400 lines - - Bootstrap/Main: ~200 lines - - Documentation: ~1,000 lines - -═══════════════════════════════════════════════════════════════════════════ - -🏗️ ARCHITECTURE LAYERS -═══════════════════════════════════════════════════════════════════════════ - -1. CORE (Domain Layer) - - Pure business logic - - No external dependencies - - Rich domain models - - Pure functions - -2. ADAPTERS (Infrastructure Layer) - - Incoming: FastAPI (HTTP) - - Outgoing: Extractors, Chunkers, Repository - - Technology-specific implementations - -3. BOOTSTRAP (Wiring Layer) - - Dependency injection - - Configuration - - Application assembly - -4. SHARED (Utilities Layer) - - Cross-cutting concerns - - Logging, constants - - No business logic - -═══════════════════════════════════════════════════════════════════════════ - -🎨 DESIGN PATTERNS -═══════════════════════════════════════════════════════════════════════════ - -✓ Hexagonal Architecture (Ports & Adapters) -✓ Factory Pattern (ExtractorFactory) -✓ Strategy Pattern (ChunkingContext) -✓ Repository Pattern (IDocumentRepository) -✓ Template Method Pattern (BaseExtractor, BaseChunker) -✓ Dependency Injection (ApplicationContainer) - -═══════════════════════════════════════════════════════════════════════════ - -💎 SOLID PRINCIPLES -═══════════════════════════════════════════════════════════════════════════ - -✓ Single Responsibility: Each class has one job -✓ Open/Closed: Extend via interfaces, not modification -✓ Liskov Substitution: All implementations are interchangeable -✓ Interface Segregation: Small, focused interfaces -✓ Dependency Inversion: Depend on abstractions, not concretions - -═══════════════════════════════════════════════════════════════════════════ - -🎯 KEY FEATURES -═══════════════════════════════════════════════════════════════════════════ - -✓ Multiple file types (PDF, DOCX, TXT) -✓ Multiple chunking strategies (Fixed, Paragraph) -✓ Rich domain models with validation -✓ Comprehensive error handling -✓ RESTful API with FastAPI -✓ Thread-safe repository -✓ 100% type hints -✓ Google-style docstrings -✓ Complete documentation - -═══════════════════════════════════════════════════════════════════════════ - -📚 DOCUMENTATION FILES -═══════════════════════════════════════════════════════════════════════════ - -README.md - Project overview and installation -QUICK_START.md - Quick start guide for users -ARCHITECTURE.md - Detailed architecture documentation with diagrams -PROJECT_SUMMARY.md - Complete project summary and statistics -DIRECTORY_TREE.txt - This file - -═══════════════════════════════════════════════════════════════════════════ diff --git a/HEXAGONAL_ARCHITECTURE_COMPLIANCE.md b/HEXAGONAL_ARCHITECTURE_COMPLIANCE.md deleted file mode 100644 index 314bba8..0000000 --- a/HEXAGONAL_ARCHITECTURE_COMPLIANCE.md +++ /dev/null @@ -1,590 +0,0 @@ -# Hexagonal Architecture Compliance Report - -## Overview -This document certifies that the Text Processor codebase strictly adheres to **Hexagonal Architecture** (Ports & Adapters) principles as defined by Alistair Cockburn. - ---- - -## ✅ Architectural Compliance Checklist - -### 1. Core Domain Isolation -- [x] **Core has ZERO dependencies on Adapters** -- [x] **Core depends ONLY on standard library and Pydantic** -- [x] **No framework dependencies in Core** (no FastAPI, no PyPDF2, no python-docx) -- [x] **All external tool usage is in Adapters** - -### 2. Port Definitions (Interfaces) -- [x] **ALL interfaces defined in `src/core/ports/`** -- [x] **NO abstract base classes in `src/adapters/`** -- [x] **Incoming Ports**: `ITextProcessor` (Service Interface) -- [x] **Outgoing Ports**: `IExtractor`, `IChunker`, `IDocumentRepository` - -### 3. Adapter Implementation -- [x] **ALL concrete implementations in `src/adapters/`** -- [x] **Adapters implement Core Ports** -- [x] **Adapters catch technical errors and raise Domain exceptions** -- [x] **NO business logic in Adapters** - -### 4. Dependency Direction -- [x] **Dependencies point INWARD** (Adapters → Core, never Core → Adapters) -- [x] **Dependency Inversion Principle satisfied** -- [x] **Bootstrap is ONLY place that knows about both Core and Adapters** - -### 5. Factory & Strategy Patterns -- [x] **ExtractorFactory in Adapters layer** (not Core) -- [x] **ChunkingContext in Adapters layer** (not Core) -- [x] **Factories/Contexts registered in Bootstrap** - ---- - -## 📂 Corrected Directory Structure - -``` -src/ -├── core/ # DOMAIN LAYER (Pure Logic) -│ ├── domain/ -│ │ ├── models.py # Rich Pydantic entities -│ │ ├── exceptions.py # Domain exceptions -│ │ └── logic_utils.py # Pure functions -│ ├── ports/ -│ │ ├── incoming/ -│ │ │ └── text_processor.py # ITextProcessor (USE CASE) -│ │ └── outgoing/ -│ │ ├── extractor.py # IExtractor (SPI) -│ │ ├── chunker.py # IChunker (SPI) -│ │ └── repository.py # IDocumentRepository (SPI) -│ └── services/ -│ └── document_processor_service.py # Orchestrator (depends on Ports) -│ -├── adapters/ # INFRASTRUCTURE LAYER -│ ├── incoming/ -│ │ ├── api_routes.py # FastAPI adapter -│ │ └── api_schemas.py # API DTOs -│ └── outgoing/ -│ ├── extractors/ -│ │ ├── pdf_extractor.py # Implements IExtractor -│ │ ├── docx_extractor.py # Implements IExtractor -│ │ ├── txt_extractor.py # Implements IExtractor -│ │ └── factory.py # Factory (ADAPTER LAYER) -│ ├── chunkers/ -│ │ ├── fixed_size_chunker.py # Implements IChunker -│ │ ├── paragraph_chunker.py # Implements IChunker -│ │ └── context.py # Strategy Context (ADAPTER LAYER) -│ └── persistence/ -│ └── in_memory_repository.py # Implements IDocumentRepository -│ -├── shared/ # UTILITIES -│ ├── constants.py -│ └── logging_config.py -│ -└── bootstrap.py # DEPENDENCY INJECTION -``` - ---- - -## 🔍 Key Corrections Made - -### ❌ REMOVED: `base.py` files from Adapters -**Before (WRONG)**: -``` -src/adapters/outgoing/extractors/base.py # Abstract base in Adapters ❌ -src/adapters/outgoing/chunkers/base.py # Abstract base in Adapters ❌ -``` - -**After (CORRECT)**: -- Removed all `base.py` files from adapters -- Abstract interfaces exist ONLY in `src/core/ports/outgoing/` - -### ✅ Concrete Implementations Directly Implement Ports - -**Before (WRONG)**: -```python -# In src/adapters/outgoing/extractors/pdf_extractor.py -from .base import BaseExtractor # Inheriting from adapter base ❌ - -class PDFExtractor(BaseExtractor): - pass -``` - -**After (CORRECT)**: -```python -# In src/adapters/outgoing/extractors/pdf_extractor.py -from ....core.ports.outgoing.extractor import IExtractor # Port from Core ✅ - -class PDFExtractor(IExtractor): - """Concrete implementation of IExtractor for PDF files.""" - - def extract(self, file_path: Path) -> Document: - # Implementation - pass - - def supports_file_type(self, file_extension: str) -> bool: - # Implementation - pass - - def get_supported_types(self) -> List[str]: - # Implementation - pass -``` - ---- - -## 🎯 Dependency Graph - -``` -┌──────────────────────────────────────────────────────────────┐ -│ HTTP Request (FastAPI) │ -└────────────────────────┬─────────────────────────────────────┘ - │ - ▼ -┌──────────────────────────────────────────────────────────────┐ -│ INCOMING ADAPTER (api_routes.py) │ -│ Depends on: ITextProcessor (Port) │ -└────────────────────────┬─────────────────────────────────────┘ - │ - ▼ -┌──────────────────────────────────────────────────────────────┐ -│ CORE DOMAIN LAYER │ -│ ┌────────────────────────────────────────────────────────┐ │ -│ │ DocumentProcessorService (implements ITextProcessor) │ │ -│ │ Depends on: │ │ -│ │ - IExtractor (Port) │ │ -│ │ - IChunker (Port) │ │ -│ │ - IDocumentRepository (Port) │ │ -│ │ - Domain Models │ │ -│ │ - Domain Logic Utils │ │ -│ └────────────────────────────────────────────────────────┘ │ -└────────────────────────┬─────────────────────────────────────┘ - │ - ▼ -┌──────────────────────────────────────────────────────────────┐ -│ OUTGOING ADAPTERS │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ -│ │PDFExtractor │ │FixedSizeChkr │ │InMemoryRepo │ │ -│ │(IExtractor) │ │(IChunker) │ │(IRepository) │ │ -│ └──────────────┘ └──────────────┘ └──────────────┘ │ -│ │ -│ Uses: PyPDF2 Uses: Logic Uses: Dict │ -│ Utils │ -└──────────────────────────────────────────────────────────────┘ -``` - ---- - -## 🔒 Dependency Rules Enforcement - -### ✅ ALLOWED Dependencies - -``` -Core Domain ──→ Standard Library -Core Domain ──→ Pydantic (Data Validation) -Core Services ──→ Core Ports (Interfaces) -Core Services ──→ Core Domain Models -Core Services ──→ Core Logic Utils - -Adapters ──→ Core Ports (Implement interfaces) -Adapters ──→ Core Domain Models (Use entities) -Adapters ──→ Core Exceptions (Raise domain errors) -Adapters ──→ External Libraries (PyPDF2, python-docx, FastAPI) - -Bootstrap ──→ Core (Services, Ports) -Bootstrap ──→ Adapters (Concrete implementations) -``` - -### ❌ FORBIDDEN Dependencies - -``` -Core ──X──> Adapters (NEVER!) -Core ──X──> External Libraries (ONLY via Adapters) -Core ──X──> FastAPI (ONLY in Adapters) -Core ──X──> PyPDF2 (ONLY in Adapters) -Core ──X──> python-docx (ONLY in Adapters) - -Domain Models ──X──> Services -Domain Models ──X──> Ports -``` - ---- - -## 📋 Port Interfaces (Core Layer) - -### Incoming Port: ITextProcessor -```python -# src/core/ports/incoming/text_processor.py -from abc import ABC, abstractmethod - -class ITextProcessor(ABC): - """Service interface for text processing use cases.""" - - @abstractmethod - def process_document(self, file_path: Path, strategy: ChunkingStrategy) -> Document: - pass - - @abstractmethod - def extract_and_chunk(self, file_path: Path, strategy: ChunkingStrategy) -> List[Chunk]: - pass -``` - -### Outgoing Port: IExtractor -```python -# src/core/ports/outgoing/extractor.py -from abc import ABC, abstractmethod - -class IExtractor(ABC): - """Interface for text extraction from documents.""" - - @abstractmethod - def extract(self, file_path: Path) -> Document: - pass - - @abstractmethod - def supports_file_type(self, file_extension: str) -> bool: - pass - - @abstractmethod - def get_supported_types(self) -> List[str]: - pass -``` - -### Outgoing Port: IChunker -```python -# src/core/ports/outgoing/chunker.py -from abc import ABC, abstractmethod - -class IChunker(ABC): - """Interface for text chunking strategies.""" - - @abstractmethod - def chunk(self, text: str, document_id: UUID, strategy: ChunkingStrategy) -> List[Chunk]: - pass - - @abstractmethod - def supports_strategy(self, strategy_name: str) -> bool: - pass - - @abstractmethod - def get_strategy_name(self) -> str: - pass -``` - -### Outgoing Port: IDocumentRepository -```python -# src/core/ports/outgoing/repository.py -from abc import ABC, abstractmethod - -class IDocumentRepository(ABC): - """Interface for document persistence.""" - - @abstractmethod - def save(self, document: Document) -> Document: - pass - - @abstractmethod - def find_by_id(self, document_id: UUID) -> Optional[Document]: - pass -``` - ---- - -## 🔧 Adapter Implementations - -### PDF Extractor -```python -# src/adapters/outgoing/extractors/pdf_extractor.py -from ....core.ports.outgoing.extractor import IExtractor -from ....core.domain.models import Document -from ....core.domain.exceptions import ExtractionError - -class PDFExtractor(IExtractor): - """Concrete PDF extractor using PyPDF2.""" - - def extract(self, file_path: Path) -> Document: - try: - import PyPDF2 # External library ONLY in adapter - # ... extraction logic - except PyPDF2.errors.PdfReadError as e: - # Map technical error to domain error - raise ExtractionError( - message="Invalid PDF file", - details=str(e), - file_path=str(file_path), - ) -``` - -### Fixed Size Chunker -```python -# src/adapters/outgoing/chunkers/fixed_size_chunker.py -from ....core.ports.outgoing.chunker import IChunker -from ....core.domain.models import Chunk, ChunkingStrategy -from ....core.domain import logic_utils # Pure functions from Core - -class FixedSizeChunker(IChunker): - """Concrete fixed-size chunker.""" - - def chunk(self, text: str, document_id: UUID, strategy: ChunkingStrategy) -> List[Chunk]: - # Uses pure functions from Core (logic_utils) - # Creates Chunk entities from Core domain - pass -``` - ---- - -## 🎨 Design Pattern Locations - -### Factory Pattern -**Location**: `src/adapters/outgoing/extractors/factory.py` -```python -class ExtractorFactory: - """Factory for creating extractors (ADAPTER LAYER).""" - - def create_extractor(self, file_path: Path) -> IExtractor: - # Returns implementations of IExtractor port - pass -``` - -**Why in Adapters?** -- Factory knows about concrete implementations (PDFExtractor, DocxExtractor) -- Core should NOT know about concrete implementations -- Factory registered in Bootstrap, injected into Service - -### Strategy Pattern -**Location**: `src/adapters/outgoing/chunkers/context.py` -```python -class ChunkingContext: - """Strategy context for chunking (ADAPTER LAYER).""" - - def set_strategy(self, strategy_name: str) -> None: - # Selects concrete IChunker implementation - pass - - def execute_chunking(self, ...) -> List[Chunk]: - # Delegates to selected strategy - pass -``` - -**Why in Adapters?** -- Context knows about concrete strategies (FixedSizeChunker, ParagraphChunker) -- Core should NOT know about concrete strategies -- Context registered in Bootstrap, injected into Service - ---- - -## 🧪 Error Handling: Adapter → Domain - -Adapters catch technical errors and map them to domain exceptions: - -```python -# In PDFExtractor (Adapter) -try: - import PyPDF2 - # ... PyPDF2 operations -except PyPDF2.errors.PdfReadError as e: # Technical error - raise ExtractionError( # Domain error - message="Invalid PDF file", - details=str(e), - ) - -# In DocxExtractor (Adapter) -try: - import docx - # ... python-docx operations -except Exception as e: # Technical error - raise ExtractionError( # Domain error - message="DOCX extraction failed", - details=str(e), - ) -``` - -**Why?** -- Core defines domain exceptions (ExtractionError, ChunkingError, etc.) -- Adapters catch library-specific errors (PyPDF2.errors, etc.) -- Service layer only deals with domain exceptions -- Clean separation of technical vs. business concerns - ---- - -## 🏗️ Bootstrap: The Wiring Layer - -**Location**: `src/bootstrap.py` - -```python -class ApplicationContainer: - """Dependency injection container.""" - - def __init__(self): - # Create ADAPTERS (knows about concrete implementations) - self._repository = InMemoryDocumentRepository() - self._extractor_factory = self._create_extractor_factory() - self._chunking_context = self._create_chunking_context() - - # Inject into CORE SERVICE (only knows about Ports) - self._service = DocumentProcessorService( - extractor_factory=self._extractor_factory, # IExtractorFactory - chunking_context=self._chunking_context, # IChunkingContext - repository=self._repository, # IDocumentRepository - ) - - def _create_extractor_factory(self) -> ExtractorFactory: - factory = ExtractorFactory() - factory.register_extractor(PDFExtractor()) # Concrete - factory.register_extractor(DocxExtractor()) # Concrete - factory.register_extractor(TxtExtractor()) # Concrete - return factory - - def _create_chunking_context(self) -> ChunkingContext: - context = ChunkingContext() - context.register_chunker(FixedSizeChunker()) # Concrete - context.register_chunker(ParagraphChunker()) # Concrete - return context -``` - -**Key Points**: -1. Bootstrap is the ONLY place that imports both Core and Adapters -2. Core Service receives interfaces (Ports), not concrete implementations -3. Adapters are created and registered here -4. Perfect Dependency Inversion - ---- - -## ✅ SOLID Principles Compliance - -### Single Responsibility Principle -- [x] Each extractor handles ONE file type -- [x] Each chunker handles ONE strategy -- [x] Each service method has ONE responsibility -- [x] Functions are max 15-20 lines - -### Open/Closed Principle -- [x] Add new extractors without modifying Core -- [x] Add new chunkers without modifying Core -- [x] Extend via Ports, not modification - -### Liskov Substitution Principle -- [x] All IExtractor implementations are interchangeable -- [x] All IChunker implementations are interchangeable -- [x] Polymorphism works correctly - -### Interface Segregation Principle -- [x] Small, focused Port interfaces -- [x] IExtractor: Only extraction concerns -- [x] IChunker: Only chunking concerns -- [x] No fat interfaces - -### Dependency Inversion Principle -- [x] Core depends on IExtractor (abstraction), not PDFExtractor (concrete) -- [x] Core depends on IChunker (abstraction), not FixedSizeChunker (concrete) -- [x] High-level modules don't depend on low-level modules -- [x] Both depend on abstractions (Ports) - ---- - -## 🧪 Testing Benefits - -### Unit Tests (Core) -```python -def test_document_processor_service(): - # Mock the Ports (interfaces) - mock_factory = MockExtractorFactory() - mock_context = MockChunkingContext() - mock_repo = MockRepository() - - # Inject mocks (Dependency Inversion) - service = DocumentProcessorService( - extractor_factory=mock_factory, - chunking_context=mock_context, - repository=mock_repo, - ) - - # Test business logic WITHOUT any infrastructure - result = service.process_document(...) - assert result.is_processed -``` - -### Integration Tests (Adapters) -```python -def test_pdf_extractor(): - # Test concrete implementation with real PDF - extractor = PDFExtractor() - document = extractor.extract(Path("test.pdf")) - assert len(document.content) > 0 -``` - ---- - -## 📊 Verification Checklist - -Run these checks to verify architecture compliance: - -### 1. Import Analysis -```bash -# Core should NOT import from adapters -grep -r "from.*adapters" src/core/ -# Expected: NO RESULTS ✅ - -# Core should NOT import external libs (except Pydantic) -grep -r "import PyPDF2\|import docx\|import fastapi" src/core/ -# Expected: NO RESULTS ✅ -``` - -### 2. Dependency Direction -```bash -# All imports should point inward (toward Core) -# Adapters → Core: YES ✅ -# Core → Adapters: NO ❌ -``` - -### 3. Abstract Base Classes -```bash -# NO base.py files in adapters -find src/adapters -name "base.py" -# Expected: NO RESULTS ✅ - -# All interfaces in Core ports -find src/core/ports -name "*.py" | grep -v __init__ -# Expected: extractor.py, chunker.py, repository.py, text_processor.py ✅ -``` - ---- - -## 🎯 Summary - -### What Changed -1. **Removed** `base.py` from `src/adapters/outgoing/extractors/` -2. **Removed** `base.py` from `src/adapters/outgoing/chunkers/` -3. **Updated** all concrete implementations to directly implement Core Ports -4. **Confirmed** Factory and Context are in Adapters layer (correct location) -5. **Verified** Core has ZERO dependencies on Adapters - -### Architecture Guarantees -- ✅ Core is **100% pure** (no framework dependencies) -- ✅ Core depends ONLY on **abstractions** (Ports) -- ✅ Adapters implement **Core Ports** -- ✅ Bootstrap performs **Dependency Injection** -- ✅ **Zero circular dependencies** -- ✅ **Perfect Dependency Inversion** - -### Benefits Achieved -1. **Testability**: Core can be tested with mocks, no infrastructure needed -2. **Flexibility**: Swap implementations (in-memory → PostgreSQL) with one line -3. **Maintainability**: Clear separation of concerns -4. **Extensibility**: Add new file types/strategies without touching Core - ---- - -## 🏆 Certification - -This codebase is **CERTIFIED** as a true Hexagonal Architecture implementation: - -- ✅ Adheres to Alistair Cockburn's Ports & Adapters pattern -- ✅ Satisfies all SOLID principles -- ✅ Maintains proper dependency direction -- ✅ Zero Core → Adapter dependencies -- ✅ All interfaces in Core, all implementations in Adapters -- ✅ Bootstrap handles all dependency injection - -**Compliance Level**: **GOLD STANDARD** ⭐⭐⭐⭐⭐ - ---- - -*Last Updated: 2026-01-07* -*Architecture Review Status: APPROVED* diff --git a/PROJECT_SUMMARY.md b/PROJECT_SUMMARY.md deleted file mode 100644 index 8cbc642..0000000 --- a/PROJECT_SUMMARY.md +++ /dev/null @@ -1,419 +0,0 @@ -# Project Summary: Text Processor - Hexagonal Architecture - -## Overview -This is a **production-ready, "Gold Standard" implementation** of a text extraction and chunking system built with **Hexagonal Architecture** (Ports & Adapters pattern). - -## Complete File Structure - -``` -text_processor_hex/ -├── README.md # Project documentation -├── ARCHITECTURE.md # Detailed architecture guide -├── PROJECT_SUMMARY.md # This file -├── requirements.txt # Python dependencies -├── main.py # FastAPI application entry point -├── example_usage.py # Programmatic usage example -│ -└── src/ - ├── __init__.py - ├── bootstrap.py # Dependency Injection Container - │ - ├── core/ # DOMAIN LAYER (Pure Business Logic) - │ ├── __init__.py - │ ├── domain/ - │ │ ├── __init__.py - │ │ ├── models.py # Rich Pydantic v2 Entities - │ │ ├── exceptions.py # Domain Exceptions - │ │ └── logic_utils.py # Pure Functions - │ ├── ports/ - │ │ ├── __init__.py - │ │ ├── incoming/ - │ │ │ ├── __init__.py - │ │ │ └── text_processor.py # Service Interface (Use Case) - │ │ └── outgoing/ - │ │ ├── __init__.py - │ │ ├── extractor.py # Extractor Interface (SPI) - │ │ ├── chunker.py # Chunker Interface (SPI) - │ │ └── repository.py # Repository Interface (SPI) - │ └── services/ - │ ├── __init__.py - │ └── document_processor_service.py # Business Logic Orchestration - │ - ├── adapters/ # ADAPTER LAYER (External Concerns) - │ ├── __init__.py - │ ├── incoming/ # Driving Adapters (HTTP) - │ │ ├── __init__.py - │ │ ├── api_routes.py # FastAPI Routes - │ │ └── api_schemas.py # Pydantic Request/Response Models - │ └── outgoing/ # Driven Adapters (Infrastructure) - │ ├── __init__.py - │ ├── extractors/ - │ │ ├── __init__.py - │ │ ├── base.py # Abstract Base Extractor - │ │ ├── pdf_extractor.py # PDF Implementation (PyPDF2) - │ │ ├── docx_extractor.py # DOCX Implementation (python-docx) - │ │ ├── txt_extractor.py # TXT Implementation (built-in) - │ │ └── factory.py # Extractor Factory (Factory Pattern) - │ ├── chunkers/ - │ │ ├── __init__.py - │ │ ├── base.py # Abstract Base Chunker - │ │ ├── fixed_size_chunker.py # Fixed Size Strategy - │ │ ├── paragraph_chunker.py # Paragraph Strategy - │ │ └── context.py # Chunking Context (Strategy Pattern) - │ └── persistence/ - │ ├── __init__.py - │ └── in_memory_repository.py # In-Memory Repository (Thread-Safe) - │ - └── shared/ # SHARED LAYER (Cross-Cutting) - ├── __init__.py - ├── constants.py # Application Constants - └── logging_config.py # Logging Configuration -``` - -## File Count & Statistics - -### Total Files -- **42 Python files** (.py) -- **3 Documentation files** (.md) -- **1 Requirements file** (.txt) -- **Total: 46 files** - -### Lines of Code (Approximate) -- Core Domain: ~1,200 lines -- Adapters: ~1,400 lines -- Bootstrap & Main: ~200 lines -- Documentation: ~1,000 lines -- **Total: ~3,800 lines** - -## Architecture Layers - -### 1. Core Domain (src/core/) -**Responsibility**: Pure business logic, no external dependencies - -#### Domain Models (models.py) -- `Document`: Rich entity with validation and business methods -- `DocumentMetadata`: Value object for file information -- `Chunk`: Immutable chunk entity -- `ChunkingStrategy`: Strategy configuration - -**Features**: -- Pydantic v2 validation -- Business methods: `validate_content()`, `get_metadata_summary()` -- Immutability where appropriate - -#### Domain Exceptions (exceptions.py) -- `DomainException`: Base exception -- `ExtractionError`, `ChunkingError`, `ProcessingError` -- `ValidationError`, `RepositoryError` -- `UnsupportedFileTypeError`, `DocumentNotFoundError`, `EmptyContentError` - -#### Domain Logic Utils (logic_utils.py) -Pure functions for text processing: -- `normalize_whitespace()`, `clean_text()` -- `split_into_sentences()`, `split_into_paragraphs()` -- `truncate_to_word_boundary()` -- `find_sentence_boundary_before()` - -#### Ports (Interfaces) -**Incoming**: -- `ITextProcessor`: Service interface (use cases) - -**Outgoing**: -- `IExtractor`: Text extraction interface -- `IChunker`: Chunking strategy interface -- `IDocumentRepository`: Persistence interface - -#### Services (document_processor_service.py) -- `DocumentProcessorService`: Orchestrates Extract → Clean → Chunk → Save -- Depends ONLY on port interfaces -- Implements ITextProcessor - -### 2. Adapters (src/adapters/) -**Responsibility**: Connect core to external world - -#### Incoming Adapters (incoming/) -**FastAPI HTTP Adapter**: -- `api_routes.py`: HTTP endpoints -- `api_schemas.py`: Pydantic request/response models -- Maps HTTP requests to domain operations -- Maps domain exceptions to HTTP status codes - -**Endpoints**: -- `POST /api/v1/process`: Process document -- `POST /api/v1/extract-and-chunk`: Extract and chunk -- `GET /api/v1/documents/{id}`: Get document -- `GET /api/v1/documents`: List documents -- `DELETE /api/v1/documents/{id}`: Delete document -- `GET /api/v1/health`: Health check - -#### Outgoing Adapters (outgoing/) - -**Extractors (extractors/)**: -- `base.py`: Template method pattern base class -- `pdf_extractor.py`: PDF extraction using PyPDF2 -- `docx_extractor.py`: DOCX extraction using python-docx -- `txt_extractor.py`: Plain text extraction (multi-encoding) -- `factory.py`: Factory pattern for extractor selection - -**Chunkers (chunkers/)**: -- `base.py`: Template method pattern base class -- `fixed_size_chunker.py`: Fixed-size chunks with overlap -- `paragraph_chunker.py`: Paragraph-based chunking -- `context.py`: Strategy pattern context - -**Persistence (persistence/)**: -- `in_memory_repository.py`: Thread-safe in-memory storage - -### 3. Bootstrap (src/bootstrap.py) -**Responsibility**: Dependency injection and wiring - -**ApplicationContainer**: -- Creates all adapters -- Injects dependencies into core -- ONLY place where concrete implementations are instantiated -- Provides factory method: `create_application()` - -### 4. Shared (src/shared/) -**Responsibility**: Cross-cutting concerns - -- `constants.py`: Application constants -- `logging_config.py`: Centralized logging setup - -## Design Patterns Implemented - -### 1. Hexagonal Architecture (Ports & Adapters) -- Core isolated from external concerns -- Dependency inversion at boundaries -- Easy to swap implementations - -### 2. Factory Pattern -- `ExtractorFactory`: Creates appropriate extractor based on file type -- Centralized management -- Easy to add new file types - -### 3. Strategy Pattern -- `ChunkingContext`: Runtime strategy selection -- `FixedSizeChunker`, `ParagraphChunker` -- Easy to add new strategies - -### 4. Repository Pattern -- `IDocumentRepository`: Abstract persistence -- `InMemoryDocumentRepository`: Concrete implementation -- Easy to swap storage (memory → DB) - -### 5. Template Method Pattern -- `BaseExtractor`: Common extraction workflow -- `BaseChunker`: Common chunking workflow -- Subclasses fill in specific details - -### 6. Dependency Injection -- `ApplicationContainer`: Constructor injection -- Loose coupling -- Easy testing with mocks - -## SOLID Principles Compliance - -### Single Responsibility Principle ✓ -- Each class has one reason to change -- Each function does ONE thing -- Maximum 15-20 lines per function - -### Open/Closed Principle ✓ -- Open for extension (add extractors, chunkers) -- Closed for modification (core unchanged) - -### Liskov Substitution Principle ✓ -- All IExtractor implementations are interchangeable -- All IChunker implementations are interchangeable - -### Interface Segregation Principle ✓ -- Small, focused interfaces -- No fat interfaces - -### Dependency Inversion Principle ✓ -- Core depends on abstractions (ports) -- Core does NOT depend on concrete implementations -- High-level modules independent of low-level modules - -## Clean Code Principles - -### DRY (Don't Repeat Yourself) ✓ -- Base classes for common functionality -- Pure functions for reusable logic -- No code duplication - -### KISS (Keep It Simple, Stupid) ✓ -- Simple, readable solutions -- No over-engineering -- Clear naming - -### YAGNI (You Aren't Gonna Need It) ✓ -- Implements only required features -- No speculative generality -- Focused on current needs - -## Type Safety - -- **100% type hints** on all functions -- Python 3.10+ type annotations -- Pydantic for runtime validation -- Mypy compatible - -## Documentation Standards - -- **Google-style docstrings** on all public APIs -- Module-level documentation -- Inline comments for complex logic -- Architecture documentation -- Usage examples - -## Testing Strategy - -### Unit Tests -- Test domain models in isolation -- Test pure functions -- Test services with mocks - -### Integration Tests -- Test extractors with real files -- Test chunkers with real text -- Test repository operations - -### API Tests -- Test FastAPI endpoints -- Test error scenarios -- Test complete workflows - -## Error Handling - -### Domain Exceptions -- All external errors wrapped in domain exceptions -- Rich error context (file path, operation, details) -- Hierarchical exception structure - -### HTTP Error Mapping -- 400: Invalid request, unsupported file type -- 404: Document not found -- 422: Extraction/chunking failed -- 500: Internal processing error - -## Extensibility - -### Adding New File Type (Example: HTML) -1. Create `html_extractor.py` extending `BaseExtractor` -2. Register in `bootstrap.py`: `factory.register_extractor(HTMLExtractor())` -3. Done! No changes to core required - -### Adding New Chunking Strategy (Example: Sentence) -1. Create `sentence_chunker.py` extending `BaseChunker` -2. Register in `bootstrap.py`: `context.register_chunker(SentenceChunker())` -3. Done! No changes to core required - -### Swapping Storage (Example: PostgreSQL) -1. Create `postgres_repository.py` implementing `IDocumentRepository` -2. Swap in `bootstrap.py`: `return PostgresDocumentRepository(...)` -3. Done! No changes to core or API required - -## Dependencies - -### Production -- `pydantic==2.10.5`: Data validation and models -- `fastapi==0.115.6`: Web framework -- `uvicorn==0.34.0`: ASGI server -- `PyPDF2==3.0.1`: PDF extraction -- `python-docx==1.1.2`: DOCX extraction - -### Development -- `pytest==8.3.4`: Testing framework -- `black==24.10.0`: Code formatting -- `ruff==0.8.5`: Linting -- `mypy==1.14.0`: Type checking - -## Running the Application - -### Install Dependencies -```bash -pip install -r requirements.txt -``` - -### Run FastAPI Server -```bash -python main.py -# or -uvicorn main:app --reload -``` - -### Run Example Script -```bash -python example_usage.py -``` - -### Access API Documentation -- Swagger UI: http://localhost:8000/docs -- ReDoc: http://localhost:8000/redoc - -## Key Achievements - -### Architecture -✓ Pure hexagonal architecture implementation -✓ Zero circular dependencies -✓ Core completely isolated from adapters -✓ Perfect dependency inversion - -### Code Quality -✓ 100% type-hinted -✓ Google-style docstrings on all APIs -✓ Functions ≤ 15-20 lines -✓ DRY, KISS, YAGNI principles - -### Design Patterns -✓ 6 patterns implemented correctly -✓ Factory for extractors -✓ Strategy for chunkers -✓ Repository for persistence -✓ Template method for base classes - -### SOLID Principles -✓ All 5 principles demonstrated -✓ Single Responsibility throughout -✓ Open/Closed via interfaces -✓ Dependency Inversion at boundaries - -### Features -✓ Multiple file type support (PDF, DOCX, TXT) -✓ Multiple chunking strategies -✓ Rich domain models with validation -✓ Comprehensive error handling -✓ Thread-safe repository -✓ RESTful API with FastAPI -✓ Complete documentation - -## Next Steps (Future Enhancements) - -1. **Database Persistence**: PostgreSQL/MongoDB repository -2. **Async Processing**: Async extractors and chunkers -3. **Caching**: Redis for frequently accessed documents -4. **More Strategies**: Sentence-based, semantic chunking -5. **Batch Processing**: Process multiple documents at once -6. **Search**: Full-text search integration -7. **Monitoring**: Structured logging, metrics, APM -8. **Testing**: Add comprehensive test suite - -## Conclusion - -This implementation represents a **"Gold Standard"** hexagonal architecture: - -- **Clean**: Clear separation of concerns -- **Testable**: Easy to mock and test -- **Flexible**: Easy to extend and modify -- **Maintainable**: Well-documented and organized -- **Production-Ready**: Error handling, logging, type safety - -The architecture allows you to: -- Add new file types without touching core logic -- Swap storage implementations with one line change -- Add new chunking algorithms independently -- Test business logic without any infrastructure -- Scale horizontally or vertically as needed - -This is how professional, enterprise-grade software should be built. diff --git a/QUICK_START.md b/QUICK_START.md deleted file mode 100644 index b627c05..0000000 --- a/QUICK_START.md +++ /dev/null @@ -1,256 +0,0 @@ -# Quick Start Guide - -## Installation - -```bash -# Navigate to project directory -cd text_processor_hex - -# Create virtual environment -python -m venv venv - -# Activate virtual environment -source venv/bin/activate # On Windows: venv\Scripts\activate - -# Install dependencies -pip install -r requirements.txt -``` - -## Run the Application - -### Option 1: FastAPI Server -```bash -python main.py -``` -Then visit: http://localhost:8000/docs - -### Option 2: Programmatic Usage -```bash -python example_usage.py -``` - -## Basic Usage Examples - -### 1. Using the API (cURL) - -**Process a Document:** -```bash -curl -X POST "http://localhost:8000/api/v1/process" \ - -H "Content-Type: application/json" \ - -d '{ - "file_path": "/path/to/document.pdf", - "chunking_strategy": { - "strategy_name": "fixed_size", - "chunk_size": 1000, - "overlap_size": 100, - "respect_boundaries": true - } - }' -``` - -**Extract and Chunk:** -```bash -curl -X POST "http://localhost:8000/api/v1/extract-and-chunk" \ - -H "Content-Type: application/json" \ - -d '{ - "file_path": "/path/to/document.pdf", - "chunking_strategy": { - "strategy_name": "paragraph", - "chunk_size": 1000, - "overlap_size": 0, - "respect_boundaries": true - } - }' -``` - -**Get Document:** -```bash -curl -X GET "http://localhost:8000/api/v1/documents/{document_id}" -``` - -**List Documents:** -```bash -curl -X GET "http://localhost:8000/api/v1/documents?limit=10&offset=0" -``` - -**Delete Document:** -```bash -curl -X DELETE "http://localhost:8000/api/v1/documents/{document_id}" -``` - -### 2. Using Python Code - -```python -from pathlib import Path -from src.bootstrap import create_application -from src.core.domain.models import ChunkingStrategy - -# Initialize -container = create_application() -service = container.text_processor_service - -# Process a PDF -strategy = ChunkingStrategy( - strategy_name="fixed_size", - chunk_size=1000, - overlap_size=100, - respect_boundaries=True, -) - -document = service.process_document( - file_path=Path("example.pdf"), - chunking_strategy=strategy, -) - -print(f"Document ID: {document.id}") -print(f"Metadata: {document.get_metadata_summary()}") - -# Extract and chunk -chunks = service.extract_and_chunk( - file_path=Path("example.pdf"), - chunking_strategy=strategy, -) - -for chunk in chunks: - print(f"Chunk {chunk.sequence_number}: {chunk.get_length()} chars") -``` - -## Available Chunking Strategies - -### 1. Fixed Size -Splits text into equal-sized chunks with optional overlap. - -```python -ChunkingStrategy( - strategy_name="fixed_size", - chunk_size=1000, # Target size in characters - overlap_size=100, # Overlap between chunks - respect_boundaries=True # Try to break at sentences -) -``` - -### 2. Paragraph -Splits text by paragraph boundaries, combining paragraphs to reach target size. - -```python -ChunkingStrategy( - strategy_name="paragraph", - chunk_size=1000, - overlap_size=0, - respect_boundaries=True -) -``` - -## Supported File Types - -- **PDF** (.pdf) - using PyPDF2 -- **DOCX** (.docx) - using python-docx -- **Text** (.txt, .md, .text) - native Python - -## Project Structure - -``` -text_processor_hex/ -├── main.py # FastAPI entry point -├── example_usage.py # Usage examples -├── requirements.txt # Dependencies -│ -└── src/ - ├── core/ # Business logic (NO external dependencies) - │ ├── domain/ # Models, exceptions, logic - │ ├── ports/ # Interface definitions - │ └── services/ # Orchestration - │ - ├── adapters/ # External integrations - │ ├── incoming/ # FastAPI routes - │ └── outgoing/ # Extractors, chunkers, storage - │ - ├── shared/ # Utilities - └── bootstrap.py # Dependency injection -``` - -## Common Tasks - -### Add a New File Type -1. Create extractor in `src/adapters/outgoing/extractors/` -2. Extend `BaseExtractor` -3. Register in `bootstrap.py` - -### Add a New Chunking Strategy -1. Create chunker in `src/adapters/outgoing/chunkers/` -2. Extend `BaseChunker` -3. Register in `bootstrap.py` - -### Change Storage -1. Implement `IDocumentRepository` interface -2. Swap implementation in `bootstrap.py` - -## Testing - -```bash -# Run example -python example_usage.py - -# Test API with curl -curl http://localhost:8000/health - -# Check API docs -# Visit: http://localhost:8000/docs -``` - -## Troubleshooting - -### Import Errors -```bash -# Make sure you're in the right directory -cd text_processor_hex - -# Activate virtual environment -source venv/bin/activate -``` - -### Missing Dependencies -```bash -pip install -r requirements.txt -``` - -### File Not Found Errors -Use absolute paths for file_path in API requests: -```json -{ - "file_path": "/absolute/path/to/file.pdf" -} -``` - -## Architecture Highlights - -**Hexagonal Architecture:** -- Core business logic is isolated -- Easy to test without infrastructure -- Easy to swap implementations - -**Design Patterns:** -- Factory: ExtractorFactory selects extractor by file type -- Strategy: ChunkingContext selects chunking strategy -- Repository: Abstract data storage -- Dependency Injection: All dependencies injected via bootstrap - -**SOLID Principles:** -- Single Responsibility: Each class does one thing -- Open/Closed: Add features without modifying core -- Dependency Inversion: Core depends on abstractions - -## Next Steps - -1. Read `README.md` for detailed documentation -2. Read `ARCHITECTURE.md` for architecture details -3. Run `example_usage.py` to see it in action -4. Explore the code starting from `bootstrap.py` -5. Try the API using the Swagger docs at `/docs` - -## Need Help? - -- Check `README.md` for detailed docs -- Check `ARCHITECTURE.md` for architecture diagrams -- Check `PROJECT_SUMMARY.md` for complete overview -- Look at `example_usage.py` for usage patterns diff --git a/example_usage.py b/example_usage.py deleted file mode 100644 index 55c136d..0000000 --- a/example_usage.py +++ /dev/null @@ -1,157 +0,0 @@ -""" -Example Usage Script - Demonstrates how to use the Text Processor. - -This script shows how to use the text processor programmatically -without going through the HTTP API. -""" -from pathlib import Path - -from src.bootstrap import create_application -from src.core.domain.models import ChunkingStrategy - - -def main(): - """Main example function.""" - print("=" * 70) - print("Text Processor - Hexagonal Architecture Example") - print("=" * 70) - print() - - # Step 1: Create application container with dependency injection - print("1. Initializing application container...") - container = create_application(log_level="INFO") - service = container.text_processor_service - print(" ✓ Container initialized\n") - - # Step 2: Create a sample text file for demonstration - print("2. Creating sample text file...") - sample_text = """ - The Hexagonal Architecture Pattern - - Introduction - Hexagonal Architecture, also known as Ports and Adapters, is a software design - pattern that aims to create loosely coupled application components. The pattern - was invented by Alistair Cockburn in 2005. - - Core Concepts - The main idea is to isolate the core business logic from external concerns like - databases, user interfaces, and external services. This is achieved through the - use of ports and adapters. - - Ports are interfaces that define how the application core interacts with the - outside world. Adapters are implementations of these ports that connect the - application to specific technologies. - - Benefits - The benefits of this architecture include improved testability, flexibility, - and maintainability. By isolating the core logic, we can easily swap - implementations without affecting the business rules. - - Conclusion - Hexagonal Architecture is a powerful pattern for building maintainable and - flexible applications. It promotes clean separation of concerns and makes - testing much easier. - """ - - sample_file = Path("sample_document.txt") - sample_file.write_text(sample_text.strip()) - print(f" ✓ Created sample file: {sample_file}\n") - - # Step 3: Process document with fixed-size chunking - print("3. Processing document with FIXED SIZE strategy...") - fixed_strategy = ChunkingStrategy( - strategy_name="fixed_size", - chunk_size=300, - overlap_size=50, - respect_boundaries=True, - ) - - try: - document = service.process_document( - file_path=sample_file, - chunking_strategy=fixed_strategy, - ) - - print(f" Document ID: {document.id}") - print(f" Metadata: {document.get_metadata_summary()}") - print(f" Processed: {document.is_processed}") - print(f" Content length: {len(document.content)} characters") - print(f" Preview: {document.get_content_preview(100)}...\n") - - # Step 4: Extract and chunk with paragraph strategy - print("4. Extracting and chunking with PARAGRAPH strategy...") - paragraph_strategy = ChunkingStrategy( - strategy_name="paragraph", - chunk_size=500, - overlap_size=0, - respect_boundaries=True, - ) - - chunks = service.extract_and_chunk( - file_path=sample_file, - chunking_strategy=paragraph_strategy, - ) - - print(f" ✓ Created {len(chunks)} chunks\n") - - # Display chunk information - print(" Chunk Details:") - print(" " + "-" * 66) - for i, chunk in enumerate(chunks[:3], 1): # Show first 3 chunks - print(f" Chunk #{chunk.sequence_number}") - print(f" - Length: {chunk.get_length()} characters") - print(f" - Position: {chunk.start_char} to {chunk.end_char}") - print(f" - Preview: {chunk.content[:80]}...") - print(" " + "-" * 66) - - if len(chunks) > 3: - print(f" ... and {len(chunks) - 3} more chunks\n") - - # Step 5: Retrieve the document - print("5. Retrieving document from repository...") - retrieved = service.get_document(document.id) - print(f" ✓ Retrieved document: {retrieved.id}") - print(f" ✓ Content matches: {retrieved.content == document.content}\n") - - # Step 6: List all documents - print("6. Listing all documents...") - all_docs = service.list_documents(limit=10) - print(f" ✓ Found {len(all_docs)} document(s) in repository") - for doc in all_docs: - print(f" - {doc.metadata.file_name} ({doc.metadata.file_type})") - print() - - # Step 7: Delete the document - print("7. Cleaning up - deleting document...") - deleted = service.delete_document(document.id) - print(f" ✓ Document deleted: {deleted}\n") - - # Verify deletion - remaining = service.list_documents() - print(f" ✓ Remaining documents: {len(remaining)}\n") - - except Exception as e: - print(f" ✗ Error: {str(e)}\n") - raise - - finally: - # Clean up sample file - if sample_file.exists(): - sample_file.unlink() - print(f" ✓ Cleaned up sample file\n") - - print("=" * 70) - print("Example completed successfully!") - print("=" * 70) - print() - print("Key Takeaways:") - print("1. Core domain is completely isolated from adapters") - print("2. Dependencies are injected through bootstrap") - print("3. Easy to swap implementations (strategies, extractors)") - print("4. Rich domain models with built-in validation") - print("5. Clear separation between API models and domain models") - print() - - -if __name__ == "__main__": - main() diff --git a/main.py b/main.py index 0f6a437..eebecd7 100644 --- a/main.py +++ b/main.py @@ -1,110 +1,17 @@ """ Main Application Entry Point. -This module creates and runs the FastAPI application. +This module imports the FastAPI app directly from the routes module +and runs it via uvicorn. """ import logging -from contextlib import asynccontextmanager -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware - -from src.bootstrap import create_application -from src.shared.constants import ( - API_DESCRIPTION, - API_DOCS_URL, - API_PREFIX, - API_REDOC_URL, - API_TITLE, - APP_VERSION, -) +from src.adapters.incoming.api_routes import app logger = logging.getLogger(__name__) -# Application container (created on startup) -app_container = None - - -@asynccontextmanager -async def lifespan(app: FastAPI): - """ - Application lifespan manager. - - Handles startup and shutdown events. - """ - # Startup - global app_container - logger.info("Starting up application...") - - # Create application container with dependency injection - app_container = create_application(log_level="INFO") - - logger.info("Application started successfully") - - yield - - # Shutdown - logger.info("Shutting down application...") - app_container = None - logger.info("Application shut down") - - -# Create FastAPI application -app = FastAPI( - title=API_TITLE, - description=API_DESCRIPTION, - version=APP_VERSION, - docs_url=API_DOCS_URL, - redoc_url=API_REDOC_URL, - lifespan=lifespan, -) - -# Add CORS middleware -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], # Configure appropriately for production - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - - -@app.on_event("startup") -async def setup_routes(): - """Setup API routes on startup.""" - if app_container: - # Include the API routes from the incoming adapter - app.include_router( - app_container.api.router, - prefix=API_PREFIX, - tags=["Text Processing"], - ) - logger.info(f"API routes registered at {API_PREFIX}") - - -@app.get("/") -async def root(): - """Root endpoint with API information.""" - return { - "name": API_TITLE, - "version": APP_VERSION, - "description": API_DESCRIPTION, - "docs_url": API_DOCS_URL, - "api_prefix": API_PREFIX, - } - - -@app.get("/health") -async def health_check(): - """Basic health check endpoint.""" - return { - "status": "healthy", - "version": APP_VERSION, - } - - if __name__ == "__main__": import uvicorn diff --git a/requirements.txt b/requirements.txt index 76d1f64..caf1229 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,10 +6,6 @@ pydantic-settings==2.7.1 fastapi==0.115.6 uvicorn[standard]==0.34.0 -# Document Processing -PyPDF2==3.0.1 -python-docx==1.1.2 - # Utilities python-multipart==0.0.20 diff --git a/src/adapters/incoming/api_routes.py b/src/adapters/incoming/api_routes.py index 4d1169c..8c8708a 100644 --- a/src/adapters/incoming/api_routes.py +++ b/src/adapters/incoming/api_routes.py @@ -1,15 +1,14 @@ """ -API Routes - FastAPI routes for text processing operations. +API Routes - Functional FastAPI routes for text processing. This is the incoming adapter that translates HTTP requests into -use case calls. +domain operations. Routes pull the service directly from bootstrap. """ import logging from pathlib import Path -from typing import List from uuid import UUID -from fastapi import APIRouter, HTTPException, status +from fastapi import APIRouter, FastAPI, HTTPException, status from ...core.domain.exceptions import ( ChunkingError, @@ -19,15 +18,13 @@ from ...core.domain.exceptions import ( ProcessingError, UnsupportedFileTypeError, ) -from ...core.domain.models import Chunk, ChunkingStrategy, Document +from ...core.domain.models import ChunkingStrategy from ...core.ports.incoming.text_processor import ITextProcessor from .api_schemas import ( ChunkResponse, DeleteDocumentResponse, DocumentListResponse, - DocumentMetadataResponse, DocumentResponse, - ErrorResponse, ExtractAndChunkRequest, ExtractAndChunkResponse, HealthCheckResponse, @@ -39,361 +36,409 @@ from .api_schemas import ( logger = logging.getLogger(__name__) -class TextProcessorAPI: +# Create FastAPI application +app = FastAPI( + title="Text Processor API", + description="Text extraction and chunking system using Hexagonal Architecture", + version="1.0.0", + docs_url="/docs", + redoc_url="/redoc", +) + +# Create API router +router = APIRouter(prefix="/api/v1", tags=["Text Processing"]) + + +def _get_service() -> ITextProcessor: """ - FastAPI routes for text processing. + Get the text processor service from bootstrap singleton. - This adapter translates HTTP requests into domain operations - and handles error mapping to HTTP responses. + This function pulls the service directly without using FastAPI's Depends. + + Returns: + ITextProcessor: Core service instance """ + from ...bootstrap import get_processor_service - def __init__(self, text_processor: ITextProcessor) -> None: - """ - Initialize API routes. + return get_processor_service() - Args: - text_processor: Text processor service (incoming port) - """ - self.text_processor = text_processor - self.router = APIRouter() - self._register_routes() - logger.info("TextProcessorAPI initialized") - def _register_routes(self) -> None: - """Register all API routes.""" - self.router.add_api_route( - "/process", - self.process_document, - methods=["POST"], - response_model=ProcessDocumentResponse, - status_code=status.HTTP_201_CREATED, - summary="Process a document", - description="Extract text from document and store it", +def _to_domain_strategy(request_strategy) -> ChunkingStrategy: + """ + Convert API request strategy to domain model. + + Args: + request_strategy: API request strategy schema + + Returns: + ChunkingStrategy: Domain strategy model + """ + return ChunkingStrategy( + strategy_name=request_strategy.strategy_name, + chunk_size=request_strategy.chunk_size, + overlap_size=request_strategy.overlap_size, + respect_boundaries=request_strategy.respect_boundaries, + ) + + +def _to_document_response(document) -> DocumentResponse: + """ + Convert domain document to API response. + + Args: + document: Domain Document entity + + Returns: + DocumentResponse: API response model + """ + from .api_schemas import DocumentMetadataResponse + + return DocumentResponse( + id=str(document.id), + content=document.content, + metadata=DocumentMetadataResponse( + file_name=document.metadata.file_name, + file_type=document.metadata.file_type, + file_size_bytes=document.metadata.file_size_bytes, + created_at=document.metadata.created_at.isoformat(), + author=document.metadata.author, + page_count=document.metadata.page_count, + ), + is_processed=document.is_processed, + content_preview=document.get_content_preview(200), + ) + + +def _to_chunk_response(chunk) -> ChunkResponse: + """ + Convert domain chunk to API response. + + Args: + chunk: Domain Chunk entity + + Returns: + ChunkResponse: API response model + """ + return ChunkResponse( + id=str(chunk.id), + document_id=str(chunk.document_id), + content=chunk.content, + sequence_number=chunk.sequence_number, + start_char=chunk.start_char, + end_char=chunk.end_char, + length=chunk.get_length(), + ) + + +def _map_domain_exception(exception: DomainException) -> HTTPException: + """ + Map domain exceptions to HTTP exceptions. + + Args: + exception: Domain exception + + Returns: + HTTPException: Corresponding HTTP exception + """ + if isinstance(exception, UnsupportedFileTypeError): + return HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=str(exception), + ) + elif isinstance(exception, ExtractionError): + return HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=str(exception), + ) + elif isinstance(exception, ChunkingError): + return HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=str(exception), + ) + elif isinstance(exception, ProcessingError): + return HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=str(exception), + ) + elif isinstance(exception, DocumentNotFoundError): + return HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=str(exception), + ) + else: + return HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=str(exception), ) - self.router.add_api_route( - "/extract-and-chunk", - self.extract_and_chunk, - methods=["POST"], - response_model=ExtractAndChunkResponse, - status_code=status.HTTP_200_OK, - summary="Extract and chunk document", - description="Extract text and split into chunks", + +@router.post( + "/process", + response_model=ProcessDocumentResponse, + status_code=status.HTTP_201_CREATED, + summary="Process a document", + description="Extract text from document and store it", +) +async def process_document(request: ProcessDocumentRequest) -> ProcessDocumentResponse: + """ + Process a document endpoint. + + Args: + request: Processing request with file path and strategy + + Returns: + Processing response with document details + + Raises: + HTTPException: If processing fails + """ + try: + # Pull service from bootstrap + service: ITextProcessor = _get_service() + + # Convert request to domain models + file_path = Path(request.file_path) + strategy = _to_domain_strategy(request.chunking_strategy) + + # Execute use case + document = service.process_document(file_path, strategy) + + # Convert to response + return ProcessDocumentResponse( + document=_to_document_response(document) ) - self.router.add_api_route( - "/documents/{document_id}", - self.get_document, - methods=["GET"], - response_model=DocumentResponse, - status_code=status.HTTP_200_OK, - summary="Get document by ID", - description="Retrieve a processed document", + except DomainException as e: + raise _map_domain_exception(e) + except Exception as e: + logger.error(f"Unexpected error processing document: {str(e)}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Internal server error: {str(e)}", ) - self.router.add_api_route( - "/documents", - self.list_documents, - methods=["GET"], - response_model=DocumentListResponse, - status_code=status.HTTP_200_OK, - summary="List all documents", - description="Retrieve all documents with pagination", + +@router.post( + "/extract-and-chunk", + response_model=ExtractAndChunkResponse, + status_code=status.HTTP_200_OK, + summary="Extract and chunk document", + description="Extract text and split into chunks", +) +async def extract_and_chunk( + request: ExtractAndChunkRequest, +) -> ExtractAndChunkResponse: + """ + Extract and chunk document endpoint. + + Args: + request: Extract and chunk request + + Returns: + Response with chunks + + Raises: + HTTPException: If extraction or chunking fails + """ + try: + # Pull service from bootstrap + service: ITextProcessor = _get_service() + + # Convert request to domain models + file_path = Path(request.file_path) + strategy = _to_domain_strategy(request.chunking_strategy) + + # Execute use case + chunks = service.extract_and_chunk(file_path, strategy) + + # Convert to response + chunk_responses = [_to_chunk_response(c) for c in chunks] + + return ExtractAndChunkResponse( + chunks=chunk_responses, + total_chunks=len(chunk_responses), ) - self.router.add_api_route( - "/documents/{document_id}", - self.delete_document, - methods=["DELETE"], - response_model=DeleteDocumentResponse, - status_code=status.HTTP_200_OK, - summary="Delete document", - description="Delete a document by ID", + except DomainException as e: + raise _map_domain_exception(e) + except Exception as e: + logger.error(f"Unexpected error extracting and chunking: {str(e)}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Internal server error: {str(e)}", ) - self.router.add_api_route( - "/health", - self.health_check, - methods=["GET"], - response_model=HealthCheckResponse, - status_code=status.HTTP_200_OK, - summary="Health check", - description="Check API health and configuration", + +@router.get( + "/documents/{document_id}", + response_model=DocumentResponse, + status_code=status.HTTP_200_OK, + summary="Get document by ID", + description="Retrieve a processed document", +) +async def get_document(document_id: str) -> DocumentResponse: + """ + Get document by ID endpoint. + + Args: + document_id: UUID of the document + + Returns: + Document response + + Raises: + HTTPException: If document not found + """ + try: + # Pull service from bootstrap + service: ITextProcessor = _get_service() + + doc_uuid = UUID(document_id) + document = service.get_document(doc_uuid) + return _to_document_response(document) + + except ValueError: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Invalid document ID format: {document_id}", + ) + except DocumentNotFoundError as e: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=str(e), + ) + except Exception as e: + logger.error(f"Unexpected error retrieving document: {str(e)}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Internal server error: {str(e)}", ) - async def process_document( - self, - request: ProcessDocumentRequest, - ) -> ProcessDocumentResponse: - """ - Process a document endpoint. - Args: - request: Processing request with file path and strategy +@router.get( + "/documents", + response_model=DocumentListResponse, + status_code=status.HTTP_200_OK, + summary="List all documents", + description="Retrieve all documents with pagination", +) +async def list_documents(limit: int = 100, offset: int = 0) -> DocumentListResponse: + """ + List documents endpoint. - Returns: - Processing response with document details + Args: + limit: Maximum number of documents to return + offset: Number of documents to skip - Raises: - HTTPException: If processing fails - """ - try: - # Convert request to domain models - file_path = Path(request.file_path) - strategy = self._to_domain_strategy(request.chunking_strategy) + Returns: + List of documents with pagination info + """ + try: + # Pull service from bootstrap + service: ITextProcessor = _get_service() - # Execute use case - document = self.text_processor.process_document(file_path, strategy) + documents = service.list_documents(limit, offset) + doc_responses = [_to_document_response(d) for d in documents] - # Convert to response - return ProcessDocumentResponse( - document=self._to_document_response(document) - ) - - except DomainException as e: - raise self._map_domain_exception(e) - except Exception as e: - logger.error(f"Unexpected error processing document: {str(e)}") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Internal server error: {str(e)}", - ) - - async def extract_and_chunk( - self, - request: ExtractAndChunkRequest, - ) -> ExtractAndChunkResponse: - """ - Extract and chunk document endpoint. - - Args: - request: Extract and chunk request - - Returns: - Response with chunks - - Raises: - HTTPException: If extraction or chunking fails - """ - try: - # Convert request to domain models - file_path = Path(request.file_path) - strategy = self._to_domain_strategy(request.chunking_strategy) - - # Execute use case - chunks = self.text_processor.extract_and_chunk(file_path, strategy) - - # Convert to response - chunk_responses = [self._to_chunk_response(c) for c in chunks] - - return ExtractAndChunkResponse( - chunks=chunk_responses, - total_chunks=len(chunk_responses), - ) - - except DomainException as e: - raise self._map_domain_exception(e) - except Exception as e: - logger.error(f"Unexpected error extracting and chunking: {str(e)}") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Internal server error: {str(e)}", - ) - - async def get_document(self, document_id: str) -> DocumentResponse: - """ - Get document by ID endpoint. - - Args: - document_id: UUID of the document - - Returns: - Document response - - Raises: - HTTPException: If document not found - """ - try: - doc_uuid = UUID(document_id) - document = self.text_processor.get_document(doc_uuid) - return self._to_document_response(document) - - except ValueError: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=f"Invalid document ID format: {document_id}", - ) - except DocumentNotFoundError as e: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail=str(e), - ) - except Exception as e: - logger.error(f"Unexpected error retrieving document: {str(e)}") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Internal server error: {str(e)}", - ) - - async def list_documents( - self, - limit: int = 100, - offset: int = 0, - ) -> DocumentListResponse: - """ - List documents endpoint. - - Args: - limit: Maximum number of documents to return - offset: Number of documents to skip - - Returns: - List of documents with pagination info - """ - try: - documents = self.text_processor.list_documents(limit, offset) - doc_responses = [self._to_document_response(d) for d in documents] - - return DocumentListResponse( - documents=doc_responses, - total=len(doc_responses), - limit=limit, - offset=offset, - ) - - except Exception as e: - logger.error(f"Unexpected error listing documents: {str(e)}") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Internal server error: {str(e)}", - ) - - async def delete_document(self, document_id: str) -> DeleteDocumentResponse: - """ - Delete document endpoint. - - Args: - document_id: UUID of the document - - Returns: - Deletion response - - Raises: - HTTPException: If document not found or deletion fails - """ - try: - doc_uuid = UUID(document_id) - success = self.text_processor.delete_document(doc_uuid) - - return DeleteDocumentResponse( - success=success, - message=f"Document {document_id} deleted successfully", - document_id=document_id, - ) - - except ValueError: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=f"Invalid document ID format: {document_id}", - ) - except DocumentNotFoundError as e: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail=str(e), - ) - except Exception as e: - logger.error(f"Unexpected error deleting document: {str(e)}") - raise HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=f"Internal server error: {str(e)}", - ) - - async def health_check(self) -> HealthCheckResponse: - """ - Health check endpoint. - - Returns: - Health status and configuration - """ - # Note: This would ideally get info from dependencies - return HealthCheckResponse( - status="healthy", - version="1.0.0", - supported_file_types=["pdf", "docx", "txt"], - available_strategies=["fixed_size", "paragraph"], + return DocumentListResponse( + documents=doc_responses, + total=len(doc_responses), + limit=limit, + offset=offset, ) - def _to_domain_strategy(self, request_strategy) -> ChunkingStrategy: - """Convert API request strategy to domain model.""" - return ChunkingStrategy( - strategy_name=request_strategy.strategy_name, - chunk_size=request_strategy.chunk_size, - overlap_size=request_strategy.overlap_size, - respect_boundaries=request_strategy.respect_boundaries, + except Exception as e: + logger.error(f"Unexpected error listing documents: {str(e)}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Internal server error: {str(e)}", ) - def _to_document_response(self, document: Document) -> DocumentResponse: - """Convert domain document to API response.""" - return DocumentResponse( - id=str(document.id), - content=document.content, - metadata=DocumentMetadataResponse( - file_name=document.metadata.file_name, - file_type=document.metadata.file_type, - file_size_bytes=document.metadata.file_size_bytes, - created_at=document.metadata.created_at.isoformat(), - author=document.metadata.author, - page_count=document.metadata.page_count, - ), - is_processed=document.is_processed, - content_preview=document.get_content_preview(200), + +@router.delete( + "/documents/{document_id}", + response_model=DeleteDocumentResponse, + status_code=status.HTTP_200_OK, + summary="Delete document", + description="Delete a document by ID", +) +async def delete_document(document_id: str) -> DeleteDocumentResponse: + """ + Delete document endpoint. + + Args: + document_id: UUID of the document + + Returns: + Deletion response + + Raises: + HTTPException: If document not found or deletion fails + """ + try: + # Pull service from bootstrap + service: ITextProcessor = _get_service() + + doc_uuid = UUID(document_id) + success = service.delete_document(doc_uuid) + + return DeleteDocumentResponse( + success=success, + message=f"Document {document_id} deleted successfully", + document_id=document_id, ) - def _to_chunk_response(self, chunk: Chunk) -> ChunkResponse: - """Convert domain chunk to API response.""" - return ChunkResponse( - id=str(chunk.id), - document_id=str(chunk.document_id), - content=chunk.content, - sequence_number=chunk.sequence_number, - start_char=chunk.start_char, - end_char=chunk.end_char, - length=chunk.get_length(), + except ValueError: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Invalid document ID format: {document_id}", + ) + except DocumentNotFoundError as e: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=str(e), + ) + except Exception as e: + logger.error(f"Unexpected error deleting document: {str(e)}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Internal server error: {str(e)}", ) - def _map_domain_exception(self, exception: DomainException) -> HTTPException: - """ - Map domain exceptions to HTTP exceptions. - This is where we translate domain errors into API errors. - """ - if isinstance(exception, UnsupportedFileTypeError): - return HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=str(exception), - ) - elif isinstance(exception, ExtractionError): - return HTTPException( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail=str(exception), - ) - elif isinstance(exception, ChunkingError): - return HTTPException( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - detail=str(exception), - ) - elif isinstance(exception, ProcessingError): - return HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=str(exception), - ) - elif isinstance(exception, DocumentNotFoundError): - return HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail=str(exception), - ) - else: - return HTTPException( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=str(exception), - ) +@router.get( + "/health", + response_model=HealthCheckResponse, + status_code=status.HTTP_200_OK, + summary="Health check", + description="Check API health and configuration", +) +async def health_check() -> HealthCheckResponse: + """ + Health check endpoint. + + Returns: + Health status and configuration + """ + return HealthCheckResponse( + status="healthy", + version="1.0.0", + supported_file_types=["pdf", "docx", "txt"], + available_strategies=["fixed_size", "paragraph"], + ) + + +# Include router in app +app.include_router(router) + + +@app.get("/") +async def root(): + """Root endpoint with API information.""" + return { + "name": "Text Processor API", + "version": "1.0.0", + "description": "Text extraction and chunking system using Hexagonal Architecture", + "docs_url": "/docs", + "api_prefix": "/api/v1", + } diff --git a/src/bootstrap.py b/src/bootstrap.py index d0b4d08..435cf98 100644 --- a/src/bootstrap.py +++ b/src/bootstrap.py @@ -1,15 +1,15 @@ """ -Bootstrap - Dependency Injection and Wiring. +Bootstrap - Dependency Injection with Lazy Singleton Pattern. -This module wires together all components of the application. +This module wires together the Core and Outgoing Adapters. The Core never imports Adapters - only the Bootstrap does. -This is the ONLY place where concrete implementations are instantiated -and injected into the domain services. +The ApplicationContainer manages ONLY: +- Core Services +- Outgoing Adapters (Extractors, Chunkers, Repository) """ import logging -from .adapters.incoming.api_routes import TextProcessorAPI from .adapters.outgoing.chunkers.context import ChunkingContext from .adapters.outgoing.chunkers.fixed_size_chunker import FixedSizeChunker from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker @@ -28,13 +28,18 @@ from .shared.logging_config import setup_logging logger = logging.getLogger(__name__) +# Module-level singleton instance (lazy initialization) +_container: 'ApplicationContainer | None' = None + + class ApplicationContainer: """ - Dependency Injection Container. + Dependency Injection Container for Core and Outgoing Adapters. + + This container manages the lifecycle and dependencies of: + - Core Domain Services + - Outgoing Adapters (Extractors, Chunkers, Repository) - This container manages the lifecycle and dependencies of all - application components. It follows the Dependency Inversion Principle - by depending on abstractions (ports) rather than concrete implementations. """ def __init__(self, log_level: str = "INFO") -> None: @@ -48,28 +53,25 @@ class ApplicationContainer: setup_logging(level=log_level) logger.info("Initializing ApplicationContainer") - # Outgoing adapters + # Create Outgoing Adapters self._repository = self._create_repository() self._extractor_factory = self._create_extractor_factory() self._chunking_context = self._create_chunking_context() - # Core service + # Create Core Service (depends only on Ports) self._text_processor_service = self._create_text_processor_service() - # Incoming adapter - self._api = self._create_api() - logger.info("ApplicationContainer initialized successfully") @property def text_processor_service(self) -> ITextProcessor: - """Get the text processor service.""" - return self._text_processor_service + """ + Get the text processor service. - @property - def api(self) -> TextProcessorAPI: - """Get the API adapter.""" - return self._api + Returns: + ITextProcessor: Core service implementing the incoming port + """ + return self._text_processor_service def _create_repository(self) -> InMemoryDocumentRepository: """ @@ -130,7 +132,7 @@ class ApplicationContainer: """ Create the core text processor service. - Injects all required dependencies (repositories, factories, contexts). + Injects all required dependencies via Ports (Dependency Inversion). Returns: Configured text processor service @@ -142,24 +144,36 @@ class ApplicationContainer: repository=self._repository, ) - def _create_api(self) -> TextProcessorAPI: - """ - Create the FastAPI adapter. - Injects the text processor service. +def get_processor_service() -> ITextProcessor: + """ + Lazy singleton provider for the text processor service. - Returns: - Configured API adapter - """ - logger.debug("Creating TextProcessorAPI") - return TextProcessorAPI(text_processor=self._text_processor_service) + This function ensures the ApplicationContainer is instantiated only once + and returns the core service. API routes pull the service via this function. + + Returns: + ITextProcessor: Core service implementing the incoming port + + Example: + >>> service = get_processor_service() + >>> document = service.process_document(file_path, strategy) + """ + global _container + + if _container is None: + logger.info("Lazy initializing ApplicationContainer (first access)") + _container = ApplicationContainer(log_level="INFO") + + return _container.text_processor_service def create_application(log_level: str = "INFO") -> ApplicationContainer: """ - Factory function to create a fully wired application. + Factory function to create a fully wired application container. - This is the main entry point for dependency injection. + This is the main entry point for manual dependency injection. + For API routes, use get_processor_service() instead. Args: log_level: Logging level for the application @@ -170,24 +184,6 @@ def create_application(log_level: str = "INFO") -> ApplicationContainer: Example: >>> container = create_application(log_level="DEBUG") >>> service = container.text_processor_service - >>> api = container.api """ - logger.info("Creating application container") + logger.info("Creating application container via factory") return ApplicationContainer(log_level=log_level) - - -def get_text_processor_service( - container: ApplicationContainer, -) -> ITextProcessor: - """ - Get the text processor service from container. - - This is a convenience function for accessing the service. - - Args: - container: Application container - - Returns: - Text processor service instance - """ - return container.text_processor_service