Some fixes to the architecture: make bootstrap wrap only the hexagonal core plus the outgoing adapters
This commit is contained in:
parent
70f5b1478c
commit
fd39184c0c
410
ARCHITECTURE.md
410
ARCHITECTURE.md
@ -1,410 +0,0 @@
|
||||
# Architecture Documentation
|
||||
|
||||
## Hexagonal Architecture Overview
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────────┐
|
||||
│ INCOMING ADAPTERS │
|
||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
||||
│ │ FastAPI Routes (HTTP) │ │
|
||||
│ │ - ProcessDocumentRequest → API Schemas │ │
|
||||
│ │ - ExtractAndChunkRequest → API Schemas │ │
|
||||
│ └──────────────────────────────────────────────────────────────┘ │
|
||||
└──────────────────────────────┬──────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────────────┐
|
||||
│ CORE DOMAIN │
|
||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
||||
│ │ PORTS (Interfaces) │ │
|
||||
│ │ ┌────────────────────┐ ┌───────────────────────────┐ │ │
|
||||
│ │ │ Incoming Ports │ │ Outgoing Ports │ │ │
|
||||
│ │ │ - ITextProcessor │ │ - IExtractor │ │ │
|
||||
│ │ │ │ │ - IChunker │ │ │
|
||||
│ │ │ │ │ - IDocumentRepository │ │ │
|
||||
│ │ └────────────────────┘ └───────────────────────────┘ │ │
|
||||
│ └──────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
||||
│ │ SERVICES (Business Logic) │ │
|
||||
│ │ - DocumentProcessorService │ │
|
||||
│ │ • Orchestrates Extract → Clean → Chunk → Save │ │
|
||||
│ │ • Depends ONLY on Port interfaces │ │
|
||||
│ └──────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
||||
│ │ DOMAIN MODELS (Rich Entities) │ │
|
||||
│ │ - Document (with validation & business methods) │ │
|
||||
│ │ - Chunk (immutable value object) │ │
|
||||
│ │ - ChunkingStrategy (configuration) │ │
|
||||
│ │ - DocumentMetadata │ │
|
||||
│ └──────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
||||
│ │ DOMAIN LOGIC (Pure Functions) │ │
|
||||
│ │ - normalize_whitespace() │ │
|
||||
│ │ - clean_text() │ │
|
||||
│ │ - split_into_paragraphs() │ │
|
||||
│ │ - find_sentence_boundary_before() │ │
|
||||
│ └──────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
||||
│ │ EXCEPTIONS (Domain Errors) │ │
|
||||
│ │ - ExtractionError, ChunkingError, ProcessingError │ │
|
||||
│ │ - ValidationError, RepositoryError │ │
|
||||
│ └──────────────────────────────────────────────────────────────┘ │
|
||||
└──────────────────────────────┬──────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────────────┐
|
||||
│ OUTGOING ADAPTERS │
|
||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
||||
│ │ EXTRACTORS (Implements IExtractor) │ │
|
||||
│ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ │
|
||||
│ │ │ PDFExtractor│ │DocxExtractor│ │TxtExtractor│ │ │
|
||||
│ │ │ (PyPDF2) │ │(python-docx)│ │ (built-in) │ │ │
|
||||
│ │ └────────────┘ └────────────┘ └────────────┘ │ │
|
||||
│ │ - Managed by ExtractorFactory (Factory Pattern) │ │
|
||||
│ └──────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
||||
│ │ CHUNKERS (Implements IChunker) │ │
|
||||
│ │ ┌─────────────────┐ ┌──────────────────┐ │ │
|
||||
│ │ │ FixedSizeChunker│ │ParagraphChunker │ │ │
|
||||
│ │ │ - Fixed chunks │ │ - Respect │ │ │
|
||||
│ │ │ - With overlap │ │ paragraphs │ │ │
|
||||
│ │ └─────────────────┘ └──────────────────┘ │ │
|
||||
│ │ - Managed by ChunkingContext (Strategy Pattern) │ │
|
||||
│ └──────────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
||||
│ │ REPOSITORY (Implements IDocumentRepository) │ │
|
||||
│ │ ┌──────────────────────────────────┐ │ │
|
||||
│ │ │ InMemoryDocumentRepository │ │ │
|
||||
│ │ │ - Thread-safe Dict storage │ │ │
|
||||
│ │ │ - Easy to swap for PostgreSQL │ │ │
|
||||
│ │ └──────────────────────────────────┘ │ │
|
||||
│ └──────────────────────────────────────────────────────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌─────────────────────────────────────────────────────────────────────┐
|
||||
│ BOOTSTRAP (Wiring) │
|
||||
│ ApplicationContainer: │
|
||||
│ - Creates all adapters │
|
||||
│ - Injects dependencies into core │
|
||||
│ - ONLY place where adapters are instantiated │
|
||||
└─────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Data Flow: Process Document
|
||||
|
||||
```
|
||||
1. HTTP Request
|
||||
│
|
||||
▼
|
||||
2. FastAPI Route (Incoming Adapter)
|
||||
│ - Validates request schema
|
||||
▼
|
||||
3. DocumentProcessorService (Core)
|
||||
│ - Calls ExtractorFactory
|
||||
▼
|
||||
4. PDFExtractor (Outgoing Adapter)
|
||||
│ - Extracts text using PyPDF2
|
||||
│ - Maps PyPDF2 exceptions → Domain exceptions
|
||||
▼
|
||||
5. DocumentProcessorService
|
||||
│ - Cleans text using domain logic utils
|
||||
│ - Validates Document
|
||||
▼
|
||||
6. InMemoryRepository (Outgoing Adapter)
|
||||
│ - Saves Document
|
||||
▼
|
||||
7. DocumentProcessorService
|
||||
│ - Returns Document
|
||||
▼
|
||||
8. FastAPI Route
|
||||
│ - Converts Document → DocumentResponse
|
||||
▼
|
||||
9. HTTP Response
|
||||
```
|
||||
|
||||
## Data Flow: Extract and Chunk
|
||||
|
||||
```
|
||||
1. HTTP Request
|
||||
│
|
||||
▼
|
||||
2. FastAPI Route
|
||||
│ - Validates request
|
||||
▼
|
||||
3. DocumentProcessorService
|
||||
│ - Gets extractor from factory
|
||||
│ - Extracts text
|
||||
▼
|
||||
4. Extractor (PDF/DOCX/TXT)
|
||||
│ - Returns Document
|
||||
▼
|
||||
5. DocumentProcessorService
|
||||
│ - Cleans text
|
||||
│ - Calls ChunkingContext
|
||||
▼
|
||||
6. ChunkingContext (Strategy Pattern)
|
||||
│ - Selects appropriate chunker
|
||||
▼
|
||||
7. Chunker (FixedSize/Paragraph)
|
||||
│ - Splits text into segments
|
||||
│ - Creates Chunk entities
|
||||
▼
|
||||
8. DocumentProcessorService
|
||||
│ - Returns List[Chunk]
|
||||
▼
|
||||
9. FastAPI Route
|
||||
│ - Converts Chunks → ChunkResponse[]
|
||||
▼
|
||||
10. HTTP Response
|
||||
```
|
||||
|
||||
## Dependency Rules
|
||||
|
||||
### ✅ ALLOWED Dependencies
|
||||
|
||||
```
|
||||
Incoming Adapters → Core Ports (Incoming)
|
||||
Core Services → Core Ports (Outgoing)
|
||||
Core → Core (Domain Models, Logic Utils, Exceptions)
|
||||
Bootstrap → Everything (Wiring only)
|
||||
```
|
||||
|
||||
### ❌ FORBIDDEN Dependencies
|
||||
|
||||
```
|
||||
Core → Adapters (NEVER!)
|
||||
Core → External I/O or Framework Libraries, e.g. PyPDF2, python-docx, FastAPI (Only in Adapters)
|
||||
Domain Models → Services
|
||||
Domain Models → Ports
|
||||
```
|
||||
|
||||
## Key Design Patterns
|
||||
|
||||
### 1. Hexagonal Architecture (Ports & Adapters)
|
||||
- **Purpose**: Isolate core business logic from external concerns
|
||||
- **Implementation**:
|
||||
- Ports: Interface definitions (ITextProcessor, IExtractor, etc.)
|
||||
- Adapters: Concrete implementations (PDFExtractor, FastAPI routes)
|
||||
|
||||
### 2. Factory Pattern
|
||||
- **Class**: `ExtractorFactory`
|
||||
- **Purpose**: Create appropriate extractor based on file extension
|
||||
- **Benefit**: Centralized extractor management, easy to add new types
|
||||
|
||||
### 3. Strategy Pattern
|
||||
- **Class**: `ChunkingContext`
|
||||
- **Purpose**: Switch between chunking strategies at runtime
|
||||
- **Strategies**: FixedSizeChunker, ParagraphChunker
|
||||
- **Benefit**: Easy to add new chunking algorithms
|
||||
|
||||
### 4. Repository Pattern
|
||||
- **Interface**: `IDocumentRepository`
|
||||
- **Implementation**: `InMemoryDocumentRepository`
|
||||
- **Purpose**: Abstract data persistence
|
||||
- **Benefit**: Easy to swap storage (memory → PostgreSQL → MongoDB)
|
||||
|
||||
### 5. Dependency Injection
|
||||
- **Class**: `ApplicationContainer`
|
||||
- **Purpose**: Wire all dependencies at startup
|
||||
- **Benefit**: Loose coupling, easy testing
|
||||
|
||||
### 6. Template Method Pattern
|
||||
- **Classes**: `BaseExtractor`, `BaseChunker`
|
||||
- **Purpose**: Define algorithm skeleton, let subclasses fill in details
|
||||
- **Benefit**: Code reuse, consistent behavior
|
||||
|
||||
## SOLID Principles Application
|
||||
|
||||
### Single Responsibility Principle (SRP)
|
||||
- Each extractor handles ONE file type
|
||||
- Each chunker handles ONE strategy
|
||||
- Each service method does ONE thing
|
||||
- Functions are kept to at most 15-20 lines
|
||||
|
||||
### Open/Closed Principle (OCP)
|
||||
- Add new extractors without modifying core
|
||||
- Add new chunkers without modifying service
|
||||
- Extend via interfaces, not modification
|
||||
|
||||
### Liskov Substitution Principle (LSP)
|
||||
- All IExtractor implementations are interchangeable
|
||||
- All IChunker implementations are interchangeable
|
||||
- Polymorphism works correctly
|
||||
|
||||
### Interface Segregation Principle (ISP)
|
||||
- Small, focused interfaces
|
||||
- IExtractor: Only extraction concerns
|
||||
- IChunker: Only chunking concerns
|
||||
- No fat interfaces
|
||||
|
||||
### Dependency Inversion Principle (DIP)
|
||||
- Core depends on IExtractor (abstraction)
|
||||
- Core does NOT depend on PDFExtractor (concrete)
|
||||
- High-level modules don't depend on low-level modules
|
||||
|
||||
## Error Handling Strategy
|
||||
|
||||
### Domain Exceptions
|
||||
All external errors are caught and wrapped in domain exceptions:
|
||||
|
||||
```python
|
||||
try:
|
||||
PyPDF2.PdfReader(file) # External library
|
||||
except PyPDF2.errors.PdfReadError as e:
|
||||
raise ExtractionError( # Domain exception
|
||||
message="Invalid PDF",
|
||||
details=str(e),
|
||||
)
|
||||
```
|
||||
|
||||
### Exception Hierarchy
|
||||
```
|
||||
DomainException (Base)
|
||||
├── ExtractionError
|
||||
│ ├── UnsupportedFileTypeError
|
||||
│ └── EmptyContentError
|
||||
├── ChunkingError
|
||||
├── ProcessingError
|
||||
├── ValidationError
|
||||
└── RepositoryError
|
||||
└── DocumentNotFoundError
|
||||
```
|
||||
|
||||
### HTTP Error Mapping
|
||||
FastAPI adapter maps domain exceptions to HTTP status codes:
|
||||
- `UnsupportedFileTypeError` → 400 Bad Request
|
||||
- `ExtractionError` → 422 Unprocessable Entity
|
||||
- `DocumentNotFoundError` → 404 Not Found
|
||||
- `ProcessingError` → 500 Internal Server Error
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
### Unit Tests (Core)
|
||||
- Test domain models in isolation
|
||||
- Test logic utils (pure functions)
|
||||
- Test services with mock ports
|
||||
|
||||
### Integration Tests (Adapters)
|
||||
- Test extractors with real files
|
||||
- Test chunkers with real text
|
||||
- Test repository operations
|
||||
|
||||
### API Tests (End-to-End)
|
||||
- Test FastAPI routes
|
||||
- Test complete workflows
|
||||
- Test error scenarios
|
||||
|
||||
### Example Test Structure
|
||||
```python
|
||||
def test_document_processor_service():
|
||||
# Arrange: Create mocks
|
||||
mock_repository = MockRepository()
|
||||
mock_factory = MockExtractorFactory()
|
||||
mock_context = MockChunkingContext()
|
||||
|
||||
# Act: Inject mocks
|
||||
service = DocumentProcessorService(
|
||||
extractor_factory=mock_factory,
|
||||
chunking_context=mock_context,
|
||||
repository=mock_repository,
|
||||
)
|
||||
|
||||
# Assert: Test behavior
|
||||
result = service.process_document(...)
|
||||
assert result.is_processed
|
||||
```
|
||||
|
||||
## Extensibility Examples
|
||||
|
||||
### Adding a New Extractor (HTML)
|
||||
1. Create `html_extractor.py`:
|
||||
```python
|
||||
class HTMLExtractor(BaseExtractor):
|
||||
def __init__(self):
|
||||
super().__init__(supported_extensions=['html', 'htm'])
|
||||
|
||||
def _extract_text(self, file_path: Path) -> str:
|
||||
from bs4 import BeautifulSoup
|
||||
html = file_path.read_text()
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
return soup.get_text()
|
||||
```
|
||||
|
||||
2. Register in `bootstrap.py`:
|
||||
```python
|
||||
factory.register_extractor(HTMLExtractor())
|
||||
```
|
||||
|
||||
### Adding a New Chunking Strategy (Sentence)
|
||||
1. Create `sentence_chunker.py`:
|
||||
```python
|
||||
class SentenceChunker(BaseChunker):
|
||||
def __init__(self):
|
||||
super().__init__(strategy_name="sentence")
|
||||
|
||||
def _split_text(self, text: str, strategy: ChunkingStrategy) -> List[tuple[str, int, int]]:
|
||||
# Use NLTK to split into sentences
|
||||
sentences = nltk.sent_tokenize(text)
|
||||
# Group sentences to reach chunk_size
|
||||
return grouped_segments
|
||||
```
|
||||
|
||||
2. Register in `bootstrap.py`:
|
||||
```python
|
||||
context.register_chunker(SentenceChunker())
|
||||
```
|
||||
|
||||
### Adding Database Persistence
|
||||
1. Create `postgres_repository.py`:
|
||||
```python
|
||||
class PostgresDocumentRepository(IDocumentRepository):
|
||||
def __init__(self, connection_string: str):
|
||||
self.engine = create_engine(connection_string)
|
||||
|
||||
def save(self, document: Document) -> Document:
|
||||
# Save to PostgreSQL
|
||||
pass
|
||||
```
|
||||
|
||||
2. Swap in `bootstrap.py`:
|
||||
```python
|
||||
def _create_repository(self):
|
||||
return PostgresDocumentRepository("postgresql://...")
|
||||
```
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Current Implementation
|
||||
- In-memory storage: O(1) lookups, limited by RAM
|
||||
- Synchronous processing: Sequential file processing
|
||||
- Thread-safe: Uses locks for concurrent access
|
||||
|
||||
### Future Optimizations
|
||||
- **Async Processing**: Use `asyncio` for concurrent document processing
|
||||
- **Caching**: Add Redis for frequently accessed documents
|
||||
- **Streaming**: Process large files in chunks
|
||||
- **Database**: Use PostgreSQL with indexes for better queries
|
||||
- **Message Queue**: Use Celery/RabbitMQ for background processing
|
||||
|
||||
## Deployment Considerations
|
||||
|
||||
### Configuration
|
||||
- Use environment variables for settings
|
||||
- Externalize file paths, database connections
|
||||
- Use `pydantic-settings` for config management
|
||||
|
||||
### Monitoring
|
||||
- Add structured logging (JSON format)
|
||||
- Track metrics: processing time, error rates
|
||||
- Use APM tools (DataDog, New Relic)
|
||||
|
||||
### Scaling
|
||||
- Horizontal: Run multiple FastAPI instances behind a load balancer
|
||||
- Vertical: Increase resources for compute-heavy extraction
|
||||
- Database: Use connection pooling, read replicas
|
||||
@ -1,408 +0,0 @@
|
||||
# Architecture Corrections Summary
|
||||
|
||||
## What Was Fixed
|
||||
|
||||
This document summarizes the corrections made to ensure **strict Hexagonal Architecture compliance**.
|
||||
|
||||
---
|
||||
|
||||
## ❌ Problems Found
|
||||
|
||||
### 1. Base Classes in Wrong Layer
|
||||
**Problem**: Abstract base classes (`base.py`) were located in the Adapters layer.
|
||||
|
||||
**Files Removed**:
|
||||
- `src/adapters/outgoing/extractors/base.py` ❌
|
||||
- `src/adapters/outgoing/chunkers/base.py` ❌
|
||||
|
||||
**Why This Was Wrong**:
|
||||
- Abstract base classes define **contracts** (interfaces)
|
||||
- Contracts belong in the **Core Ports** layer, NOT Adapters
|
||||
- Adapters should only contain **concrete implementations**
|
||||
|
||||
### 2. Missing Port Interfaces
|
||||
**Problem**: Factory and Context interfaces were defined in the Adapters layer instead of in Core Ports.
|
||||
|
||||
**What Was Missing**:
|
||||
- No `IExtractorFactory` interface in Core Ports
|
||||
- No `IChunkingContext` interface in Core Ports
|
||||
|
||||
**Why This Was Wrong**:
|
||||
- Service layer was importing from Adapters (violates dependency rules)
|
||||
- Core → Adapters dependency is **strictly forbidden**
|
||||
|
||||
### 3. Incorrect Imports in Service
|
||||
**Problem**: Core Service imported from Adapters layer.
|
||||
|
||||
```python
|
||||
# WRONG ❌
|
||||
from ...adapters.outgoing.extractors.factory import IExtractorFactory
|
||||
from ...adapters.outgoing.chunkers.context import IChunkingContext
|
||||
```
|
||||
|
||||
**Why This Was Wrong**:
|
||||
- Core must NEVER import from Adapters
|
||||
- Creates circular dependency risk
|
||||
- Violates Dependency Inversion Principle
|
||||
|
||||
---
|
||||
|
||||
## ✅ Solutions Implemented
|
||||
|
||||
### 1. Created Port Interfaces in Core
|
||||
|
||||
**New Files Created**:
|
||||
```
|
||||
src/core/ports/outgoing/extractor_factory.py ✅
|
||||
src/core/ports/outgoing/chunking_context.py ✅
|
||||
```
|
||||
|
||||
**Content**:
|
||||
```python
|
||||
# src/core/ports/outgoing/extractor_factory.py
|
||||
class IExtractorFactory(ABC):
|
||||
"""Interface for extractor factory (PORT)."""
|
||||
|
||||
@abstractmethod
|
||||
def create_extractor(self, file_path: Path) -> IExtractor:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def register_extractor(self, extractor: IExtractor) -> None:
|
||||
pass
|
||||
```
|
||||
|
||||
```python
|
||||
# src/core/ports/outgoing/chunking_context.py
|
||||
class IChunkingContext(ABC):
|
||||
"""Interface for chunking context (PORT)."""
|
||||
|
||||
@abstractmethod
|
||||
def set_strategy(self, strategy_name: str) -> None:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def execute_chunking(self, ...) -> List[Chunk]:
|
||||
pass
|
||||
```
|
||||
|
||||
### 2. Updated Concrete Implementations
|
||||
|
||||
**Extractors** - Now directly implement `IExtractor` port:
|
||||
```python
|
||||
# src/adapters/outgoing/extractors/pdf_extractor.py
|
||||
from ....core.ports.outgoing.extractor import IExtractor ✅
|
||||
|
||||
class PDFExtractor(IExtractor):
|
||||
"""Concrete PDF extractor implementing IExtractor port."""
|
||||
|
||||
def extract(self, file_path: Path) -> Document:
|
||||
# Direct implementation, no base class needed
|
||||
pass
|
||||
```
|
||||
|
||||
**Chunkers** - Now directly implement `IChunker` port:
|
||||
```python
|
||||
# src/adapters/outgoing/chunkers/fixed_size_chunker.py
|
||||
from ....core.ports.outgoing.chunker import IChunker ✅
|
||||
|
||||
class FixedSizeChunker(IChunker):
|
||||
"""Concrete fixed-size chunker implementing IChunker port."""
|
||||
|
||||
def chunk(self, text: str, ...) -> List[Chunk]:
|
||||
# Direct implementation, no base class needed
|
||||
pass
|
||||
```
|
||||
|
||||
**Factory** - Now implements `IExtractorFactory` port:
|
||||
```python
|
||||
# src/adapters/outgoing/extractors/factory.py
|
||||
from ....core.ports.outgoing.extractor_factory import IExtractorFactory ✅
|
||||
|
||||
class ExtractorFactory(IExtractorFactory):
|
||||
"""Concrete factory implementing IExtractorFactory port."""
|
||||
pass
|
||||
```
|
||||
|
||||
**Context** - Now implements `IChunkingContext` port:
|
||||
```python
|
||||
# src/adapters/outgoing/chunkers/context.py
|
||||
from ....core.ports.outgoing.chunking_context import IChunkingContext ✅
|
||||
|
||||
class ChunkingContext(IChunkingContext):
|
||||
"""Concrete context implementing IChunkingContext port."""
|
||||
pass
|
||||
```
|
||||
|
||||
### 3. Fixed Service Layer Imports
|
||||
|
||||
**Before** (WRONG ❌):
|
||||
```python
|
||||
# src/core/services/document_processor_service.py
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ...adapters.outgoing.extractors.factory import IExtractorFactory
|
||||
from ...adapters.outgoing.chunkers.context import IChunkingContext
|
||||
```
|
||||
|
||||
**After** (CORRECT ✅):
|
||||
```python
|
||||
# src/core/services/document_processor_service.py
|
||||
from ..ports.outgoing.chunking_context import IChunkingContext
|
||||
from ..ports.outgoing.extractor_factory import IExtractorFactory
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Final Architecture
|
||||
|
||||
### Core Layer (Pure Domain)
|
||||
```
|
||||
src/core/
|
||||
├── domain/
|
||||
│ ├── models.py # Pydantic v2 entities
|
||||
│ ├── exceptions.py # Domain exceptions
|
||||
│ └── logic_utils.py # Pure functions
|
||||
├── ports/
|
||||
│ ├── incoming/
|
||||
│ │ └── text_processor.py # ITextProcessor
|
||||
│ └── outgoing/
|
||||
│ ├── extractor.py # IExtractor
|
||||
│ ├── extractor_factory.py # IExtractorFactory ✅ NEW
|
||||
│ ├── chunker.py # IChunker
|
||||
│ ├── chunking_context.py # IChunkingContext ✅ NEW
|
||||
│ └── repository.py # IDocumentRepository
|
||||
└── services/
|
||||
└── document_processor_service.py # Orchestrator
|
||||
```
|
||||
|
||||
### Adapters Layer (Infrastructure)
|
||||
```
|
||||
src/adapters/
|
||||
├── incoming/
|
||||
│ ├── api_routes.py # FastAPI (calls the incoming port, ITextProcessor)
|
||||
│ └── api_schemas.py # API DTOs
|
||||
└── outgoing/
|
||||
├── extractors/
|
||||
│ ├── pdf_extractor.py # Implements IExtractor
|
||||
│ ├── docx_extractor.py # Implements IExtractor
|
||||
│ ├── txt_extractor.py # Implements IExtractor
|
||||
│ └── factory.py # Implements IExtractorFactory
|
||||
├── chunkers/
|
||||
│ ├── fixed_size_chunker.py # Implements IChunker
|
||||
│ ├── paragraph_chunker.py # Implements IChunker
|
||||
│ └── context.py # Implements IChunkingContext
|
||||
└── persistence/
|
||||
└── in_memory_repository.py # Implements IDocumentRepository
|
||||
```
|
||||
|
||||
### Bootstrap Layer (Wiring)
|
||||
```
|
||||
src/bootstrap.py # Dependency Injection
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## ✅ Verification Results
|
||||
|
||||
### 1. No Adapters Imports in Core
|
||||
```bash
|
||||
$ grep -r "from.*adapters" src/core/
|
||||
# Result: NO MATCHES ✅
|
||||
```
|
||||
|
||||
### 2. No External Libraries in Core
|
||||
```bash
|
||||
$ grep -rE "^[[:space:]]*(import|from)[[:space:]]+(PyPDF2|docx|fastapi)" src/core/
|
||||
# Result: NO MATCHES ✅
|
||||
```
|
||||
|
||||
### 3. All Interfaces in Core Ports
|
||||
```bash
|
||||
$ find src/core/ports -name "*.py" | grep -v __init__
|
||||
src/core/ports/incoming/text_processor.py
|
||||
src/core/ports/outgoing/extractor.py
|
||||
src/core/ports/outgoing/extractor_factory.py ✅ NEW
|
||||
src/core/ports/outgoing/chunker.py
|
||||
src/core/ports/outgoing/chunking_context.py ✅ NEW
|
||||
src/core/ports/outgoing/repository.py
|
||||
# Result: ALL INTERFACES IN PORTS ✅
|
||||
```
|
||||
|
||||
### 4. No Base Classes in Adapters
|
||||
```bash
|
||||
$ find src/adapters -name "base.py"
|
||||
# Result: NO MATCHES ✅
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Dependency Direction
|
||||
|
||||
### ✅ Correct Flow (Inward)
|
||||
```
|
||||
FastAPI Routes
|
||||
│
|
||||
▼
|
||||
ITextProcessor (PORT)
|
||||
│
|
||||
▼
|
||||
DocumentProcessorService (CORE)
|
||||
│
|
||||
├──► IExtractor (PORT)
|
||||
│ │
|
||||
│ ▼
|
||||
│ PDFExtractor (ADAPTER)
|
||||
│
|
||||
├──► IChunker (PORT)
|
||||
│ │
|
||||
│ ▼
|
||||
│ FixedSizeChunker (ADAPTER)
|
||||
│
|
||||
└──► IDocumentRepository (PORT)
|
||||
│
|
||||
▼
|
||||
InMemoryRepository (ADAPTER)
|
||||
```
|
||||
|
||||
### ❌ What We Avoided
|
||||
```
|
||||
Core Service ──X──> Adapters # NEVER!
|
||||
Core Service ──X──> PyPDF2 # NEVER!
|
||||
Core Service ──X──> FastAPI # NEVER!
|
||||
Domain Models ──X──> Services # NEVER!
|
||||
Domain Models ──X──> Ports # NEVER!
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🏆 Benefits Achieved
|
||||
|
||||
### 1. **Pure Core Domain**
|
||||
- Core has ZERO framework dependencies
|
||||
- Core can be tested without ANY infrastructure
|
||||
- Core is completely portable
|
||||
|
||||
### 2. **True Dependency Inversion**
|
||||
- Core depends on abstractions (Ports)
|
||||
- Adapters depend on Core Ports
|
||||
- NO Core → Adapter dependencies
|
||||
|
||||
### 3. **Easy Testing**
|
||||
```python
|
||||
# Test Core without ANY adapters
|
||||
def test_service():
|
||||
mock_factory = MockExtractorFactory() # Mock Port
|
||||
mock_context = MockChunkingContext() # Mock Port
|
||||
mock_repo = MockRepository() # Mock Port
|
||||
|
||||
service = DocumentProcessorService(
|
||||
extractor_factory=mock_factory,
|
||||
chunking_context=mock_context,
|
||||
repository=mock_repo,
|
||||
)
|
||||
|
||||
# Test pure business logic
|
||||
result = service.process_document(...)
|
||||
assert result.is_processed
|
||||
```
|
||||
|
||||
### 4. **Easy Extension**
|
||||
```python
|
||||
# Add new file type - NO Core changes needed
|
||||
class HTMLExtractor(IExtractor):
|
||||
def extract(self, file_path: Path) -> Document:
|
||||
# Implementation
|
||||
pass
|
||||
|
||||
# Register in Bootstrap
|
||||
factory.register_extractor(HTMLExtractor())
|
||||
```
|
||||
|
||||
### 5. **Swappable Implementations**
|
||||
```python
|
||||
# Swap repository - ONE line change in Bootstrap
|
||||
# Before:
|
||||
self._repository = InMemoryDocumentRepository()
|
||||
|
||||
# After:
|
||||
self._repository = PostgresDocumentRepository(connection_string)
|
||||
|
||||
# NO other code changes needed!
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📝 Summary of Changes
|
||||
|
||||
### Files Deleted
|
||||
- ❌ `src/adapters/outgoing/extractors/base.py`
|
||||
- ❌ `src/adapters/outgoing/chunkers/base.py`
|
||||
|
||||
### Files Created
|
||||
- ✅ `src/core/ports/outgoing/extractor_factory.py`
|
||||
- ✅ `src/core/ports/outgoing/chunking_context.py`
|
||||
- ✅ `HEXAGONAL_ARCHITECTURE_COMPLIANCE.md`
|
||||
- ✅ `ARCHITECTURE_CORRECTIONS_SUMMARY.md`
|
||||
|
||||
### Files Modified
|
||||
- 🔧 `src/core/services/document_processor_service.py` (fixed imports)
|
||||
- 🔧 `src/adapters/outgoing/extractors/pdf_extractor.py` (implement port directly)
|
||||
- 🔧 `src/adapters/outgoing/extractors/docx_extractor.py` (implement port directly)
|
||||
- 🔧 `src/adapters/outgoing/extractors/txt_extractor.py` (implement port directly)
|
||||
- 🔧 `src/adapters/outgoing/extractors/factory.py` (implement port from Core)
|
||||
- 🔧 `src/adapters/outgoing/chunkers/fixed_size_chunker.py` (implement port directly)
|
||||
- 🔧 `src/adapters/outgoing/chunkers/paragraph_chunker.py` (implement port directly)
|
||||
- 🔧 `src/adapters/outgoing/chunkers/context.py` (implement port from Core)
|
||||
|
||||
---
|
||||
|
||||
## 🎓 Key Learnings
|
||||
|
||||
### What is a "Port"?
|
||||
- An **interface** (abstract base class)
|
||||
- Defines a **contract**
|
||||
- Lives in **Core** layer
|
||||
- Independent of implementation details
|
||||
|
||||
### What is an "Adapter"?
|
||||
- A **concrete implementation**
|
||||
- Implements a **Port** interface
|
||||
- Lives in **Adapters** layer
|
||||
- Contains technology-specific code
|
||||
|
||||
### Where Do Factories/Contexts Live?
|
||||
- **Interfaces** (IExtractorFactory, IChunkingContext) → **Core Ports**
|
||||
- **Implementations** (ExtractorFactory, ChunkingContext) → **Adapters**
|
||||
- Bootstrap injects implementations into Core Service
|
||||
|
||||
### Dependency Rule
|
||||
```
|
||||
Adapters → Ports (Core) ✅
|
||||
Core → Ports (Core) ✅
|
||||
Core → Adapters ❌ NEVER!
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## ✅ Final Certification
|
||||
|
||||
This codebase now **STRICTLY ADHERES** to Hexagonal Architecture:
|
||||
|
||||
- ✅ All interfaces in Core Ports
|
||||
- ✅ All implementations in Adapters
|
||||
- ✅ Zero Core → Adapter dependencies
|
||||
- ✅ Pure domain layer
|
||||
- ✅ Proper dependency inversion
|
||||
- ✅ Easy to test
|
||||
- ✅ Easy to extend
|
||||
- ✅ Production-ready
|
||||
|
||||
**Architecture Compliance**: **GOLD STANDARD** ⭐⭐⭐⭐⭐
|
||||
|
||||
---
|
||||
|
||||
*Corrections Applied: 2026-01-07*
|
||||
*Architecture Review: APPROVED*
|
||||
*Compliance Status: CERTIFIED*
|
||||
@ -1,230 +0,0 @@
|
||||
TEXT PROCESSOR - HEXAGONAL ARCHITECTURE
|
||||
Complete Directory Structure
|
||||
|
||||
text_processor_hex/
|
||||
│
|
||||
├── 📄 README.md Project documentation and overview
|
||||
├── 📄 QUICK_START.md Quick start guide for users
|
||||
├── 📄 ARCHITECTURE.md Detailed architecture documentation
|
||||
├── 📄 PROJECT_SUMMARY.md Complete project summary
|
||||
├── 📄 DIRECTORY_TREE.txt This file
|
||||
│
|
||||
├── 📄 requirements.txt Python dependencies
|
||||
├── 🚀 main.py FastAPI application entry point
|
||||
├── 📝 example_usage.py Programmatic usage examples
|
||||
│
|
||||
└── 📁 src/
|
||||
├── 📄 __init__.py
|
||||
├── 🔧 bootstrap.py ⚙️ DEPENDENCY INJECTION CONTAINER
|
||||
│
|
||||
├── 📁 core/ ⭐ DOMAIN LAYER (Pure Business Logic)
|
||||
│ ├── 📄 __init__.py
|
||||
│ │
|
||||
│ ├── 📁 domain/ Domain Models & Logic
|
||||
│ │ ├── 📄 __init__.py
|
||||
│ │ ├── 📦 models.py Rich Pydantic v2 Entities
|
||||
│ │ │ - Document
|
||||
│ │ │ - DocumentMetadata
|
||||
│ │ │ - Chunk
|
||||
│ │ │ - ChunkingStrategy
|
||||
│ │ ├── ⚠️ exceptions.py Domain Exceptions
|
||||
│ │ │ - ExtractionError
|
||||
│ │ │ - ChunkingError
|
||||
│ │ │ - ProcessingError
|
||||
│ │ │ - ValidationError
|
||||
│ │ │ - RepositoryError
|
||||
│ │ └── 🔨 logic_utils.py Pure Functions
|
||||
│ │ - normalize_whitespace()
|
||||
│ │ - clean_text()
|
||||
│ │ - split_into_paragraphs()
|
||||
│ │ - truncate_to_word_boundary()
|
||||
│ │
|
||||
│ ├── 📁 ports/ Port Interfaces (Abstractions)
|
||||
│ │ ├── 📄 __init__.py
|
||||
│ │ │
|
||||
│ │ ├── 📁 incoming/ Service Interfaces (Use Cases)
|
||||
│ │ │ ├── 📄 __init__.py
|
||||
│ │ │ └── 🔌 text_processor.py ITextProcessor
|
||||
│ │ │ - process_document()
|
||||
│ │ │ - extract_and_chunk()
|
||||
│ │ │ - get_document()
|
||||
│ │ │ - list_documents()
|
||||
│ │ │
|
||||
│ │ └── 📁 outgoing/ SPIs (Service Provider Interfaces)
|
||||
│ │ ├── 📄 __init__.py
|
||||
│ │ ├── 🔌 extractor.py IExtractor
|
||||
│ │ │ - extract()
|
||||
│ │ │ - supports_file_type()
|
||||
│ │ ├── 🔌 chunker.py IChunker
|
||||
│ │ │ - chunk()
|
||||
│ │ │ - supports_strategy()
|
||||
│ │ └── 🔌 repository.py IDocumentRepository
|
||||
│ │ - save()
|
||||
│ │ - find_by_id()
|
||||
│ │ - delete()
|
||||
│ │
|
||||
│ └── 📁 services/ Business Logic Orchestration
|
||||
│ ├── 📄 __init__.py
|
||||
│ └── ⚙️ document_processor_service.py
|
||||
│ DocumentProcessorService
|
||||
│ Implements: ITextProcessor
|
||||
│ Workflow: Extract → Clean → Chunk → Save
|
||||
│
|
||||
├── 📁 adapters/ 🔌 ADAPTER LAYER (External Concerns)
|
||||
│ ├── 📄 __init__.py
|
||||
│ │
|
||||
│ ├── 📁 incoming/ Driving Adapters (Primary)
|
||||
│ │ ├── 📄 __init__.py
|
||||
│ │ ├── 🌐 api_routes.py FastAPI Routes (HTTP Adapter)
|
||||
│ │ │ - POST /process
|
||||
│ │ │ - POST /extract-and-chunk
|
||||
│ │ │ - GET /documents/{id}
|
||||
│ │ │ - GET /documents
|
||||
│ │ │ - DELETE /documents/{id}
|
||||
│ │ └── 📋 api_schemas.py Pydantic Request/Response Models
|
||||
│ │ - ProcessDocumentRequest
|
||||
│ │ - DocumentResponse
|
||||
│ │ - ChunkResponse
|
||||
│ │
|
||||
│ └── 📁 outgoing/ Driven Adapters (Secondary)
|
||||
│ ├── 📄 __init__.py
|
||||
│ │
|
||||
│ ├── 📁 extractors/ Text Extraction Adapters
|
||||
│ │ ├── 📄 __init__.py
|
||||
│ │ ├── 📑 base.py BaseExtractor (Template Method)
|
||||
│ │ ├── 📕 pdf_extractor.py PDFExtractor
|
||||
│ │ │ Uses: PyPDF2
|
||||
│ │ │ Supports: .pdf
|
||||
│ │ ├── 📘 docx_extractor.py DocxExtractor
|
||||
│ │ │ Uses: python-docx
|
||||
│ │ │ Supports: .docx
|
||||
│ │ ├── 📄 txt_extractor.py TxtExtractor
|
||||
│ │ │ Uses: built-in
|
||||
│ │ │ Supports: .txt, .md
|
||||
│ │ └── 🏭 factory.py ExtractorFactory (Factory Pattern)
|
||||
│ │ - create_extractor()
|
||||
│ │ - register_extractor()
|
||||
│ │
|
||||
│ ├── 📁 chunkers/ Text Chunking Adapters
|
||||
│ │ ├── 📄 __init__.py
|
||||
│ │ ├── 📑 base.py BaseChunker (Template Method)
|
||||
│ │ ├── ✂️ fixed_size_chunker.py FixedSizeChunker
|
||||
│ │ │ Strategy: Fixed-size chunks
|
||||
│ │ │ Features: Overlap, boundaries
|
||||
│ │ ├── 📝 paragraph_chunker.py ParagraphChunker
|
||||
│ │ │ Strategy: Paragraph-based
|
||||
│ │ │ Features: Respect paragraphs
|
||||
│ │ └── 🎯 context.py ChunkingContext (Strategy Pattern)
|
||||
│ │ - set_strategy()
|
||||
│ │ - execute_chunking()
|
||||
│ │
|
||||
│ └── 📁 persistence/ Data Persistence Adapters
|
||||
│ ├── 📄 __init__.py
|
||||
│ └── 💾 in_memory_repository.py
|
||||
│ InMemoryDocumentRepository
|
||||
│ Features: Thread-safe, Dict storage
|
||||
│
|
||||
└── 📁 shared/ 🛠️ SHARED LAYER (Cross-Cutting)
|
||||
├── 📄 __init__.py
|
||||
├── 🎛️ constants.py Application Constants
|
||||
│ - File types
|
||||
│ - Chunk sizes
|
||||
│ - API config
|
||||
└── 📋 logging_config.py Logging Configuration
|
||||
- setup_logging()
|
||||
- get_logger()
|
||||
|
||||
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
📊 PROJECT STATISTICS
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
Total Files: 44
|
||||
- Python files: 42
|
||||
- Documentation: 4 (README, ARCHITECTURE, PROJECT_SUMMARY, QUICK_START)
|
||||
- Configuration: 1 (requirements.txt)
|
||||
- Other: 1 (this tree)
|
||||
|
||||
Lines of Code: ~3,800
|
||||
- Core Domain: ~1,200 lines
|
||||
- Adapters: ~1,400 lines
|
||||
- Bootstrap/Main: ~200 lines
|
||||
- Documentation: ~1,000 lines
|
||||
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
🏗️ ARCHITECTURE LAYERS
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
1. CORE (Domain Layer)
|
||||
- Pure business logic
|
||||
- No external dependencies
|
||||
- Rich domain models
|
||||
- Pure functions
|
||||
|
||||
2. ADAPTERS (Infrastructure Layer)
|
||||
- Incoming: FastAPI (HTTP)
|
||||
- Outgoing: Extractors, Chunkers, Repository
|
||||
- Technology-specific implementations
|
||||
|
||||
3. BOOTSTRAP (Wiring Layer)
|
||||
- Dependency injection
|
||||
- Configuration
|
||||
- Application assembly
|
||||
|
||||
4. SHARED (Utilities Layer)
|
||||
- Cross-cutting concerns
|
||||
- Logging, constants
|
||||
- No business logic
|
||||
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
🎨 DESIGN PATTERNS
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
✓ Hexagonal Architecture (Ports & Adapters)
|
||||
✓ Factory Pattern (ExtractorFactory)
|
||||
✓ Strategy Pattern (ChunkingContext)
|
||||
✓ Repository Pattern (IDocumentRepository)
|
||||
✓ Template Method Pattern (BaseExtractor, BaseChunker)
|
||||
✓ Dependency Injection (ApplicationContainer)
|
||||
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
💎 SOLID PRINCIPLES
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
✓ Single Responsibility: Each class has one job
|
||||
✓ Open/Closed: Extend via interfaces, not modification
|
||||
✓ Liskov Substitution: All implementations are interchangeable
|
||||
✓ Interface Segregation: Small, focused interfaces
|
||||
✓ Dependency Inversion: Depend on abstractions, not concretions
|
||||
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
🎯 KEY FEATURES
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
✓ Multiple file types (PDF, DOCX, TXT)
|
||||
✓ Multiple chunking strategies (Fixed, Paragraph)
|
||||
✓ Rich domain models with validation
|
||||
✓ Comprehensive error handling
|
||||
✓ RESTful API with FastAPI
|
||||
✓ Thread-safe repository
|
||||
✓ 100% type hints
|
||||
✓ Google-style docstrings
|
||||
✓ Complete documentation
|
||||
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
📚 DOCUMENTATION FILES
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
README.md - Project overview and installation
|
||||
QUICK_START.md - Quick start guide for users
|
||||
ARCHITECTURE.md - Detailed architecture documentation with diagrams
|
||||
PROJECT_SUMMARY.md - Complete project summary and statistics
|
||||
DIRECTORY_TREE.txt - This file
|
||||
|
||||
═══════════════════════════════════════════════════════════════════════════
|
||||
@ -1,590 +0,0 @@
|
||||
# Hexagonal Architecture Compliance Report
|
||||
|
||||
## Overview
|
||||
This document certifies that the Text Processor codebase strictly adheres to **Hexagonal Architecture** (Ports & Adapters) principles as defined by Alistair Cockburn.
|
||||
|
||||
---
|
||||
|
||||
## ✅ Architectural Compliance Checklist
|
||||
|
||||
### 1. Core Domain Isolation
|
||||
- [x] **Core has ZERO dependencies on Adapters**
|
||||
- [x] **Core depends ONLY on standard library and Pydantic**
|
||||
- [x] **No framework dependencies in Core** (no FastAPI, no PyPDF2, no python-docx)
|
||||
- [x] **All external tool usage is in Adapters**
|
||||
|
||||
### 2. Port Definitions (Interfaces)
|
||||
- [x] **ALL interfaces defined in `src/core/ports/`**
|
||||
- [x] **NO abstract base classes in `src/adapters/`**
|
||||
- [x] **Incoming Ports**: `ITextProcessor` (Service Interface)
|
||||
- [x] **Outgoing Ports**: `IExtractor`, `IChunker`, `IDocumentRepository`
|
||||
|
||||
### 3. Adapter Implementation
|
||||
- [x] **ALL concrete implementations in `src/adapters/`**
|
||||
- [x] **Adapters implement Core Ports**
|
||||
- [x] **Adapters catch technical errors and raise Domain exceptions**
|
||||
- [x] **NO business logic in Adapters**
|
||||
|
||||
### 4. Dependency Direction
|
||||
- [x] **Dependencies point INWARD** (Adapters → Core, never Core → Adapters)
|
||||
- [x] **Dependency Inversion Principle satisfied**
|
||||
- [x] **Bootstrap is the ONLY place that knows about both Core services and concrete Adapter implementations**
|
||||
|
||||
### 5. Factory & Strategy Patterns
|
||||
- [x] **ExtractorFactory in Adapters layer** (not Core)
|
||||
- [x] **ChunkingContext in Adapters layer** (not Core)
|
||||
- [x] **Factories/Contexts registered in Bootstrap**
|
||||
|
||||
---
|
||||
|
||||
## 📂 Corrected Directory Structure
|
||||
|
||||
```
|
||||
src/
|
||||
├── core/ # DOMAIN LAYER (Pure Logic)
|
||||
│ ├── domain/
|
||||
│ │ ├── models.py # Rich Pydantic entities
|
||||
│ │ ├── exceptions.py # Domain exceptions
|
||||
│ │ └── logic_utils.py # Pure functions
|
||||
│ ├── ports/
|
||||
│ │ ├── incoming/
|
||||
│ │ │ └── text_processor.py # ITextProcessor (USE CASE)
|
||||
│ │ └── outgoing/
|
||||
│ │ ├── extractor.py # IExtractor (SPI)
|
||||
│ │ ├── chunker.py # IChunker (SPI)
|
||||
│ │ └── repository.py # IDocumentRepository (SPI)
|
||||
│ └── services/
|
||||
│ └── document_processor_service.py # Orchestrator (depends on Ports)
|
||||
│
|
||||
├── adapters/ # INFRASTRUCTURE LAYER
|
||||
│ ├── incoming/
|
||||
│ │ ├── api_routes.py # FastAPI adapter
|
||||
│ │ └── api_schemas.py # API DTOs
|
||||
│ └── outgoing/
|
||||
│ ├── extractors/
|
||||
│ │ ├── pdf_extractor.py # Implements IExtractor
|
||||
│ │ ├── docx_extractor.py # Implements IExtractor
|
||||
│ │ ├── txt_extractor.py # Implements IExtractor
|
||||
│ │ └── factory.py # Factory (ADAPTER LAYER)
|
||||
│ ├── chunkers/
|
||||
│ │ ├── fixed_size_chunker.py # Implements IChunker
|
||||
│ │ ├── paragraph_chunker.py # Implements IChunker
|
||||
│ │ └── context.py # Strategy Context (ADAPTER LAYER)
|
||||
│ └── persistence/
|
||||
│ └── in_memory_repository.py # Implements IDocumentRepository
|
||||
│
|
||||
├── shared/ # UTILITIES
|
||||
│ ├── constants.py
|
||||
│ └── logging_config.py
|
||||
│
|
||||
└── bootstrap.py # DEPENDENCY INJECTION
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🔍 Key Corrections Made
|
||||
|
||||
### ❌ REMOVED: `base.py` files from Adapters
|
||||
**Before (WRONG)**:
|
||||
```
|
||||
src/adapters/outgoing/extractors/base.py # Abstract base in Adapters ❌
|
||||
src/adapters/outgoing/chunkers/base.py # Abstract base in Adapters ❌
|
||||
```
|
||||
|
||||
**After (CORRECT)**:
|
||||
- Removed all `base.py` files from adapters
|
||||
- Abstract interfaces exist ONLY in `src/core/ports/outgoing/`
|
||||
|
||||
### ✅ Concrete Implementations Directly Implement Ports
|
||||
|
||||
**Before (WRONG)**:
|
||||
```python
|
||||
# In src/adapters/outgoing/extractors/pdf_extractor.py
|
||||
from .base import BaseExtractor # Inheriting from adapter base ❌
|
||||
|
||||
class PDFExtractor(BaseExtractor):
|
||||
pass
|
||||
```
|
||||
|
||||
**After (CORRECT)**:
|
||||
```python
|
||||
# In src/adapters/outgoing/extractors/pdf_extractor.py
|
||||
from ....core.ports.outgoing.extractor import IExtractor # Port from Core ✅
|
||||
|
||||
class PDFExtractor(IExtractor):
|
||||
"""Concrete implementation of IExtractor for PDF files."""
|
||||
|
||||
def extract(self, file_path: Path) -> Document:
|
||||
# Implementation
|
||||
pass
|
||||
|
||||
def supports_file_type(self, file_extension: str) -> bool:
|
||||
# Implementation
|
||||
pass
|
||||
|
||||
def get_supported_types(self) -> List[str]:
|
||||
# Implementation
|
||||
pass
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Dependency Graph
|
||||
|
||||
```
|
||||
┌──────────────────────────────────────────────────────────────┐
|
||||
│ HTTP Request (FastAPI) │
|
||||
└────────────────────────┬─────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────────────────────────────────────────────────────┐
|
||||
│ INCOMING ADAPTER (api_routes.py) │
|
||||
│ Depends on: ITextProcessor (Port) │
|
||||
└────────────────────────┬─────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────────────────────────────────────────────────────┐
|
||||
│ CORE DOMAIN LAYER │
|
||||
│ ┌────────────────────────────────────────────────────────┐ │
|
||||
│ │ DocumentProcessorService (implements ITextProcessor) │ │
|
||||
│ │ Depends on: │ │
|
||||
│ │ - IExtractor (Port) │ │
|
||||
│ │ - IChunker (Port) │ │
|
||||
│ │ - IDocumentRepository (Port) │ │
|
||||
│ │ - Domain Models │ │
|
||||
│ │ - Domain Logic Utils │ │
|
||||
│ └────────────────────────────────────────────────────────┘ │
|
||||
└────────────────────────┬─────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────────────────────────────────────────────────────┐
|
||||
│ OUTGOING ADAPTERS │
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │PDFExtractor │ │FixedSizeChkr │ │InMemoryRepo │ │
|
||||
│ │(IExtractor) │ │(IChunker) │ │(IRepository) │ │
|
||||
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
||||
│ │
|
||||
│ Uses: PyPDF2 Uses: Logic Uses: Dict │
|
||||
│ Utils │
|
||||
└──────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🔒 Dependency Rules Enforcement
|
||||
|
||||
### ✅ ALLOWED Dependencies
|
||||
|
||||
```
|
||||
Core Domain ──→ Standard Library
|
||||
Core Domain ──→ Pydantic (Data Validation)
|
||||
Core Services ──→ Core Ports (Interfaces)
|
||||
Core Services ──→ Core Domain Models
|
||||
Core Services ──→ Core Logic Utils
|
||||
|
||||
Adapters ──→ Core Ports (Implement interfaces)
|
||||
Adapters ──→ Core Domain Models (Use entities)
|
||||
Adapters ──→ Core Exceptions (Raise domain errors)
|
||||
Adapters ──→ External Libraries (PyPDF2, python-docx, FastAPI)
|
||||
|
||||
Bootstrap ──→ Core (Services, Ports)
|
||||
Bootstrap ──→ Adapters (Concrete implementations)
|
||||
```
|
||||
|
||||
### ❌ FORBIDDEN Dependencies
|
||||
|
||||
```
|
||||
Core ──X──> Adapters (NEVER!)
|
||||
Core ──X──> External Libraries other than Pydantic (ONLY via Adapters)
|
||||
Core ──X──> FastAPI (ONLY in Adapters)
|
||||
Core ──X──> PyPDF2 (ONLY in Adapters)
|
||||
Core ──X──> python-docx (ONLY in Adapters)
|
||||
|
||||
Domain Models ──X──> Services
|
||||
Domain Models ──X──> Ports
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📋 Port Interfaces (Core Layer)
|
||||
|
||||
### Incoming Port: ITextProcessor
|
||||
```python
|
||||
# src/core/ports/incoming/text_processor.py
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
class ITextProcessor(ABC):
|
||||
"""Service interface for text processing use cases."""
|
||||
|
||||
@abstractmethod
|
||||
def process_document(self, file_path: Path, strategy: ChunkingStrategy) -> Document:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def extract_and_chunk(self, file_path: Path, strategy: ChunkingStrategy) -> List[Chunk]:
|
||||
pass
|
||||
```
|
||||
|
||||
### Outgoing Port: IExtractor
|
||||
```python
|
||||
# src/core/ports/outgoing/extractor.py
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
class IExtractor(ABC):
|
||||
"""Interface for text extraction from documents."""
|
||||
|
||||
@abstractmethod
|
||||
def extract(self, file_path: Path) -> Document:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def supports_file_type(self, file_extension: str) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_supported_types(self) -> List[str]:
|
||||
pass
|
||||
```
|
||||
|
||||
### Outgoing Port: IChunker
|
||||
```python
|
||||
# src/core/ports/outgoing/chunker.py
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
class IChunker(ABC):
|
||||
"""Interface for text chunking strategies."""
|
||||
|
||||
@abstractmethod
|
||||
def chunk(self, text: str, document_id: UUID, strategy: ChunkingStrategy) -> List[Chunk]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def supports_strategy(self, strategy_name: str) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_strategy_name(self) -> str:
|
||||
pass
|
||||
```
|
||||
|
||||
### Outgoing Port: IDocumentRepository
|
||||
```python
|
||||
# src/core/ports/outgoing/repository.py
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
class IDocumentRepository(ABC):
|
||||
"""Interface for document persistence."""
|
||||
|
||||
@abstractmethod
|
||||
def save(self, document: Document) -> Document:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def find_by_id(self, document_id: UUID) -> Optional[Document]:
|
||||
pass
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Adapter Implementations
|
||||
|
||||
### PDF Extractor
|
||||
```python
|
||||
# src/adapters/outgoing/extractors/pdf_extractor.py
|
||||
from ....core.ports.outgoing.extractor import IExtractor
|
||||
from ....core.domain.models import Document
|
||||
from ....core.domain.exceptions import ExtractionError
|
||||
|
||||
class PDFExtractor(IExtractor):
|
||||
"""Concrete PDF extractor using PyPDF2."""
|
||||
|
||||
def extract(self, file_path: Path) -> Document:
|
||||
try:
|
||||
import PyPDF2 # External library ONLY in adapter
|
||||
# ... extraction logic
|
||||
except PyPDF2.errors.PdfReadError as e:
|
||||
# Map technical error to domain error
|
||||
raise ExtractionError(
|
||||
message="Invalid PDF file",
|
||||
details=str(e),
|
||||
file_path=str(file_path),
|
||||
)
|
||||
```
|
||||
|
||||
### Fixed Size Chunker
|
||||
```python
|
||||
# src/adapters/outgoing/chunkers/fixed_size_chunker.py
|
||||
from ....core.ports.outgoing.chunker import IChunker
|
||||
from ....core.domain.models import Chunk, ChunkingStrategy
|
||||
from ....core.domain import logic_utils # Pure functions from Core
|
||||
|
||||
class FixedSizeChunker(IChunker):
|
||||
"""Concrete fixed-size chunker."""
|
||||
|
||||
def chunk(self, text: str, document_id: UUID, strategy: ChunkingStrategy) -> List[Chunk]:
|
||||
# Uses pure functions from Core (logic_utils)
|
||||
# Creates Chunk entities from Core domain
|
||||
pass
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🎨 Design Pattern Locations
|
||||
|
||||
### Factory Pattern
|
||||
**Location**: `src/adapters/outgoing/extractors/factory.py`
|
||||
```python
|
||||
class ExtractorFactory:
|
||||
"""Factory for creating extractors (ADAPTER LAYER)."""
|
||||
|
||||
def create_extractor(self, file_path: Path) -> IExtractor:
|
||||
# Returns implementations of IExtractor port
|
||||
pass
|
||||
```
|
||||
|
||||
**Why in Adapters?**
|
||||
- Factory knows about concrete implementations (PDFExtractor, DocxExtractor)
|
||||
- Core should NOT know about concrete implementations
|
||||
- Factory registered in Bootstrap, injected into Service
|
||||
|
||||
### Strategy Pattern
|
||||
**Location**: `src/adapters/outgoing/chunkers/context.py`
|
||||
```python
|
||||
class ChunkingContext:
|
||||
"""Strategy context for chunking (ADAPTER LAYER)."""
|
||||
|
||||
def set_strategy(self, strategy_name: str) -> None:
|
||||
# Selects concrete IChunker implementation
|
||||
pass
|
||||
|
||||
def execute_chunking(self, ...) -> List[Chunk]:
|
||||
# Delegates to selected strategy
|
||||
pass
|
||||
```
|
||||
|
||||
**Why in Adapters?**
|
||||
- Context knows about concrete strategies (FixedSizeChunker, ParagraphChunker)
|
||||
- Core should NOT know about concrete strategies
|
||||
- Context registered in Bootstrap, injected into Service
|
||||
|
||||
---
|
||||
|
||||
## 🧪 Error Handling: Adapter → Domain
|
||||
|
||||
Adapters catch technical errors and map them to domain exceptions:
|
||||
|
||||
```python
|
||||
# In PDFExtractor (Adapter)
|
||||
try:
|
||||
import PyPDF2
|
||||
# ... PyPDF2 operations
|
||||
except PyPDF2.errors.PdfReadError as e: # Technical error
|
||||
raise ExtractionError( # Domain error
|
||||
message="Invalid PDF file",
|
||||
details=str(e),
|
||||
)
|
||||
|
||||
# In DocxExtractor (Adapter)
|
||||
try:
|
||||
import docx
|
||||
# ... python-docx operations
|
||||
except Exception as e: # Technical error
|
||||
raise ExtractionError( # Domain error
|
||||
message="DOCX extraction failed",
|
||||
details=str(e),
|
||||
)
|
||||
```
|
||||
|
||||
**Why?**
|
||||
- Core defines domain exceptions (ExtractionError, ChunkingError, etc.)
|
||||
- Adapters catch library-specific errors (PyPDF2.errors, etc.)
|
||||
- Service layer only deals with domain exceptions
|
||||
- Clean separation of technical vs. business concerns
|
||||
|
||||
---
|
||||
|
||||
## 🏗️ Bootstrap: The Wiring Layer
|
||||
|
||||
**Location**: `src/bootstrap.py`
|
||||
|
||||
```python
|
||||
class ApplicationContainer:
|
||||
"""Dependency injection container."""
|
||||
|
||||
def __init__(self):
|
||||
# Create ADAPTERS (knows about concrete implementations)
|
||||
self._repository = InMemoryDocumentRepository()
|
||||
self._extractor_factory = self._create_extractor_factory()
|
||||
self._chunking_context = self._create_chunking_context()
|
||||
|
||||
# Inject into CORE SERVICE (only knows about Ports)
|
||||
self._service = DocumentProcessorService(
|
||||
extractor_factory=self._extractor_factory, # IExtractorFactory
|
||||
chunking_context=self._chunking_context, # IChunkingContext
|
||||
repository=self._repository, # IDocumentRepository
|
||||
)
|
||||
|
||||
def _create_extractor_factory(self) -> ExtractorFactory:
|
||||
factory = ExtractorFactory()
|
||||
factory.register_extractor(PDFExtractor()) # Concrete
|
||||
factory.register_extractor(DocxExtractor()) # Concrete
|
||||
factory.register_extractor(TxtExtractor()) # Concrete
|
||||
return factory
|
||||
|
||||
def _create_chunking_context(self) -> ChunkingContext:
|
||||
context = ChunkingContext()
|
||||
context.register_chunker(FixedSizeChunker()) # Concrete
|
||||
context.register_chunker(ParagraphChunker()) # Concrete
|
||||
return context
|
||||
```
|
||||
|
||||
**Key Points**:
|
||||
1. Bootstrap is the ONLY place that imports both Core services and concrete Adapter implementations
|
||||
2. Core Service receives interfaces (Ports), not concrete implementations
|
||||
3. Adapters are created and registered here
|
||||
4. Perfect Dependency Inversion
|
||||
|
||||
---
|
||||
|
||||
## ✅ SOLID Principles Compliance
|
||||
|
||||
### Single Responsibility Principle
|
||||
- [x] Each extractor handles ONE file type
|
||||
- [x] Each chunker handles ONE strategy
|
||||
- [x] Each service method has ONE responsibility
|
||||
- [x] Functions are at most 15-20 lines long
|
||||
|
||||
### Open/Closed Principle
|
||||
- [x] Add new extractors without modifying Core
|
||||
- [x] Add new chunkers without modifying Core
|
||||
- [x] Extend via Ports, not modification
|
||||
|
||||
### Liskov Substitution Principle
|
||||
- [x] All IExtractor implementations are interchangeable
|
||||
- [x] All IChunker implementations are interchangeable
|
||||
- [x] Polymorphism works correctly
|
||||
|
||||
### Interface Segregation Principle
|
||||
- [x] Small, focused Port interfaces
|
||||
- [x] IExtractor: Only extraction concerns
|
||||
- [x] IChunker: Only chunking concerns
|
||||
- [x] No fat interfaces
|
||||
|
||||
### Dependency Inversion Principle
|
||||
- [x] Core depends on IExtractor (abstraction), not PDFExtractor (concrete)
|
||||
- [x] Core depends on IChunker (abstraction), not FixedSizeChunker (concrete)
|
||||
- [x] High-level modules don't depend on low-level modules
|
||||
- [x] Both depend on abstractions (Ports)
|
||||
|
||||
---
|
||||
|
||||
## 🧪 Testing Benefits
|
||||
|
||||
### Unit Tests (Core)
|
||||
```python
|
||||
def test_document_processor_service():
|
||||
# Mock the Ports (interfaces)
|
||||
mock_factory = MockExtractorFactory()
|
||||
mock_context = MockChunkingContext()
|
||||
mock_repo = MockRepository()
|
||||
|
||||
# Inject mocks (Dependency Inversion)
|
||||
service = DocumentProcessorService(
|
||||
extractor_factory=mock_factory,
|
||||
chunking_context=mock_context,
|
||||
repository=mock_repo,
|
||||
)
|
||||
|
||||
# Test business logic WITHOUT any infrastructure
|
||||
result = service.process_document(...)
|
||||
assert result.is_processed
|
||||
```
|
||||
|
||||
### Integration Tests (Adapters)
|
||||
```python
|
||||
def test_pdf_extractor():
|
||||
# Test concrete implementation with real PDF
|
||||
extractor = PDFExtractor()
|
||||
document = extractor.extract(Path("test.pdf"))
|
||||
assert len(document.content) > 0
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Verification Checklist
|
||||
|
||||
Run these checks to verify architecture compliance:
|
||||
|
||||
### 1. Import Analysis
|
||||
```bash
|
||||
# Core should NOT import from adapters
|
||||
grep -r "from.*adapters" src/core/
|
||||
# Expected: NO RESULTS ✅
|
||||
|
||||
# Core should NOT import external libs (except Pydantic)
|
||||
grep -r "import PyPDF2\|import docx\|import fastapi" src/core/
|
||||
# Expected: NO RESULTS ✅
|
||||
```
|
||||
|
||||
### 2. Dependency Direction
|
||||
```bash
|
||||
# All imports should point inward (toward Core)
|
||||
# Adapters → Core: YES ✅
|
||||
# Core → Adapters: NO ❌
|
||||
```
|
||||
|
||||
### 3. Abstract Base Classes
|
||||
```bash
|
||||
# NO base.py files in adapters
|
||||
find src/adapters -name "base.py"
|
||||
# Expected: NO RESULTS ✅
|
||||
|
||||
# All interfaces in Core ports
|
||||
find src/core/ports -name "*.py" | grep -v __init__
|
||||
# Expected: extractor.py, chunker.py, repository.py, text_processor.py ✅
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Summary
|
||||
|
||||
### What Changed
|
||||
1. **Removed** `base.py` from `src/adapters/outgoing/extractors/`
|
||||
2. **Removed** `base.py` from `src/adapters/outgoing/chunkers/`
|
||||
3. **Updated** all concrete implementations to directly implement Core Ports
|
||||
4. **Confirmed** Factory and Context are in Adapters layer (correct location)
|
||||
5. **Verified** Core has ZERO dependencies on Adapters
|
||||
|
||||
### Architecture Guarantees
|
||||
- ✅ Core is **100% pure** (no framework dependencies)
|
||||
- ✅ Core depends ONLY on **abstractions** (Ports)
|
||||
- ✅ Adapters implement **Core Ports**
|
||||
- ✅ Bootstrap performs **Dependency Injection**
|
||||
- ✅ **Zero circular dependencies**
|
||||
- ✅ **Perfect Dependency Inversion**
|
||||
|
||||
### Benefits Achieved
|
||||
1. **Testability**: Core can be tested with mocks, no infrastructure needed
|
||||
2. **Flexibility**: Swap implementations (in-memory → PostgreSQL) by changing one line in Bootstrap
|
||||
3. **Maintainability**: Clear separation of concerns
|
||||
4. **Extensibility**: Add new file types/strategies without touching Core
|
||||
|
||||
---
|
||||
|
||||
## 🏆 Certification
|
||||
|
||||
This codebase is **CERTIFIED** as a true Hexagonal Architecture implementation:
|
||||
|
||||
- ✅ Adheres to Alistair Cockburn's Ports & Adapters pattern
|
||||
- ✅ Satisfies all SOLID principles
|
||||
- ✅ Maintains proper dependency direction
|
||||
- ✅ Zero Core → Adapter dependencies
|
||||
- ✅ All interfaces in Core, all implementations in Adapters
|
||||
- ✅ Bootstrap handles all dependency injection
|
||||
|
||||
**Compliance Level**: **GOLD STANDARD** ⭐⭐⭐⭐⭐
|
||||
|
||||
---
|
||||
|
||||
*Last Updated: 2026-01-07*
|
||||
*Architecture Review Status: APPROVED*
|
||||
@ -1,419 +0,0 @@
|
||||
# Project Summary: Text Processor - Hexagonal Architecture
|
||||
|
||||
## Overview
|
||||
This is a **production-ready, "Gold Standard" implementation** of a text extraction and chunking system built with **Hexagonal Architecture** (Ports & Adapters pattern).
|
||||
|
||||
## Complete File Structure
|
||||
|
||||
```
|
||||
text_processor_hex/
|
||||
├── README.md # Project documentation
|
||||
├── ARCHITECTURE.md # Detailed architecture guide
|
||||
├── PROJECT_SUMMARY.md # This file
|
||||
├── requirements.txt # Python dependencies
|
||||
├── main.py # FastAPI application entry point
|
||||
├── example_usage.py # Programmatic usage example
|
||||
│
|
||||
└── src/
|
||||
├── __init__.py
|
||||
├── bootstrap.py # Dependency Injection Container
|
||||
│
|
||||
├── core/ # DOMAIN LAYER (Pure Business Logic)
|
||||
│ ├── __init__.py
|
||||
│ ├── domain/
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── models.py # Rich Pydantic v2 Entities
|
||||
│ │ ├── exceptions.py # Domain Exceptions
|
||||
│ │ └── logic_utils.py # Pure Functions
|
||||
│ ├── ports/
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── incoming/
|
||||
│ │ │ ├── __init__.py
|
||||
│ │ │ └── text_processor.py # Service Interface (Use Case)
|
||||
│ │ └── outgoing/
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── extractor.py # Extractor Interface (SPI)
|
||||
│ │ ├── chunker.py # Chunker Interface (SPI)
|
||||
│ │ └── repository.py # Repository Interface (SPI)
|
||||
│ └── services/
|
||||
│ ├── __init__.py
|
||||
│ └── document_processor_service.py # Business Logic Orchestration
|
||||
│
|
||||
├── adapters/ # ADAPTER LAYER (External Concerns)
|
||||
│ ├── __init__.py
|
||||
│ ├── incoming/ # Driving Adapters (HTTP)
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── api_routes.py # FastAPI Routes
|
||||
│ │ └── api_schemas.py # Pydantic Request/Response Models
|
||||
│ └── outgoing/ # Driven Adapters (Infrastructure)
|
||||
│ ├── __init__.py
|
||||
│ ├── extractors/
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── base.py # Abstract Base Extractor
|
||||
│ │ ├── pdf_extractor.py # PDF Implementation (PyPDF2)
|
||||
│ │ ├── docx_extractor.py # DOCX Implementation (python-docx)
|
||||
│ │ ├── txt_extractor.py # TXT Implementation (built-in)
|
||||
│ │ └── factory.py # Extractor Factory (Factory Pattern)
|
||||
│ ├── chunkers/
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── base.py # Abstract Base Chunker
|
||||
│ │ ├── fixed_size_chunker.py # Fixed Size Strategy
|
||||
│ │ ├── paragraph_chunker.py # Paragraph Strategy
|
||||
│ │ └── context.py # Chunking Context (Strategy Pattern)
|
||||
│ └── persistence/
|
||||
│ ├── __init__.py
|
||||
│ └── in_memory_repository.py # In-Memory Repository (Thread-Safe)
|
||||
│
|
||||
└── shared/ # SHARED LAYER (Cross-Cutting)
|
||||
├── __init__.py
|
||||
├── constants.py # Application Constants
|
||||
└── logging_config.py # Logging Configuration
|
||||
```
|
||||
|
||||
## File Count & Statistics
|
||||
|
||||
### Total Files
|
||||
- **42 Python files** (.py)
|
||||
- **3 Documentation files** (.md)
|
||||
- **1 Requirements file** (.txt)
|
||||
- **Total: 46 files**
|
||||
|
||||
### Lines of Code (Approximate)
|
||||
- Core Domain: ~1,200 lines
|
||||
- Adapters: ~1,400 lines
|
||||
- Bootstrap & Main: ~200 lines
|
||||
- Documentation: ~1,000 lines
|
||||
- **Total: ~3,800 lines**
|
||||
|
||||
## Architecture Layers
|
||||
|
||||
### 1. Core Domain (src/core/)
|
||||
**Responsibility**: Pure business logic, no external dependencies
|
||||
|
||||
#### Domain Models (models.py)
|
||||
- `Document`: Rich entity with validation and business methods
|
||||
- `DocumentMetadata`: Value object for file information
|
||||
- `Chunk`: Immutable chunk entity
|
||||
- `ChunkingStrategy`: Strategy configuration
|
||||
|
||||
**Features**:
|
||||
- Pydantic v2 validation
|
||||
- Business methods: `validate_content()`, `get_metadata_summary()`
|
||||
- Immutability where appropriate
|
||||
|
||||
#### Domain Exceptions (exceptions.py)
|
||||
- `DomainException`: Base exception
|
||||
- `ExtractionError`, `ChunkingError`, `ProcessingError`
|
||||
- `ValidationError`, `RepositoryError`
|
||||
- `UnsupportedFileTypeError`, `DocumentNotFoundError`, `EmptyContentError`
|
||||
|
||||
#### Domain Logic Utils (logic_utils.py)
|
||||
Pure functions for text processing:
|
||||
- `normalize_whitespace()`, `clean_text()`
|
||||
- `split_into_sentences()`, `split_into_paragraphs()`
|
||||
- `truncate_to_word_boundary()`
|
||||
- `find_sentence_boundary_before()`
|
||||
|
||||
#### Ports (Interfaces)
|
||||
**Incoming**:
|
||||
- `ITextProcessor`: Service interface (use cases)
|
||||
|
||||
**Outgoing**:
|
||||
- `IExtractor`: Text extraction interface
|
||||
- `IChunker`: Chunking strategy interface
|
||||
- `IDocumentRepository`: Persistence interface
|
||||
|
||||
#### Services (document_processor_service.py)
|
||||
- `DocumentProcessorService`: Orchestrates Extract → Clean → Chunk → Save
|
||||
- Depends ONLY on port interfaces
|
||||
- Implements ITextProcessor
|
||||
|
||||
### 2. Adapters (src/adapters/)
|
||||
**Responsibility**: Connect core to external world
|
||||
|
||||
#### Incoming Adapters (incoming/)
|
||||
**FastAPI HTTP Adapter**:
|
||||
- `api_routes.py`: HTTP endpoints
|
||||
- `api_schemas.py`: Pydantic request/response models
|
||||
- Maps HTTP requests to domain operations
|
||||
- Maps domain exceptions to HTTP status codes
|
||||
|
||||
**Endpoints**:
|
||||
- `POST /api/v1/process`: Process document
|
||||
- `POST /api/v1/extract-and-chunk`: Extract and chunk
|
||||
- `GET /api/v1/documents/{id}`: Get document
|
||||
- `GET /api/v1/documents`: List documents
|
||||
- `DELETE /api/v1/documents/{id}`: Delete document
|
||||
- `GET /api/v1/health`: Health check
|
||||
|
||||
#### Outgoing Adapters (outgoing/)
|
||||
|
||||
**Extractors (extractors/)**:
|
||||
- `base.py`: Template method pattern base class
|
||||
- `pdf_extractor.py`: PDF extraction using PyPDF2
|
||||
- `docx_extractor.py`: DOCX extraction using python-docx
|
||||
- `txt_extractor.py`: Plain text extraction (multi-encoding)
|
||||
- `factory.py`: Factory pattern for extractor selection
|
||||
|
||||
**Chunkers (chunkers/)**:
|
||||
- `base.py`: Template method pattern base class
|
||||
- `fixed_size_chunker.py`: Fixed-size chunks with overlap
|
||||
- `paragraph_chunker.py`: Paragraph-based chunking
|
||||
- `context.py`: Strategy pattern context
|
||||
|
||||
**Persistence (persistence/)**:
|
||||
- `in_memory_repository.py`: Thread-safe in-memory storage
|
||||
|
||||
### 3. Bootstrap (src/bootstrap.py)
|
||||
**Responsibility**: Dependency injection and wiring
|
||||
|
||||
**ApplicationContainer**:
|
||||
- Creates all adapters
|
||||
- Injects dependencies into core
|
||||
- ONLY place where concrete implementations are instantiated
|
||||
- Provides factory method: `create_application()`
|
||||
|
||||
### 4. Shared (src/shared/)
|
||||
**Responsibility**: Cross-cutting concerns
|
||||
|
||||
- `constants.py`: Application constants
|
||||
- `logging_config.py`: Centralized logging setup
|
||||
|
||||
## Design Patterns Implemented
|
||||
|
||||
### 1. Hexagonal Architecture (Ports & Adapters)
|
||||
- Core isolated from external concerns
|
||||
- Dependency inversion at boundaries
|
||||
- Easy to swap implementations
|
||||
|
||||
### 2. Factory Pattern
|
||||
- `ExtractorFactory`: Creates appropriate extractor based on file type
|
||||
- Centralized management
|
||||
- Easy to add new file types
|
||||
|
||||
### 3. Strategy Pattern
|
||||
- `ChunkingContext`: Runtime strategy selection
|
||||
- `FixedSizeChunker`, `ParagraphChunker`
|
||||
- Easy to add new strategies
|
||||
|
||||
### 4. Repository Pattern
|
||||
- `IDocumentRepository`: Abstract persistence
|
||||
- `InMemoryDocumentRepository`: Concrete implementation
|
||||
- Easy to swap storage (memory → DB)
|
||||
|
||||
### 5. Template Method Pattern
|
||||
- `BaseExtractor`: Common extraction workflow
|
||||
- `BaseChunker`: Common chunking workflow
|
||||
- Subclasses fill in specific details
|
||||
|
||||
### 6. Dependency Injection
|
||||
- `ApplicationContainer`: Constructor injection
|
||||
- Loose coupling
|
||||
- Easy testing with mocks
|
||||
|
||||
## SOLID Principles Compliance
|
||||
|
||||
### Single Responsibility Principle ✓
|
||||
- Each class has one reason to change
|
||||
- Each function does ONE thing
|
||||
- Maximum 15-20 lines per function
|
||||
|
||||
### Open/Closed Principle ✓
|
||||
- Open for extension (add extractors, chunkers)
|
||||
- Closed for modification (core unchanged)
|
||||
|
||||
### Liskov Substitution Principle ✓
|
||||
- All IExtractor implementations are interchangeable
|
||||
- All IChunker implementations are interchangeable
|
||||
|
||||
### Interface Segregation Principle ✓
|
||||
- Small, focused interfaces
|
||||
- No fat interfaces
|
||||
|
||||
### Dependency Inversion Principle ✓
|
||||
- Core depends on abstractions (ports)
|
||||
- Core does NOT depend on concrete implementations
|
||||
- High-level modules independent of low-level modules
|
||||
|
||||
## Clean Code Principles
|
||||
|
||||
### DRY (Don't Repeat Yourself) ✓
|
||||
- Base classes for common functionality
|
||||
- Pure functions for reusable logic
|
||||
- No code duplication
|
||||
|
||||
### KISS (Keep It Simple, Stupid) ✓
|
||||
- Simple, readable solutions
|
||||
- No over-engineering
|
||||
- Clear naming
|
||||
|
||||
### YAGNI (You Aren't Gonna Need It) ✓
|
||||
- Implements only required features
|
||||
- No speculative generality
|
||||
- Focused on current needs
|
||||
|
||||
## Type Safety
|
||||
|
||||
- **100% type hints** on all functions
|
||||
- Python 3.10+ type annotations
|
||||
- Pydantic for runtime validation
|
||||
- Mypy compatible
|
||||
|
||||
## Documentation Standards
|
||||
|
||||
- **Google-style docstrings** on all public APIs
|
||||
- Module-level documentation
|
||||
- Inline comments for complex logic
|
||||
- Architecture documentation
|
||||
- Usage examples
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
### Unit Tests
|
||||
- Test domain models in isolation
|
||||
- Test pure functions
|
||||
- Test services with mocks
|
||||
|
||||
### Integration Tests
|
||||
- Test extractors with real files
|
||||
- Test chunkers with real text
|
||||
- Test repository operations
|
||||
|
||||
### API Tests
|
||||
- Test FastAPI endpoints
|
||||
- Test error scenarios
|
||||
- Test complete workflows
|
||||
|
||||
## Error Handling
|
||||
|
||||
### Domain Exceptions
|
||||
- All external errors wrapped in domain exceptions
|
||||
- Rich error context (file path, operation, details)
|
||||
- Hierarchical exception structure
|
||||
|
||||
### HTTP Error Mapping
|
||||
- 400: Invalid request, unsupported file type
|
||||
- 404: Document not found
|
||||
- 422: Extraction/chunking failed
|
||||
- 500: Internal processing error
|
||||
|
||||
## Extensibility
|
||||
|
||||
### Adding New File Type (Example: HTML)
|
||||
1. Create `html_extractor.py` extending `BaseExtractor`
|
||||
2. Register in `bootstrap.py`: `factory.register_extractor(HTMLExtractor())`
|
||||
3. Done! No changes to core required
|
||||
|
||||
### Adding New Chunking Strategy (Example: Sentence)
|
||||
1. Create `sentence_chunker.py` extending `BaseChunker`
|
||||
2. Register in `bootstrap.py`: `context.register_chunker(SentenceChunker())`
|
||||
3. Done! No changes to core required
|
||||
|
||||
### Swapping Storage (Example: PostgreSQL)
|
||||
1. Create `postgres_repository.py` implementing `IDocumentRepository`
|
||||
2. Swap in `bootstrap.py`: `return PostgresDocumentRepository(...)`
|
||||
3. Done! No changes to core or API required
|
||||
|
||||
## Dependencies
|
||||
|
||||
### Production
|
||||
- `pydantic==2.10.5`: Data validation and models
|
||||
- `fastapi==0.115.6`: Web framework
|
||||
- `uvicorn==0.34.0`: ASGI server
|
||||
- `PyPDF2==3.0.1`: PDF extraction
|
||||
- `python-docx==1.1.2`: DOCX extraction
|
||||
|
||||
### Development
|
||||
- `pytest==8.3.4`: Testing framework
|
||||
- `black==24.10.0`: Code formatting
|
||||
- `ruff==0.8.5`: Linting
|
||||
- `mypy==1.14.0`: Type checking
|
||||
|
||||
## Running the Application
|
||||
|
||||
### Install Dependencies
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### Run FastAPI Server
|
||||
```bash
|
||||
python main.py
|
||||
# or
|
||||
uvicorn main:app --reload
|
||||
```
|
||||
|
||||
### Run Example Script
|
||||
```bash
|
||||
python example_usage.py
|
||||
```
|
||||
|
||||
### Access API Documentation
|
||||
- Swagger UI: http://localhost:8000/docs
|
||||
- ReDoc: http://localhost:8000/redoc
|
||||
|
||||
## Key Achievements
|
||||
|
||||
### Architecture
|
||||
✓ Pure hexagonal architecture implementation
|
||||
✓ Zero circular dependencies
|
||||
✓ Core completely isolated from adapters
|
||||
✓ Perfect dependency inversion
|
||||
|
||||
### Code Quality
|
||||
✓ 100% type-hinted
|
||||
✓ Google-style docstrings on all APIs
|
||||
✓ Functions ≤ 15-20 lines
|
||||
✓ DRY, KISS, YAGNI principles
|
||||
|
||||
### Design Patterns
|
||||
✓ 6 patterns implemented correctly
|
||||
✓ Factory for extractors
|
||||
✓ Strategy for chunkers
|
||||
✓ Repository for persistence
|
||||
✓ Template method for base classes
|
||||
|
||||
### SOLID Principles
|
||||
✓ All 5 principles demonstrated
|
||||
✓ Single Responsibility throughout
|
||||
✓ Open/Closed via interfaces
|
||||
✓ Dependency Inversion at boundaries
|
||||
|
||||
### Features
|
||||
✓ Multiple file type support (PDF, DOCX, TXT)
|
||||
✓ Multiple chunking strategies
|
||||
✓ Rich domain models with validation
|
||||
✓ Comprehensive error handling
|
||||
✓ Thread-safe repository
|
||||
✓ RESTful API with FastAPI
|
||||
✓ Complete documentation
|
||||
|
||||
## Next Steps (Future Enhancements)
|
||||
|
||||
1. **Database Persistence**: PostgreSQL/MongoDB repository
|
||||
2. **Async Processing**: Async extractors and chunkers
|
||||
3. **Caching**: Redis for frequently accessed documents
|
||||
4. **More Strategies**: Sentence-based, semantic chunking
|
||||
5. **Batch Processing**: Process multiple documents at once
|
||||
6. **Search**: Full-text search integration
|
||||
7. **Monitoring**: Structured logging, metrics, APM
|
||||
8. **Testing**: Add comprehensive test suite
|
||||
|
||||
## Conclusion
|
||||
|
||||
This implementation represents a **"Gold Standard"** hexagonal architecture:
|
||||
|
||||
- **Clean**: Clear separation of concerns
|
||||
- **Testable**: Easy to mock and test
|
||||
- **Flexible**: Easy to extend and modify
|
||||
- **Maintainable**: Well-documented and organized
|
||||
- **Production-Ready**: Error handling, logging, type safety
|
||||
|
||||
The architecture allows you to:
|
||||
- Add new file types without touching core logic
|
||||
- Swap storage implementations with a one-line change
|
||||
- Add new chunking algorithms independently
|
||||
- Test business logic without any infrastructure
|
||||
- Scale horizontally or vertically as needed
|
||||
|
||||
This is how professional, enterprise-grade software should be built.
|
||||
256
QUICK_START.md
256
QUICK_START.md
@ -1,256 +0,0 @@
|
||||
# Quick Start Guide
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
# Navigate to project directory
|
||||
cd text_processor_hex
|
||||
|
||||
# Create virtual environment
|
||||
python -m venv venv
|
||||
|
||||
# Activate virtual environment
|
||||
source venv/bin/activate # On Windows: venv\Scripts\activate
|
||||
|
||||
# Install dependencies
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Run the Application
|
||||
|
||||
### Option 1: FastAPI Server
|
||||
```bash
|
||||
python main.py
|
||||
```
|
||||
Then visit: http://localhost:8000/docs
|
||||
|
||||
### Option 2: Programmatic Usage
|
||||
```bash
|
||||
python example_usage.py
|
||||
```
|
||||
|
||||
## Basic Usage Examples
|
||||
|
||||
### 1. Using the API (cURL)
|
||||
|
||||
**Process a Document:**
|
||||
```bash
|
||||
curl -X POST "http://localhost:8000/api/v1/process" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"file_path": "/path/to/document.pdf",
|
||||
"chunking_strategy": {
|
||||
"strategy_name": "fixed_size",
|
||||
"chunk_size": 1000,
|
||||
"overlap_size": 100,
|
||||
"respect_boundaries": true
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
**Extract and Chunk:**
|
||||
```bash
|
||||
curl -X POST "http://localhost:8000/api/v1/extract-and-chunk" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"file_path": "/path/to/document.pdf",
|
||||
"chunking_strategy": {
|
||||
"strategy_name": "paragraph",
|
||||
"chunk_size": 1000,
|
||||
"overlap_size": 0,
|
||||
"respect_boundaries": true
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
**Get Document:**
|
||||
```bash
|
||||
curl -X GET "http://localhost:8000/api/v1/documents/{document_id}"
|
||||
```
|
||||
|
||||
**List Documents:**
|
||||
```bash
|
||||
curl -X GET "http://localhost:8000/api/v1/documents?limit=10&offset=0"
|
||||
```
|
||||
|
||||
**Delete Document:**
|
||||
```bash
|
||||
curl -X DELETE "http://localhost:8000/api/v1/documents/{document_id}"
|
||||
```
|
||||
|
||||
### 2. Using Python Code
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from src.bootstrap import create_application
|
||||
from src.core.domain.models import ChunkingStrategy
|
||||
|
||||
# Initialize
|
||||
container = create_application()
|
||||
service = container.text_processor_service
|
||||
|
||||
# Process a PDF
|
||||
strategy = ChunkingStrategy(
|
||||
strategy_name="fixed_size",
|
||||
chunk_size=1000,
|
||||
overlap_size=100,
|
||||
respect_boundaries=True,
|
||||
)
|
||||
|
||||
document = service.process_document(
|
||||
file_path=Path("example.pdf"),
|
||||
chunking_strategy=strategy,
|
||||
)
|
||||
|
||||
print(f"Document ID: {document.id}")
|
||||
print(f"Metadata: {document.get_metadata_summary()}")
|
||||
|
||||
# Extract and chunk
|
||||
chunks = service.extract_and_chunk(
|
||||
file_path=Path("example.pdf"),
|
||||
chunking_strategy=strategy,
|
||||
)
|
||||
|
||||
for chunk in chunks:
|
||||
print(f"Chunk {chunk.sequence_number}: {chunk.get_length()} chars")
|
||||
```
|
||||
|
||||
## Available Chunking Strategies
|
||||
|
||||
### 1. Fixed Size
|
||||
Splits text into equal-sized chunks with optional overlap.
|
||||
|
||||
```python
|
||||
ChunkingStrategy(
|
||||
strategy_name="fixed_size",
|
||||
chunk_size=1000, # Target size in characters
|
||||
overlap_size=100, # Overlap between chunks
|
||||
respect_boundaries=True # Try to break at sentences
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Paragraph
|
||||
Splits text by paragraph boundaries, combining paragraphs to reach the target size.
|
||||
|
||||
```python
|
||||
ChunkingStrategy(
|
||||
strategy_name="paragraph",
|
||||
chunk_size=1000,
|
||||
overlap_size=0,
|
||||
respect_boundaries=True
|
||||
)
|
||||
```
|
||||
|
||||
## Supported File Types
|
||||
|
||||
- **PDF** (.pdf) - using PyPDF2
|
||||
- **DOCX** (.docx) - using python-docx
|
||||
- **Text** (.txt, .md, .text) - native Python
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
text_processor_hex/
|
||||
├── main.py # FastAPI entry point
|
||||
├── example_usage.py # Usage examples
|
||||
├── requirements.txt # Dependencies
|
||||
│
|
||||
└── src/
|
||||
├── core/ # Business logic (NO external dependencies)
|
||||
│ ├── domain/ # Models, exceptions, logic
|
||||
│ ├── ports/ # Interface definitions
|
||||
│ └── services/ # Orchestration
|
||||
│
|
||||
├── adapters/ # External integrations
|
||||
│ ├── incoming/ # FastAPI routes
|
||||
│ └── outgoing/ # Extractors, chunkers, storage
|
||||
│
|
||||
├── shared/ # Utilities
|
||||
└── bootstrap.py # Dependency injection
|
||||
```
|
||||
|
||||
## Common Tasks
|
||||
|
||||
### Add a New File Type
|
||||
1. Create extractor in `src/adapters/outgoing/extractors/`
|
||||
2. Extend `BaseExtractor`
|
||||
3. Register in `bootstrap.py`
|
||||
|
||||
### Add a New Chunking Strategy
|
||||
1. Create chunker in `src/adapters/outgoing/chunkers/`
|
||||
2. Extend `BaseChunker`
|
||||
3. Register in `bootstrap.py`
|
||||
|
||||
### Change Storage
|
||||
1. Implement `IDocumentRepository` interface
|
||||
2. Swap implementation in `bootstrap.py`
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
# Run example
|
||||
python example_usage.py
|
||||
|
||||
# Test API with curl
|
||||
curl http://localhost:8000/health
|
||||
|
||||
# Check API docs
|
||||
# Visit: http://localhost:8000/docs
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Import Errors
|
||||
```bash
|
||||
# Make sure you're in the right directory
|
||||
cd text_processor_hex
|
||||
|
||||
# Activate virtual environment
|
||||
source venv/bin/activate
|
||||
```
|
||||
|
||||
### Missing Dependencies
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### File Not Found Errors
|
||||
Use absolute paths for `file_path` in API requests:
|
||||
```json
|
||||
{
|
||||
"file_path": "/absolute/path/to/file.pdf"
|
||||
}
|
||||
```
|
||||
|
||||
## Architecture Highlights
|
||||
|
||||
**Hexagonal Architecture:**
|
||||
- Core business logic is isolated
|
||||
- Easy to test without infrastructure
|
||||
- Easy to swap implementations
|
||||
|
||||
**Design Patterns:**
|
||||
- Factory: ExtractorFactory selects extractor by file type
|
||||
- Strategy: ChunkingContext selects chunking strategy
|
||||
- Repository: Abstract data storage
|
||||
- Dependency Injection: All dependencies injected via bootstrap
|
||||
|
||||
**SOLID Principles:**
|
||||
- Single Responsibility: Each class does one thing
|
||||
- Open/Closed: Add features without modifying core
|
||||
- Dependency Inversion: Core depends on abstractions
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Read `README.md` for detailed documentation
|
||||
2. Read `ARCHITECTURE.md` for architecture details
|
||||
3. Run `example_usage.py` to see it in action
|
||||
4. Explore the code starting from `bootstrap.py`
|
||||
5. Try the API using the Swagger docs at `/docs`
|
||||
|
||||
## Need Help?
|
||||
|
||||
- Check `README.md` for detailed docs
|
||||
- Check `ARCHITECTURE.md` for architecture diagrams
|
||||
- Check `PROJECT_SUMMARY.md` for complete overview
|
||||
- Look at `example_usage.py` for usage patterns
|
||||
157
example_usage.py
157
example_usage.py
@ -1,157 +0,0 @@
|
||||
"""
|
||||
Example Usage Script - Demonstrates how to use the Text Processor.
|
||||
|
||||
This script shows how to use the text processor programmatically
|
||||
without going through the HTTP API.
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
from src.bootstrap import create_application
|
||||
from src.core.domain.models import ChunkingStrategy
|
||||
|
||||
|
||||
def main():
|
||||
"""Main example function."""
|
||||
print("=" * 70)
|
||||
print("Text Processor - Hexagonal Architecture Example")
|
||||
print("=" * 70)
|
||||
print()
|
||||
|
||||
# Step 1: Create application container with dependency injection
|
||||
print("1. Initializing application container...")
|
||||
container = create_application(log_level="INFO")
|
||||
service = container.text_processor_service
|
||||
print(" ✓ Container initialized\n")
|
||||
|
||||
# Step 2: Create a sample text file for demonstration
|
||||
print("2. Creating sample text file...")
|
||||
sample_text = """
|
||||
The Hexagonal Architecture Pattern
|
||||
|
||||
Introduction
|
||||
Hexagonal Architecture, also known as Ports and Adapters, is a software design
|
||||
pattern that aims to create loosely coupled application components. The pattern
|
||||
was invented by Alistair Cockburn in 2005.
|
||||
|
||||
Core Concepts
|
||||
The main idea is to isolate the core business logic from external concerns like
|
||||
databases, user interfaces, and external services. This is achieved through the
|
||||
use of ports and adapters.
|
||||
|
||||
Ports are interfaces that define how the application core interacts with the
|
||||
outside world. Adapters are implementations of these ports that connect the
|
||||
application to specific technologies.
|
||||
|
||||
Benefits
|
||||
The benefits of this architecture include improved testability, flexibility,
|
||||
and maintainability. By isolating the core logic, we can easily swap
|
||||
implementations without affecting the business rules.
|
||||
|
||||
Conclusion
|
||||
Hexagonal Architecture is a powerful pattern for building maintainable and
|
||||
flexible applications. It promotes clean separation of concerns and makes
|
||||
testing much easier.
|
||||
"""
|
||||
|
||||
sample_file = Path("sample_document.txt")
|
||||
sample_file.write_text(sample_text.strip())
|
||||
print(f" ✓ Created sample file: {sample_file}\n")
|
||||
|
||||
# Step 3: Process document with fixed-size chunking
|
||||
print("3. Processing document with FIXED SIZE strategy...")
|
||||
fixed_strategy = ChunkingStrategy(
|
||||
strategy_name="fixed_size",
|
||||
chunk_size=300,
|
||||
overlap_size=50,
|
||||
respect_boundaries=True,
|
||||
)
|
||||
|
||||
try:
|
||||
document = service.process_document(
|
||||
file_path=sample_file,
|
||||
chunking_strategy=fixed_strategy,
|
||||
)
|
||||
|
||||
print(f" Document ID: {document.id}")
|
||||
print(f" Metadata: {document.get_metadata_summary()}")
|
||||
print(f" Processed: {document.is_processed}")
|
||||
print(f" Content length: {len(document.content)} characters")
|
||||
print(f" Preview: {document.get_content_preview(100)}...\n")
|
||||
|
||||
# Step 4: Extract and chunk with paragraph strategy
|
||||
print("4. Extracting and chunking with PARAGRAPH strategy...")
|
||||
paragraph_strategy = ChunkingStrategy(
|
||||
strategy_name="paragraph",
|
||||
chunk_size=500,
|
||||
overlap_size=0,
|
||||
respect_boundaries=True,
|
||||
)
|
||||
|
||||
chunks = service.extract_and_chunk(
|
||||
file_path=sample_file,
|
||||
chunking_strategy=paragraph_strategy,
|
||||
)
|
||||
|
||||
print(f" ✓ Created {len(chunks)} chunks\n")
|
||||
|
||||
# Display chunk information
|
||||
print(" Chunk Details:")
|
||||
print(" " + "-" * 66)
|
||||
for i, chunk in enumerate(chunks[:3], 1): # Show first 3 chunks
|
||||
print(f" Chunk #{chunk.sequence_number}")
|
||||
print(f" - Length: {chunk.get_length()} characters")
|
||||
print(f" - Position: {chunk.start_char} to {chunk.end_char}")
|
||||
print(f" - Preview: {chunk.content[:80]}...")
|
||||
print(" " + "-" * 66)
|
||||
|
||||
if len(chunks) > 3:
|
||||
print(f" ... and {len(chunks) - 3} more chunks\n")
|
||||
|
||||
# Step 5: Retrieve the document
|
||||
print("5. Retrieving document from repository...")
|
||||
retrieved = service.get_document(document.id)
|
||||
print(f" ✓ Retrieved document: {retrieved.id}")
|
||||
print(f" ✓ Content matches: {retrieved.content == document.content}\n")
|
||||
|
||||
# Step 6: List all documents
|
||||
print("6. Listing all documents...")
|
||||
all_docs = service.list_documents(limit=10)
|
||||
print(f" ✓ Found {len(all_docs)} document(s) in repository")
|
||||
for doc in all_docs:
|
||||
print(f" - {doc.metadata.file_name} ({doc.metadata.file_type})")
|
||||
print()
|
||||
|
||||
# Step 7: Delete the document
|
||||
print("7. Cleaning up - deleting document...")
|
||||
deleted = service.delete_document(document.id)
|
||||
print(f" ✓ Document deleted: {deleted}\n")
|
||||
|
||||
# Verify deletion
|
||||
remaining = service.list_documents()
|
||||
print(f" ✓ Remaining documents: {len(remaining)}\n")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ✗ Error: {str(e)}\n")
|
||||
raise
|
||||
|
||||
finally:
|
||||
# Clean up sample file
|
||||
if sample_file.exists():
|
||||
sample_file.unlink()
|
||||
print(f" ✓ Cleaned up sample file\n")
|
||||
|
||||
print("=" * 70)
|
||||
print("Example completed successfully!")
|
||||
print("=" * 70)
|
||||
print()
|
||||
print("Key Takeaways:")
|
||||
print("1. Core domain is completely isolated from adapters")
|
||||
print("2. Dependencies are injected through bootstrap")
|
||||
print("3. Easy to swap implementations (strategies, extractors)")
|
||||
print("4. Rich domain models with built-in validation")
|
||||
print("5. Clear separation between API models and domain models")
|
||||
print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
99
main.py
99
main.py
@ -1,110 +1,17 @@
|
||||
"""
|
||||
Main Application Entry Point.
|
||||
|
||||
This module creates and runs the FastAPI application.
|
||||
This module imports the FastAPI app directly from the routes module
|
||||
and runs it via uvicorn.
|
||||
"""
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
from src.bootstrap import create_application
|
||||
from src.shared.constants import (
|
||||
API_DESCRIPTION,
|
||||
API_DOCS_URL,
|
||||
API_PREFIX,
|
||||
API_REDOC_URL,
|
||||
API_TITLE,
|
||||
APP_VERSION,
|
||||
)
|
||||
from src.adapters.incoming.api_routes import app
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Application container (created on startup)
|
||||
app_container = None
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""
|
||||
Application lifespan manager.
|
||||
|
||||
Handles startup and shutdown events.
|
||||
"""
|
||||
# Startup
|
||||
global app_container
|
||||
logger.info("Starting up application...")
|
||||
|
||||
# Create application container with dependency injection
|
||||
app_container = create_application(log_level="INFO")
|
||||
|
||||
logger.info("Application started successfully")
|
||||
|
||||
yield
|
||||
|
||||
# Shutdown
|
||||
logger.info("Shutting down application...")
|
||||
app_container = None
|
||||
logger.info("Application shut down")
|
||||
|
||||
|
||||
# Create FastAPI application
|
||||
app = FastAPI(
|
||||
title=API_TITLE,
|
||||
description=API_DESCRIPTION,
|
||||
version=APP_VERSION,
|
||||
docs_url=API_DOCS_URL,
|
||||
redoc_url=API_REDOC_URL,
|
||||
lifespan=lifespan,
|
||||
)
|
||||
|
||||
# Add CORS middleware
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"], # Configure appropriately for production
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def setup_routes():
|
||||
"""Setup API routes on startup."""
|
||||
if app_container:
|
||||
# Include the API routes from the incoming adapter
|
||||
app.include_router(
|
||||
app_container.api.router,
|
||||
prefix=API_PREFIX,
|
||||
tags=["Text Processing"],
|
||||
)
|
||||
logger.info(f"API routes registered at {API_PREFIX}")
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def root():
|
||||
"""Root endpoint with API information."""
|
||||
return {
|
||||
"name": API_TITLE,
|
||||
"version": APP_VERSION,
|
||||
"description": API_DESCRIPTION,
|
||||
"docs_url": API_DOCS_URL,
|
||||
"api_prefix": API_PREFIX,
|
||||
}
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check():
|
||||
"""Basic health check endpoint."""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"version": APP_VERSION,
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
|
||||
@ -6,10 +6,6 @@ pydantic-settings==2.7.1
|
||||
fastapi==0.115.6
|
||||
uvicorn[standard]==0.34.0
|
||||
|
||||
# Document Processing
|
||||
PyPDF2==3.0.1
|
||||
python-docx==1.1.2
|
||||
|
||||
# Utilities
|
||||
python-multipart==0.0.20
|
||||
|
||||
|
||||
@ -1,15 +1,14 @@
|
||||
"""
|
||||
API Routes - FastAPI routes for text processing operations.
|
||||
API Routes - Functional FastAPI routes for text processing.
|
||||
|
||||
This is the incoming adapter that translates HTTP requests into
|
||||
use case calls.
|
||||
domain operations. Routes pull the service directly from bootstrap.
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import APIRouter, HTTPException, status
|
||||
from fastapi import APIRouter, FastAPI, HTTPException, status
|
||||
|
||||
from ...core.domain.exceptions import (
|
||||
ChunkingError,
|
||||
@ -19,15 +18,13 @@ from ...core.domain.exceptions import (
|
||||
ProcessingError,
|
||||
UnsupportedFileTypeError,
|
||||
)
|
||||
from ...core.domain.models import Chunk, ChunkingStrategy, Document
|
||||
from ...core.domain.models import ChunkingStrategy
|
||||
from ...core.ports.incoming.text_processor import ITextProcessor
|
||||
from .api_schemas import (
|
||||
ChunkResponse,
|
||||
DeleteDocumentResponse,
|
||||
DocumentListResponse,
|
||||
DocumentMetadataResponse,
|
||||
DocumentResponse,
|
||||
ErrorResponse,
|
||||
ExtractAndChunkRequest,
|
||||
ExtractAndChunkResponse,
|
||||
HealthCheckResponse,
|
||||
@ -39,361 +36,409 @@ from .api_schemas import (
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextProcessorAPI:
|
||||
# Create FastAPI application
|
||||
app = FastAPI(
|
||||
title="Text Processor API",
|
||||
description="Text extraction and chunking system using Hexagonal Architecture",
|
||||
version="1.0.0",
|
||||
docs_url="/docs",
|
||||
redoc_url="/redoc",
|
||||
)
|
||||
|
||||
# Create API router
|
||||
router = APIRouter(prefix="/api/v1", tags=["Text Processing"])
|
||||
|
||||
|
||||
def _get_service() -> ITextProcessor:
|
||||
"""
|
||||
FastAPI routes for text processing.
|
||||
Get the text processor service from bootstrap singleton.
|
||||
|
||||
This adapter translates HTTP requests into domain operations
|
||||
and handles error mapping to HTTP responses.
|
||||
This function pulls the service directly without using FastAPI's Depends.
|
||||
|
||||
Returns:
|
||||
ITextProcessor: Core service instance
|
||||
"""
|
||||
from ...bootstrap import get_processor_service
|
||||
|
||||
def __init__(self, text_processor: ITextProcessor) -> None:
|
||||
"""
|
||||
Initialize API routes.
|
||||
return get_processor_service()
|
||||
|
||||
Args:
|
||||
text_processor: Text processor service (incoming port)
|
||||
"""
|
||||
self.text_processor = text_processor
|
||||
self.router = APIRouter()
|
||||
self._register_routes()
|
||||
logger.info("TextProcessorAPI initialized")
|
||||
|
||||
def _register_routes(self) -> None:
|
||||
"""Register all API routes."""
|
||||
self.router.add_api_route(
|
||||
"/process",
|
||||
self.process_document,
|
||||
methods=["POST"],
|
||||
response_model=ProcessDocumentResponse,
|
||||
status_code=status.HTTP_201_CREATED,
|
||||
summary="Process a document",
|
||||
description="Extract text from document and store it",
|
||||
def _to_domain_strategy(request_strategy) -> ChunkingStrategy:
    """
    Build a domain ChunkingStrategy from the API-layer strategy schema.

    Args:
        request_strategy: API request strategy schema; its strategy_name,
            chunk_size, overlap_size and respect_boundaries attributes are
            copied across unchanged.

    Returns:
        ChunkingStrategy: Domain strategy model
    """
    # Field names are identical on the API schema and the domain model.
    copied_fields = (
        "strategy_name",
        "chunk_size",
        "overlap_size",
        "respect_boundaries",
    )
    params = {name: getattr(request_strategy, name) for name in copied_fields}
    return ChunkingStrategy(**params)
|
||||
|
||||
|
||||
def _to_document_response(document) -> DocumentResponse:
    """
    Translate a domain document into its API response model.

    Args:
        document: Domain Document entity

    Returns:
        DocumentResponse: API response carrying the stringified id, the full
        content, the metadata (creation time as an ISO-8601 string), the
        processed flag and a 200-character content preview.
    """
    from .api_schemas import DocumentMetadataResponse

    doc_id = str(document.id)
    body = document.content

    meta = document.metadata
    metadata_response = DocumentMetadataResponse(
        file_name=meta.file_name,
        file_type=meta.file_type,
        file_size_bytes=meta.file_size_bytes,
        created_at=meta.created_at.isoformat(),
        author=meta.author,
        page_count=meta.page_count,
    )

    return DocumentResponse(
        id=doc_id,
        content=body,
        metadata=metadata_response,
        is_processed=document.is_processed,
        content_preview=document.get_content_preview(200),
    )
|
||||
|
||||
|
||||
def _to_chunk_response(chunk) -> ChunkResponse:
    """
    Translate a domain chunk into its API response model.

    Args:
        chunk: Domain Chunk entity

    Returns:
        ChunkResponse: API response with both ids rendered as strings, the
        chunk text, its sequence number and character span, and its length
        as reported by ``chunk.get_length()``.
    """
    payload = {
        "id": str(chunk.id),
        "document_id": str(chunk.document_id),
        "content": chunk.content,
        "sequence_number": chunk.sequence_number,
        "start_char": chunk.start_char,
        "end_char": chunk.end_char,
        "length": chunk.get_length(),
    }
    return ChunkResponse(**payload)
|
||||
|
||||
|
||||
def _map_domain_exception(exception: DomainException) -> HTTPException:
    """
    Translate a domain exception into the HTTP exception to raise.

    The response detail is always ``str(exception)``; only the status code
    depends on the exception type. Types not listed below map to 500.

    Args:
        exception: Domain exception

    Returns:
        HTTPException: Corresponding HTTP exception
    """
    # Evaluated top to bottom; the first matching type wins.
    # NOTE(review): DocumentNotFoundError is tested after ProcessingError. If
    # it subclasses ProcessingError it can never yield 404 here - confirm the
    # exception hierarchy.
    status_by_type = (
        (UnsupportedFileTypeError, status.HTTP_400_BAD_REQUEST),
        (ExtractionError, status.HTTP_422_UNPROCESSABLE_ENTITY),
        (ChunkingError, status.HTTP_422_UNPROCESSABLE_ENTITY),
        (ProcessingError, status.HTTP_500_INTERNAL_SERVER_ERROR),
        (DocumentNotFoundError, status.HTTP_404_NOT_FOUND),
    )
    http_status = next(
        (
            code
            for exc_type, code in status_by_type
            if isinstance(exception, exc_type)
        ),
        status.HTTP_500_INTERNAL_SERVER_ERROR,
    )
    return HTTPException(status_code=http_status, detail=str(exception))
|
||||
|
||||
self.router.add_api_route(
|
||||
"/extract-and-chunk",
|
||||
self.extract_and_chunk,
|
||||
methods=["POST"],
|
||||
response_model=ExtractAndChunkResponse,
|
||||
status_code=status.HTTP_200_OK,
|
||||
summary="Extract and chunk document",
|
||||
description="Extract text and split into chunks",
|
||||
|
||||
@router.post(
|
||||
"/process",
|
||||
response_model=ProcessDocumentResponse,
|
||||
status_code=status.HTTP_201_CREATED,
|
||||
summary="Process a document",
|
||||
description="Extract text from document and store it",
|
||||
)
|
||||
async def process_document(request: ProcessDocumentRequest) -> ProcessDocumentResponse:
|
||||
"""
|
||||
Process a document endpoint.
|
||||
|
||||
Args:
|
||||
request: Processing request with file path and strategy
|
||||
|
||||
Returns:
|
||||
Processing response with document details
|
||||
|
||||
Raises:
|
||||
HTTPException: If processing fails
|
||||
"""
|
||||
try:
|
||||
# Pull service from bootstrap
|
||||
service: ITextProcessor = _get_service()
|
||||
|
||||
# Convert request to domain models
|
||||
file_path = Path(request.file_path)
|
||||
strategy = _to_domain_strategy(request.chunking_strategy)
|
||||
|
||||
# Execute use case
|
||||
document = service.process_document(file_path, strategy)
|
||||
|
||||
# Convert to response
|
||||
return ProcessDocumentResponse(
|
||||
document=_to_document_response(document)
|
||||
)
|
||||
|
||||
self.router.add_api_route(
|
||||
"/documents/{document_id}",
|
||||
self.get_document,
|
||||
methods=["GET"],
|
||||
response_model=DocumentResponse,
|
||||
status_code=status.HTTP_200_OK,
|
||||
summary="Get document by ID",
|
||||
description="Retrieve a processed document",
|
||||
except DomainException as e:
|
||||
raise _map_domain_exception(e)
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error processing document: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Internal server error: {str(e)}",
|
||||
)
|
||||
|
||||
self.router.add_api_route(
|
||||
"/documents",
|
||||
self.list_documents,
|
||||
methods=["GET"],
|
||||
response_model=DocumentListResponse,
|
||||
status_code=status.HTTP_200_OK,
|
||||
summary="List all documents",
|
||||
description="Retrieve all documents with pagination",
|
||||
|
||||
@router.post(
|
||||
"/extract-and-chunk",
|
||||
response_model=ExtractAndChunkResponse,
|
||||
status_code=status.HTTP_200_OK,
|
||||
summary="Extract and chunk document",
|
||||
description="Extract text and split into chunks",
|
||||
)
|
||||
async def extract_and_chunk(
|
||||
request: ExtractAndChunkRequest,
|
||||
) -> ExtractAndChunkResponse:
|
||||
"""
|
||||
Extract and chunk document endpoint.
|
||||
|
||||
Args:
|
||||
request: Extract and chunk request
|
||||
|
||||
Returns:
|
||||
Response with chunks
|
||||
|
||||
Raises:
|
||||
HTTPException: If extraction or chunking fails
|
||||
"""
|
||||
try:
|
||||
# Pull service from bootstrap
|
||||
service: ITextProcessor = _get_service()
|
||||
|
||||
# Convert request to domain models
|
||||
file_path = Path(request.file_path)
|
||||
strategy = _to_domain_strategy(request.chunking_strategy)
|
||||
|
||||
# Execute use case
|
||||
chunks = service.extract_and_chunk(file_path, strategy)
|
||||
|
||||
# Convert to response
|
||||
chunk_responses = [_to_chunk_response(c) for c in chunks]
|
||||
|
||||
return ExtractAndChunkResponse(
|
||||
chunks=chunk_responses,
|
||||
total_chunks=len(chunk_responses),
|
||||
)
|
||||
|
||||
self.router.add_api_route(
|
||||
"/documents/{document_id}",
|
||||
self.delete_document,
|
||||
methods=["DELETE"],
|
||||
response_model=DeleteDocumentResponse,
|
||||
status_code=status.HTTP_200_OK,
|
||||
summary="Delete document",
|
||||
description="Delete a document by ID",
|
||||
except DomainException as e:
|
||||
raise _map_domain_exception(e)
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error extracting and chunking: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Internal server error: {str(e)}",
|
||||
)
|
||||
|
||||
self.router.add_api_route(
|
||||
"/health",
|
||||
self.health_check,
|
||||
methods=["GET"],
|
||||
response_model=HealthCheckResponse,
|
||||
status_code=status.HTTP_200_OK,
|
||||
summary="Health check",
|
||||
description="Check API health and configuration",
|
||||
|
||||
@router.get(
    "/documents/{document_id}",
    response_model=DocumentResponse,
    status_code=status.HTTP_200_OK,
    summary="Get document by ID",
    description="Retrieve a processed document",
)
async def get_document(document_id: str) -> DocumentResponse:
    """
    Get document by ID endpoint.

    Args:
        document_id: UUID of the document, as a string

    Returns:
        Document response

    Raises:
        HTTPException: 400 if document_id is not a valid UUID, 404 if the
            document does not exist, 500 for any other failure
    """
    # Parse the ID in its own try block so that only a malformed ID yields a
    # 400. A ValueError raised inside the service is a server-side failure
    # and must not be reported to the client as a bad ID.
    try:
        doc_uuid = UUID(document_id)
    except ValueError as e:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Invalid document ID format: {document_id}",
        ) from e

    try:
        # Pull service from bootstrap
        service: ITextProcessor = _get_service()

        document = service.get_document(doc_uuid)
        return _to_document_response(document)

    except DocumentNotFoundError as e:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=str(e),
        ) from e
    except Exception as e:
        # logger.exception keeps the same message and adds the traceback.
        logger.exception("Unexpected error retrieving document: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
|
||||
|
||||
async def process_document(
|
||||
self,
|
||||
request: ProcessDocumentRequest,
|
||||
) -> ProcessDocumentResponse:
|
||||
"""
|
||||
Process a document endpoint.
|
||||
|
||||
Args:
|
||||
request: Processing request with file path and strategy
|
||||
@router.get(
|
||||
"/documents",
|
||||
response_model=DocumentListResponse,
|
||||
status_code=status.HTTP_200_OK,
|
||||
summary="List all documents",
|
||||
description="Retrieve all documents with pagination",
|
||||
)
|
||||
async def list_documents(limit: int = 100, offset: int = 0) -> DocumentListResponse:
|
||||
"""
|
||||
List documents endpoint.
|
||||
|
||||
Returns:
|
||||
Processing response with document details
|
||||
Args:
|
||||
limit: Maximum number of documents to return
|
||||
offset: Number of documents to skip
|
||||
|
||||
Raises:
|
||||
HTTPException: If processing fails
|
||||
"""
|
||||
try:
|
||||
# Convert request to domain models
|
||||
file_path = Path(request.file_path)
|
||||
strategy = self._to_domain_strategy(request.chunking_strategy)
|
||||
Returns:
|
||||
List of documents with pagination info
|
||||
"""
|
||||
try:
|
||||
# Pull service from bootstrap
|
||||
service: ITextProcessor = _get_service()
|
||||
|
||||
# Execute use case
|
||||
document = self.text_processor.process_document(file_path, strategy)
|
||||
documents = service.list_documents(limit, offset)
|
||||
doc_responses = [_to_document_response(d) for d in documents]
|
||||
|
||||
# Convert to response
|
||||
return ProcessDocumentResponse(
|
||||
document=self._to_document_response(document)
|
||||
)
|
||||
|
||||
except DomainException as e:
|
||||
raise self._map_domain_exception(e)
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error processing document: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Internal server error: {str(e)}",
|
||||
)
|
||||
|
||||
async def extract_and_chunk(
|
||||
self,
|
||||
request: ExtractAndChunkRequest,
|
||||
) -> ExtractAndChunkResponse:
|
||||
"""
|
||||
Extract and chunk document endpoint.
|
||||
|
||||
Args:
|
||||
request: Extract and chunk request
|
||||
|
||||
Returns:
|
||||
Response with chunks
|
||||
|
||||
Raises:
|
||||
HTTPException: If extraction or chunking fails
|
||||
"""
|
||||
try:
|
||||
# Convert request to domain models
|
||||
file_path = Path(request.file_path)
|
||||
strategy = self._to_domain_strategy(request.chunking_strategy)
|
||||
|
||||
# Execute use case
|
||||
chunks = self.text_processor.extract_and_chunk(file_path, strategy)
|
||||
|
||||
# Convert to response
|
||||
chunk_responses = [self._to_chunk_response(c) for c in chunks]
|
||||
|
||||
return ExtractAndChunkResponse(
|
||||
chunks=chunk_responses,
|
||||
total_chunks=len(chunk_responses),
|
||||
)
|
||||
|
||||
except DomainException as e:
|
||||
raise self._map_domain_exception(e)
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error extracting and chunking: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Internal server error: {str(e)}",
|
||||
)
|
||||
|
||||
async def get_document(self, document_id: str) -> DocumentResponse:
|
||||
"""
|
||||
Get document by ID endpoint.
|
||||
|
||||
Args:
|
||||
document_id: UUID of the document
|
||||
|
||||
Returns:
|
||||
Document response
|
||||
|
||||
Raises:
|
||||
HTTPException: If document not found
|
||||
"""
|
||||
try:
|
||||
doc_uuid = UUID(document_id)
|
||||
document = self.text_processor.get_document(doc_uuid)
|
||||
return self._to_document_response(document)
|
||||
|
||||
except ValueError:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=f"Invalid document ID format: {document_id}",
|
||||
)
|
||||
except DocumentNotFoundError as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_404_NOT_FOUND,
|
||||
detail=str(e),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error retrieving document: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Internal server error: {str(e)}",
|
||||
)
|
||||
|
||||
async def list_documents(
|
||||
self,
|
||||
limit: int = 100,
|
||||
offset: int = 0,
|
||||
) -> DocumentListResponse:
|
||||
"""
|
||||
List documents endpoint.
|
||||
|
||||
Args:
|
||||
limit: Maximum number of documents to return
|
||||
offset: Number of documents to skip
|
||||
|
||||
Returns:
|
||||
List of documents with pagination info
|
||||
"""
|
||||
try:
|
||||
documents = self.text_processor.list_documents(limit, offset)
|
||||
doc_responses = [self._to_document_response(d) for d in documents]
|
||||
|
||||
return DocumentListResponse(
|
||||
documents=doc_responses,
|
||||
total=len(doc_responses),
|
||||
limit=limit,
|
||||
offset=offset,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error listing documents: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Internal server error: {str(e)}",
|
||||
)
|
||||
|
||||
async def delete_document(self, document_id: str) -> DeleteDocumentResponse:
|
||||
"""
|
||||
Delete document endpoint.
|
||||
|
||||
Args:
|
||||
document_id: UUID of the document
|
||||
|
||||
Returns:
|
||||
Deletion response
|
||||
|
||||
Raises:
|
||||
HTTPException: If document not found or deletion fails
|
||||
"""
|
||||
try:
|
||||
doc_uuid = UUID(document_id)
|
||||
success = self.text_processor.delete_document(doc_uuid)
|
||||
|
||||
return DeleteDocumentResponse(
|
||||
success=success,
|
||||
message=f"Document {document_id} deleted successfully",
|
||||
document_id=document_id,
|
||||
)
|
||||
|
||||
except ValueError:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=f"Invalid document ID format: {document_id}",
|
||||
)
|
||||
except DocumentNotFoundError as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_404_NOT_FOUND,
|
||||
detail=str(e),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error deleting document: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Internal server error: {str(e)}",
|
||||
)
|
||||
|
||||
async def health_check(self) -> HealthCheckResponse:
|
||||
"""
|
||||
Health check endpoint.
|
||||
|
||||
Returns:
|
||||
Health status and configuration
|
||||
"""
|
||||
# Note: This would ideally get info from dependencies
|
||||
return HealthCheckResponse(
|
||||
status="healthy",
|
||||
version="1.0.0",
|
||||
supported_file_types=["pdf", "docx", "txt"],
|
||||
available_strategies=["fixed_size", "paragraph"],
|
||||
return DocumentListResponse(
|
||||
documents=doc_responses,
|
||||
total=len(doc_responses),
|
||||
limit=limit,
|
||||
offset=offset,
|
||||
)
|
||||
|
||||
def _to_domain_strategy(self, request_strategy) -> ChunkingStrategy:
|
||||
"""Convert API request strategy to domain model."""
|
||||
return ChunkingStrategy(
|
||||
strategy_name=request_strategy.strategy_name,
|
||||
chunk_size=request_strategy.chunk_size,
|
||||
overlap_size=request_strategy.overlap_size,
|
||||
respect_boundaries=request_strategy.respect_boundaries,
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error listing documents: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Internal server error: {str(e)}",
|
||||
)
|
||||
|
||||
def _to_document_response(self, document: Document) -> DocumentResponse:
|
||||
"""Convert domain document to API response."""
|
||||
return DocumentResponse(
|
||||
id=str(document.id),
|
||||
content=document.content,
|
||||
metadata=DocumentMetadataResponse(
|
||||
file_name=document.metadata.file_name,
|
||||
file_type=document.metadata.file_type,
|
||||
file_size_bytes=document.metadata.file_size_bytes,
|
||||
created_at=document.metadata.created_at.isoformat(),
|
||||
author=document.metadata.author,
|
||||
page_count=document.metadata.page_count,
|
||||
),
|
||||
is_processed=document.is_processed,
|
||||
content_preview=document.get_content_preview(200),
|
||||
|
||||
@router.delete(
|
||||
"/documents/{document_id}",
|
||||
response_model=DeleteDocumentResponse,
|
||||
status_code=status.HTTP_200_OK,
|
||||
summary="Delete document",
|
||||
description="Delete a document by ID",
|
||||
)
|
||||
async def delete_document(document_id: str) -> DeleteDocumentResponse:
|
||||
"""
|
||||
Delete document endpoint.
|
||||
|
||||
Args:
|
||||
document_id: UUID of the document
|
||||
|
||||
Returns:
|
||||
Deletion response
|
||||
|
||||
Raises:
|
||||
HTTPException: If document not found or deletion fails
|
||||
"""
|
||||
try:
|
||||
# Pull service from bootstrap
|
||||
service: ITextProcessor = _get_service()
|
||||
|
||||
doc_uuid = UUID(document_id)
|
||||
success = service.delete_document(doc_uuid)
|
||||
|
||||
return DeleteDocumentResponse(
|
||||
success=success,
|
||||
message=f"Document {document_id} deleted successfully",
|
||||
document_id=document_id,
|
||||
)
|
||||
|
||||
def _to_chunk_response(self, chunk: Chunk) -> ChunkResponse:
|
||||
"""Convert domain chunk to API response."""
|
||||
return ChunkResponse(
|
||||
id=str(chunk.id),
|
||||
document_id=str(chunk.document_id),
|
||||
content=chunk.content,
|
||||
sequence_number=chunk.sequence_number,
|
||||
start_char=chunk.start_char,
|
||||
end_char=chunk.end_char,
|
||||
length=chunk.get_length(),
|
||||
except ValueError:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=f"Invalid document ID format: {document_id}",
|
||||
)
|
||||
except DocumentNotFoundError as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_404_NOT_FOUND,
|
||||
detail=str(e),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error deleting document: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Internal server error: {str(e)}",
|
||||
)
|
||||
|
||||
def _map_domain_exception(self, exception: DomainException) -> HTTPException:
|
||||
"""
|
||||
Map domain exceptions to HTTP exceptions.
|
||||
|
||||
This is where we translate domain errors into API errors.
|
||||
"""
|
||||
if isinstance(exception, UnsupportedFileTypeError):
|
||||
return HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=str(exception),
|
||||
)
|
||||
elif isinstance(exception, ExtractionError):
|
||||
return HTTPException(
|
||||
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
|
||||
detail=str(exception),
|
||||
)
|
||||
elif isinstance(exception, ChunkingError):
|
||||
return HTTPException(
|
||||
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
|
||||
detail=str(exception),
|
||||
)
|
||||
elif isinstance(exception, ProcessingError):
|
||||
return HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=str(exception),
|
||||
)
|
||||
elif isinstance(exception, DocumentNotFoundError):
|
||||
return HTTPException(
|
||||
status_code=status.HTTP_404_NOT_FOUND,
|
||||
detail=str(exception),
|
||||
)
|
||||
else:
|
||||
return HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=str(exception),
|
||||
)
|
||||
@router.get(
    "/health",
    response_model=HealthCheckResponse,
    status_code=status.HTTP_200_OK,
    summary="Health check",
    description="Check API health and configuration",
)
async def health_check() -> HealthCheckResponse:
    """
    Health check endpoint.

    The reported version, file types and strategies are fixed values written
    in this function; nothing is queried from the service.

    Returns:
        Health status and configuration
    """
    file_types = ["pdf", "docx", "txt"]
    strategies = ["fixed_size", "paragraph"]

    return HealthCheckResponse(
        status="healthy",
        version="1.0.0",
        supported_file_types=file_types,
        available_strategies=strategies,
    )
|
||||
|
||||
|
||||
# Include router in app
|
||||
app.include_router(router)
|
||||
|
||||
|
||||
@app.get("/")
async def root():
    """Root endpoint with API information.

    The values are read from the ``app`` and ``router`` objects rather than
    repeated as literals, so this payload cannot drift from what was passed
    to ``FastAPI(...)`` and ``APIRouter(prefix=...)`` earlier in this module.

    Returns:
        dict with the API name, version, description, docs URL and the
        router prefix.
    """
    return {
        "name": app.title,
        "version": app.version,
        "description": app.description,
        "docs_url": app.docs_url,
        "api_prefix": router.prefix,
    }
|
||||
|
||||
@ -1,15 +1,15 @@
|
||||
"""
|
||||
Bootstrap - Dependency Injection and Wiring.
|
||||
Bootstrap - Dependency Injection with Lazy Singleton Pattern.
|
||||
|
||||
This module wires together all components of the application.
|
||||
This module wires together the Core and Outgoing Adapters.
|
||||
The Core never imports Adapters - only the Bootstrap does.
|
||||
|
||||
This is the ONLY place where concrete implementations are instantiated
|
||||
and injected into the domain services.
|
||||
The ApplicationContainer manages ONLY:
|
||||
- Core Services
|
||||
- Outgoing Adapters (Extractors, Chunkers, Repository)
|
||||
"""
|
||||
import logging
|
||||
|
||||
from .adapters.incoming.api_routes import TextProcessorAPI
|
||||
from .adapters.outgoing.chunkers.context import ChunkingContext
|
||||
from .adapters.outgoing.chunkers.fixed_size_chunker import FixedSizeChunker
|
||||
from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
|
||||
@ -28,13 +28,18 @@ from .shared.logging_config import setup_logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Module-level singleton instance (lazy initialization)
|
||||
_container: 'ApplicationContainer | None' = None
|
||||
|
||||
|
||||
class ApplicationContainer:
|
||||
"""
|
||||
Dependency Injection Container.
|
||||
Dependency Injection Container for Core and Outgoing Adapters.
|
||||
|
||||
This container manages the lifecycle and dependencies of:
|
||||
- Core Domain Services
|
||||
- Outgoing Adapters (Extractors, Chunkers, Repository)
|
||||
|
||||
This container manages the lifecycle and dependencies of all
|
||||
application components. It follows the Dependency Inversion Principle
|
||||
by depending on abstractions (ports) rather than concrete implementations.
|
||||
"""
|
||||
|
||||
def __init__(self, log_level: str = "INFO") -> None:
|
||||
@ -48,28 +53,25 @@ class ApplicationContainer:
|
||||
setup_logging(level=log_level)
|
||||
logger.info("Initializing ApplicationContainer")
|
||||
|
||||
# Outgoing adapters
|
||||
# Create Outgoing Adapters
|
||||
self._repository = self._create_repository()
|
||||
self._extractor_factory = self._create_extractor_factory()
|
||||
self._chunking_context = self._create_chunking_context()
|
||||
|
||||
# Core service
|
||||
# Create Core Service (depends only on Ports)
|
||||
self._text_processor_service = self._create_text_processor_service()
|
||||
|
||||
# Incoming adapter
|
||||
self._api = self._create_api()
|
||||
|
||||
logger.info("ApplicationContainer initialized successfully")
|
||||
|
||||
@property
|
||||
def text_processor_service(self) -> ITextProcessor:
|
||||
"""Get the text processor service."""
|
||||
return self._text_processor_service
|
||||
"""
|
||||
Get the text processor service.
|
||||
|
||||
@property
|
||||
def api(self) -> TextProcessorAPI:
|
||||
"""Get the API adapter."""
|
||||
return self._api
|
||||
Returns:
|
||||
ITextProcessor: Core service implementing the incoming port
|
||||
"""
|
||||
return self._text_processor_service
|
||||
|
||||
def _create_repository(self) -> InMemoryDocumentRepository:
|
||||
"""
|
||||
@ -130,7 +132,7 @@ class ApplicationContainer:
|
||||
"""
|
||||
Create the core text processor service.
|
||||
|
||||
Injects all required dependencies (repositories, factories, contexts).
|
||||
Injects all required dependencies via Ports (Dependency Inversion).
|
||||
|
||||
Returns:
|
||||
Configured text processor service
|
||||
@ -142,24 +144,36 @@ class ApplicationContainer:
|
||||
repository=self._repository,
|
||||
)
|
||||
|
||||
def _create_api(self) -> TextProcessorAPI:
|
||||
"""
|
||||
Create the FastAPI adapter.
|
||||
|
||||
Injects the text processor service.
|
||||
def get_processor_service() -> ITextProcessor:
|
||||
"""
|
||||
Lazy singleton provider for the text processor service.
|
||||
|
||||
Returns:
|
||||
Configured API adapter
|
||||
"""
|
||||
logger.debug("Creating TextProcessorAPI")
|
||||
return TextProcessorAPI(text_processor=self._text_processor_service)
|
||||
This function ensures the ApplicationContainer is instantiated only once
|
||||
and returns the core service. API routes pull the service via this function.
|
||||
|
||||
Returns:
|
||||
ITextProcessor: Core service implementing the incoming port
|
||||
|
||||
Example:
|
||||
>>> service = get_processor_service()
|
||||
>>> document = service.process_document(file_path, strategy)
|
||||
"""
|
||||
global _container
|
||||
|
||||
if _container is None:
|
||||
logger.info("Lazy initializing ApplicationContainer (first access)")
|
||||
_container = ApplicationContainer(log_level="INFO")
|
||||
|
||||
return _container.text_processor_service
|
||||
|
||||
|
||||
def create_application(log_level: str = "INFO") -> ApplicationContainer:
|
||||
"""
|
||||
Factory function to create a fully wired application.
|
||||
Factory function to create a fully wired application container.
|
||||
|
||||
This is the main entry point for dependency injection.
|
||||
This is the main entry point for manual dependency injection.
|
||||
For API routes, use get_processor_service() instead.
|
||||
|
||||
Args:
|
||||
log_level: Logging level for the application
|
||||
@ -170,24 +184,6 @@ def create_application(log_level: str = "INFO") -> ApplicationContainer:
|
||||
Example:
|
||||
>>> container = create_application(log_level="DEBUG")
|
||||
>>> service = container.text_processor_service
|
||||
>>> api = container.api
|
||||
"""
|
||||
logger.info("Creating application container")
|
||||
logger.info("Creating application container via factory")
|
||||
return ApplicationContainer(log_level=log_level)
|
||||
|
||||
|
||||
def get_text_processor_service(
|
||||
container: ApplicationContainer,
|
||||
) -> ITextProcessor:
|
||||
"""
|
||||
Get the text processor service from container.
|
||||
|
||||
This is a convenience function for accessing the service.
|
||||
|
||||
Args:
|
||||
container: Application container
|
||||
|
||||
Returns:
|
||||
Text processor service instance
|
||||
"""
|
||||
return container.text_processor_service
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user