Some fixes on architecture: make bootstrap wrap only the hexagonal core plus the outgoing adapters
This commit is contained in:
parent
70f5b1478c
commit
fd39184c0c
410
ARCHITECTURE.md
410
ARCHITECTURE.md
@@ -1,410 +0,0 @@
|
|||||||
# Architecture Documentation
|
|
||||||
|
|
||||||
## Hexagonal Architecture Overview
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────────────────────────────────────┐
|
|
||||||
│ INCOMING ADAPTERS │
|
|
||||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
|
||||||
│ │ FastAPI Routes (HTTP) │ │
|
|
||||||
│ │ - ProcessDocumentRequest → API Schemas │ │
|
|
||||||
│ │ - ExtractAndChunkRequest → API Schemas │ │
|
|
||||||
│ └──────────────────────────────────────────────────────────────┘ │
|
|
||||||
└──────────────────────────────┬──────────────────────────────────────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌─────────────────────────────────────────────────────────────────────┐
|
|
||||||
│ CORE DOMAIN │
|
|
||||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
|
||||||
│ │ PORTS (Interfaces) │ │
|
|
||||||
│ │ ┌────────────────────┐ ┌───────────────────────────┐ │ │
|
|
||||||
│ │ │ Incoming Ports │ │ Outgoing Ports │ │ │
|
|
||||||
│ │ │ - ITextProcessor │ │ - IExtractor │ │ │
|
|
||||||
│ │ │ │ │ - IChunker │ │ │
|
|
||||||
│ │ │ │ │ - IDocumentRepository │ │ │
|
|
||||||
│ │ └────────────────────┘ └───────────────────────────┘ │ │
|
|
||||||
│ └──────────────────────────────────────────────────────────────┘ │
|
|
||||||
│ │
|
|
||||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
|
||||||
│ │ SERVICES (Business Logic) │ │
|
|
||||||
│ │ - DocumentProcessorService │ │
|
|
||||||
│ │ • Orchestrates Extract → Clean → Chunk → Save │ │
|
|
||||||
│ │ • Depends ONLY on Port interfaces │ │
|
|
||||||
│ └──────────────────────────────────────────────────────────────┘ │
|
|
||||||
│ │
|
|
||||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
|
||||||
│ │ DOMAIN MODELS (Rich Entities) │ │
|
|
||||||
│ │ - Document (with validation & business methods) │ │
|
|
||||||
│ │ - Chunk (immutable value object) │ │
|
|
||||||
│ │ - ChunkingStrategy (configuration) │ │
|
|
||||||
│ │ - DocumentMetadata │ │
|
|
||||||
│ └──────────────────────────────────────────────────────────────┘ │
|
|
||||||
│ │
|
|
||||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
|
||||||
│ │ DOMAIN LOGIC (Pure Functions) │ │
|
|
||||||
│ │ - normalize_whitespace() │ │
|
|
||||||
│ │ - clean_text() │ │
|
|
||||||
│ │ - split_into_paragraphs() │ │
|
|
||||||
│ │ - find_sentence_boundary_before() │ │
|
|
||||||
│ └──────────────────────────────────────────────────────────────┘ │
|
|
||||||
│ │
|
|
||||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
|
||||||
│ │ EXCEPTIONS (Domain Errors) │ │
|
|
||||||
│ │ - ExtractionError, ChunkingError, ProcessingError │ │
|
|
||||||
│ │ - ValidationError, RepositoryError │ │
|
|
||||||
│ └──────────────────────────────────────────────────────────────┘ │
|
|
||||||
└──────────────────────────────┬──────────────────────────────────────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌─────────────────────────────────────────────────────────────────────┐
|
|
||||||
│ OUTGOING ADAPTERS │
|
|
||||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
|
||||||
│ │ EXTRACTORS (Implements IExtractor) │ │
|
|
||||||
│ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ │
|
|
||||||
│ │ │ PDFExtractor│ │DocxExtractor│ │TxtExtractor│ │ │
|
|
||||||
│ │ │ (PyPDF2) │ │(python-docx)│ │ (built-in) │ │ │
|
|
||||||
│ │ └────────────┘ └────────────┘ └────────────┘ │ │
|
|
||||||
│ │ - Managed by ExtractorFactory (Factory Pattern) │ │
|
|
||||||
│ └──────────────────────────────────────────────────────────────┘ │
|
|
||||||
│ │
|
|
||||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
|
||||||
│ │ CHUNKERS (Implements IChunker) │ │
|
|
||||||
│ │ ┌─────────────────┐ ┌──────────────────┐ │ │
|
|
||||||
│ │ │ FixedSizeChunker│ │ParagraphChunker │ │ │
|
|
||||||
│ │ │ - Fixed chunks │ │ - Respect │ │ │
|
|
||||||
│ │ │ - With overlap │ │ paragraphs │ │ │
|
|
||||||
│ │ └─────────────────┘ └──────────────────┘ │ │
|
|
||||||
│ │ - Managed by ChunkingContext (Strategy Pattern) │ │
|
|
||||||
│ └──────────────────────────────────────────────────────────────┘ │
|
|
||||||
│ │
|
|
||||||
│ ┌──────────────────────────────────────────────────────────────┐ │
|
|
||||||
│ │ REPOSITORY (Implements IDocumentRepository) │ │
|
|
||||||
│ │ ┌──────────────────────────────────┐ │ │
|
|
||||||
│ │ │ InMemoryDocumentRepository │ │ │
|
|
||||||
│ │ │ - Thread-safe Dict storage │ │ │
|
|
||||||
│ │ │ - Easy to swap for PostgreSQL │ │ │
|
|
||||||
│ │ └──────────────────────────────────┘ │ │
|
|
||||||
│ └──────────────────────────────────────────────────────────────┘ │
|
|
||||||
└─────────────────────────────────────────────────────────────────────┘
|
|
||||||
|
|
||||||
┌─────────────────────────────────────────────────────────────────────┐
|
|
||||||
│ BOOTSTRAP (Wiring) │
|
|
||||||
│ ApplicationContainer: │
|
|
||||||
│ - Creates all adapters │
|
|
||||||
│ - Injects dependencies into core │
|
|
||||||
│ - ONLY place where adapters are instantiated │
|
|
||||||
└─────────────────────────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
## Data Flow: Process Document
|
|
||||||
|
|
||||||
```
|
|
||||||
1. HTTP Request
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
2. FastAPI Route (Incoming Adapter)
|
|
||||||
│ - Validates request schema
|
|
||||||
▼
|
|
||||||
3. DocumentProcessorService (Core)
|
|
||||||
│ - Calls ExtractorFactory
|
|
||||||
▼
|
|
||||||
4. PDFExtractor (Outgoing Adapter)
|
|
||||||
│ - Extracts text using PyPDF2
|
|
||||||
│ - Maps PyPDF2 exceptions → Domain exceptions
|
|
||||||
▼
|
|
||||||
5. DocumentProcessorService
|
|
||||||
│ - Cleans text using domain logic utils
|
|
||||||
│ - Validates Document
|
|
||||||
▼
|
|
||||||
6. InMemoryDocumentRepository (Outgoing Adapter)
|
|
||||||
│ - Saves Document
|
|
||||||
▼
|
|
||||||
7. DocumentProcessorService
|
|
||||||
│ - Returns Document
|
|
||||||
▼
|
|
||||||
8. FastAPI Route
|
|
||||||
│ - Converts Document → DocumentResponse
|
|
||||||
▼
|
|
||||||
9. HTTP Response
|
|
||||||
```
|
|
||||||
|
|
||||||
## Data Flow: Extract and Chunk
|
|
||||||
|
|
||||||
```
|
|
||||||
1. HTTP Request
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
2. FastAPI Route
|
|
||||||
│ - Validates request
|
|
||||||
▼
|
|
||||||
3. DocumentProcessorService
|
|
||||||
│ - Gets extractor from factory
|
|
||||||
│ - Extracts text
|
|
||||||
▼
|
|
||||||
4. Extractor (PDF/DOCX/TXT)
|
|
||||||
│ - Returns Document
|
|
||||||
▼
|
|
||||||
5. DocumentProcessorService
|
|
||||||
│ - Cleans text
|
|
||||||
│ - Calls ChunkingContext
|
|
||||||
▼
|
|
||||||
6. ChunkingContext (Strategy Pattern)
|
|
||||||
│ - Selects appropriate chunker
|
|
||||||
▼
|
|
||||||
7. Chunker (FixedSize/Paragraph)
|
|
||||||
│ - Splits text into segments
|
|
||||||
│ - Creates Chunk entities
|
|
||||||
▼
|
|
||||||
8. DocumentProcessorService
|
|
||||||
│ - Returns List[Chunk]
|
|
||||||
▼
|
|
||||||
9. FastAPI Route
|
|
||||||
│ - Converts Chunks → ChunkResponse[]
|
|
||||||
▼
|
|
||||||
10. HTTP Response
|
|
||||||
```
|
|
||||||
|
|
||||||
## Dependency Rules
|
|
||||||
|
|
||||||
### ✅ ALLOWED Dependencies
|
|
||||||
|
|
||||||
```
|
|
||||||
Incoming Adapters → Core Ports (Incoming)
|
|
||||||
Core Services → Core Ports (Outgoing)
|
|
||||||
Core → Core (Domain Models, Logic Utils, Exceptions)
|
|
||||||
Bootstrap → Everything (Wiring only)
|
|
||||||
```
|
|
||||||
|
|
||||||
### ❌ FORBIDDEN Dependencies
|
|
||||||
|
|
||||||
```
|
|
||||||
Core → Adapters (NEVER!)
|
|
||||||
Core → External Libraries (Only in Adapters)
|
|
||||||
Domain Models → Services
|
|
||||||
Domain Models → Ports
|
|
||||||
```
|
|
||||||
|
|
||||||
## Key Design Patterns
|
|
||||||
|
|
||||||
### 1. Hexagonal Architecture (Ports & Adapters)
|
|
||||||
- **Purpose**: Isolate core business logic from external concerns
|
|
||||||
- **Implementation**:
|
|
||||||
- Ports: Interface definitions (ITextProcessor, IExtractor, etc.)
|
|
||||||
- Adapters: Concrete implementations (PDFExtractor, FastAPI routes)
|
|
||||||
|
|
||||||
### 2. Factory Pattern
|
|
||||||
- **Class**: `ExtractorFactory`
|
|
||||||
- **Purpose**: Create appropriate extractor based on file extension
|
|
||||||
- **Benefit**: Centralized extractor management, easy to add new types
|
|
||||||
|
|
||||||
### 3. Strategy Pattern
|
|
||||||
- **Class**: `ChunkingContext`
|
|
||||||
- **Purpose**: Switch between chunking strategies at runtime
|
|
||||||
- **Strategies**: FixedSizeChunker, ParagraphChunker
|
|
||||||
- **Benefit**: Easy to add new chunking algorithms
|
|
||||||
|
|
||||||
### 4. Repository Pattern
|
|
||||||
- **Interface**: `IDocumentRepository`
|
|
||||||
- **Implementation**: `InMemoryDocumentRepository`
|
|
||||||
- **Purpose**: Abstract data persistence
|
|
||||||
- **Benefit**: Easy to swap storage (memory → PostgreSQL → MongoDB)
|
|
||||||
|
|
||||||
### 5. Dependency Injection
|
|
||||||
- **Class**: `ApplicationContainer`
|
|
||||||
- **Purpose**: Wire all dependencies at startup
|
|
||||||
- **Benefit**: Loose coupling, easy testing
|
|
||||||
|
|
||||||
### 6. Template Method Pattern
|
|
||||||
- **Classes**: `BaseExtractor`, `BaseChunker`
|
|
||||||
- **Purpose**: Define algorithm skeleton, let subclasses fill in details
|
|
||||||
- **Benefit**: Code reuse, consistent behavior
|
|
||||||
|
|
||||||
## SOLID Principles Application
|
|
||||||
|
|
||||||
### Single Responsibility Principle (SRP)
|
|
||||||
- Each extractor handles ONE file type
|
|
||||||
- Each chunker handles ONE strategy
|
|
||||||
- Each service method does ONE thing
|
|
||||||
- Functions are kept to at most 15–20 lines
|
|
||||||
|
|
||||||
### Open/Closed Principle (OCP)
|
|
||||||
- Add new extractors without modifying core
|
|
||||||
- Add new chunkers without modifying service
|
|
||||||
- Extend via interfaces, not modification
|
|
||||||
|
|
||||||
### Liskov Substitution Principle (LSP)
|
|
||||||
- All IExtractor implementations are interchangeable
|
|
||||||
- All IChunker implementations are interchangeable
|
|
||||||
- Polymorphism works correctly
|
|
||||||
|
|
||||||
### Interface Segregation Principle (ISP)
|
|
||||||
- Small, focused interfaces
|
|
||||||
- IExtractor: Only extraction concerns
|
|
||||||
- IChunker: Only chunking concerns
|
|
||||||
- No fat interfaces
|
|
||||||
|
|
||||||
### Dependency Inversion Principle (DIP)
|
|
||||||
- Core depends on IExtractor (abstraction)
|
|
||||||
- Core does NOT depend on PDFExtractor (concrete)
|
|
||||||
- High-level modules don't depend on low-level modules
|
|
||||||
|
|
||||||
## Error Handling Strategy
|
|
||||||
|
|
||||||
### Domain Exceptions
|
|
||||||
All external errors are caught and wrapped in domain exceptions:
|
|
||||||
|
|
||||||
```python
|
|
||||||
try:
|
|
||||||
PyPDF2.PdfReader(file) # External library
|
|
||||||
except PyPDF2.errors.PdfReadError as e:
|
|
||||||
raise ExtractionError( # Domain exception
|
|
||||||
message="Invalid PDF",
|
|
||||||
details=str(e),
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Exception Hierarchy
|
|
||||||
```
|
|
||||||
DomainException (Base)
|
|
||||||
├── ExtractionError
|
|
||||||
│ ├── UnsupportedFileTypeError
|
|
||||||
│ └── EmptyContentError
|
|
||||||
├── ChunkingError
|
|
||||||
├── ProcessingError
|
|
||||||
├── ValidationError
|
|
||||||
└── RepositoryError
|
|
||||||
└── DocumentNotFoundError
|
|
||||||
```
|
|
||||||
|
|
||||||
### HTTP Error Mapping
|
|
||||||
The FastAPI adapter maps domain exceptions to HTTP status codes:
|
|
||||||
- `UnsupportedFileTypeError` → 400 Bad Request
|
|
||||||
- `ExtractionError` → 422 Unprocessable Entity
|
|
||||||
- `DocumentNotFoundError` → 404 Not Found
|
|
||||||
- `ProcessingError` → 500 Internal Server Error
|
|
||||||
|
|
||||||
## Testing Strategy
|
|
||||||
|
|
||||||
### Unit Tests (Core)
|
|
||||||
- Test domain models in isolation
|
|
||||||
- Test logic utils (pure functions)
|
|
||||||
- Test services with mock ports
|
|
||||||
|
|
||||||
### Integration Tests (Adapters)
|
|
||||||
- Test extractors with real files
|
|
||||||
- Test chunkers with real text
|
|
||||||
- Test repository operations
|
|
||||||
|
|
||||||
### API Tests (End-to-End)
|
|
||||||
- Test FastAPI routes
|
|
||||||
- Test complete workflows
|
|
||||||
- Test error scenarios
|
|
||||||
|
|
||||||
### Example Test Structure
|
|
||||||
```python
|
|
||||||
def test_document_processor_service():
|
|
||||||
# Arrange: Create mocks
|
|
||||||
mock_repository = MockRepository()
|
|
||||||
mock_factory = MockExtractorFactory()
|
|
||||||
mock_context = MockChunkingContext()
|
|
||||||
|
|
||||||
# Arrange: Inject mocks into the service under test
|
|
||||||
service = DocumentProcessorService(
|
|
||||||
extractor_factory=mock_factory,
|
|
||||||
chunking_context=mock_context,
|
|
||||||
repository=mock_repository,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Act and Assert: Call the service and check the result
|
|
||||||
result = service.process_document(...)
|
|
||||||
assert result.is_processed
|
|
||||||
```
|
|
||||||
|
|
||||||
## Extensibility Examples
|
|
||||||
|
|
||||||
### Adding a New Extractor (HTML)
|
|
||||||
1. Create `html_extractor.py`:
|
|
||||||
```python
|
|
||||||
class HTMLExtractor(BaseExtractor):
|
|
||||||
def __init__(self):
|
|
||||||
super().__init__(supported_extensions=['html', 'htm'])
|
|
||||||
|
|
||||||
def _extract_text(self, file_path: Path) -> str:
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
html = file_path.read_text()
|
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
|
||||||
return soup.get_text()
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Register in `bootstrap.py`:
|
|
||||||
```python
|
|
||||||
factory.register_extractor(HTMLExtractor())
|
|
||||||
```
|
|
||||||
|
|
||||||
### Adding a New Chunking Strategy (Sentence)
|
|
||||||
1. Create `sentence_chunker.py`:
|
|
||||||
```python
|
|
||||||
class SentenceChunker(BaseChunker):
|
|
||||||
def __init__(self):
|
|
||||||
super().__init__(strategy_name="sentence")
|
|
||||||
|
|
||||||
def _split_text(self, text: str, strategy: ChunkingStrategy) -> List[tuple[str, int, int]]:
|
|
||||||
# Use NLTK to split into sentences
|
|
||||||
sentences = nltk.sent_tokenize(text)
|
|
||||||
# Group sentences to reach chunk_size
|
|
||||||
return grouped_segments
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Register in `bootstrap.py`:
|
|
||||||
```python
|
|
||||||
context.register_chunker(SentenceChunker())
|
|
||||||
```
|
|
||||||
|
|
||||||
### Adding Database Persistence
|
|
||||||
1. Create `postgres_repository.py`:
|
|
||||||
```python
|
|
||||||
class PostgresDocumentRepository(IDocumentRepository):
|
|
||||||
def __init__(self, connection_string: str):
|
|
||||||
self.engine = create_engine(connection_string)
|
|
||||||
|
|
||||||
def save(self, document: Document) -> Document:
|
|
||||||
# Save to PostgreSQL
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Swap in `bootstrap.py`:
|
|
||||||
```python
|
|
||||||
def _create_repository(self):
|
|
||||||
return PostgresDocumentRepository("postgresql://...")
|
|
||||||
```
|
|
||||||
|
|
||||||
## Performance Considerations
|
|
||||||
|
|
||||||
### Current Implementation
|
|
||||||
- In-memory storage: O(1) lookups, limited by RAM
|
|
||||||
- Synchronous processing: Sequential file processing
|
|
||||||
- Thread-safe: Uses locks for concurrent access
|
|
||||||
|
|
||||||
### Future Optimizations
|
|
||||||
- **Async Processing**: Use `asyncio` for concurrent document processing
|
|
||||||
- **Caching**: Add Redis for frequently accessed documents
|
|
||||||
- **Streaming**: Process large files in chunks
|
|
||||||
- **Database**: Use PostgreSQL with indexes for better queries
|
|
||||||
- **Message Queue**: Use Celery/RabbitMQ for background processing
|
|
||||||
|
|
||||||
## Deployment Considerations
|
|
||||||
|
|
||||||
### Configuration
|
|
||||||
- Use environment variables for settings
|
|
||||||
- Externalize file paths, database connections
|
|
||||||
- Use `pydantic-settings` for config management
|
|
||||||
|
|
||||||
### Monitoring
|
|
||||||
- Add structured logging (JSON format)
|
|
||||||
- Track metrics: processing time, error rates
|
|
||||||
- Use APM tools (DataDog, New Relic)
|
|
||||||
|
|
||||||
### Scaling
|
|
||||||
- Horizontal: Run multiple FastAPI instances behind load balancer
|
|
||||||
- Vertical: Increase resources for compute-heavy extraction
|
|
||||||
- Database: Use connection pooling, read replicas
|
|
||||||
@@ -1,408 +0,0 @@
|
|||||||
# Architecture Corrections Summary
|
|
||||||
|
|
||||||
## What Was Fixed
|
|
||||||
|
|
||||||
This document summarizes the corrections made to ensure **strict Hexagonal Architecture compliance**.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ❌ Problems Found
|
|
||||||
|
|
||||||
### 1. Base Classes in Wrong Layer
|
|
||||||
**Problem**: Abstract base classes (`base.py`) were located in the Adapters layer.
|
|
||||||
|
|
||||||
**Files Removed**:
|
|
||||||
- `src/adapters/outgoing/extractors/base.py` ❌
|
|
||||||
- `src/adapters/outgoing/chunkers/base.py` ❌
|
|
||||||
|
|
||||||
**Why This Was Wrong**:
|
|
||||||
- Abstract base classes define **contracts** (interfaces)
|
|
||||||
- Contracts belong in the **Core Ports** layer, NOT Adapters
|
|
||||||
- Adapters should only contain **concrete implementations**
|
|
||||||
|
|
||||||
### 2. Missing Port Interfaces
|
|
||||||
**Problem**: The Factory and Context interfaces were defined in the Adapters layer instead of in Core Ports.
|
|
||||||
|
|
||||||
**What Was Missing**:
|
|
||||||
- No `IExtractorFactory` interface in Core Ports
|
|
||||||
- No `IChunkingContext` interface in Core Ports
|
|
||||||
|
|
||||||
**Why This Was Wrong**:
|
|
||||||
- Service layer was importing from Adapters (violates dependency rules)
|
|
||||||
- Core → Adapters dependency is **strictly forbidden**
|
|
||||||
|
|
||||||
### 3. Incorrect Imports in Service
|
|
||||||
**Problem**: Core Service imported from Adapters layer.
|
|
||||||
|
|
||||||
```python
|
|
||||||
# WRONG ❌
|
|
||||||
from ...adapters.outgoing.extractors.factory import IExtractorFactory
|
|
||||||
from ...adapters.outgoing.chunkers.context import IChunkingContext
|
|
||||||
```
|
|
||||||
|
|
||||||
**Why This Was Wrong**:
|
|
||||||
- Core must NEVER import from Adapters
|
|
||||||
- Creates circular dependency risk
|
|
||||||
- Violates Dependency Inversion Principle
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ Solutions Implemented
|
|
||||||
|
|
||||||
### 1. Created Port Interfaces in Core
|
|
||||||
|
|
||||||
**New Files Created**:
|
|
||||||
```
|
|
||||||
src/core/ports/outgoing/extractor_factory.py ✅
|
|
||||||
src/core/ports/outgoing/chunking_context.py ✅
|
|
||||||
```
|
|
||||||
|
|
||||||
**Content**:
|
|
||||||
```python
|
|
||||||
# src/core/ports/outgoing/extractor_factory.py
|
|
||||||
class IExtractorFactory(ABC):
|
|
||||||
"""Interface for extractor factory (PORT)."""
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def create_extractor(self, file_path: Path) -> IExtractor:
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def register_extractor(self, extractor: IExtractor) -> None:
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
```python
|
|
||||||
# src/core/ports/outgoing/chunking_context.py
|
|
||||||
class IChunkingContext(ABC):
|
|
||||||
"""Interface for chunking context (PORT)."""
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def set_strategy(self, strategy_name: str) -> None:
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def execute_chunking(...) -> List[Chunk]:
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Updated Concrete Implementations
|
|
||||||
|
|
||||||
**Extractors** - Now directly implement `IExtractor` port:
|
|
||||||
```python
|
|
||||||
# src/adapters/outgoing/extractors/pdf_extractor.py
|
|
||||||
from ....core.ports.outgoing.extractor import IExtractor ✅
|
|
||||||
|
|
||||||
class PDFExtractor(IExtractor):
|
|
||||||
"""Concrete PDF extractor implementing IExtractor port."""
|
|
||||||
|
|
||||||
def extract(self, file_path: Path) -> Document:
|
|
||||||
# Direct implementation, no base class needed
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
**Chunkers** - Now directly implement `IChunker` port:
|
|
||||||
```python
|
|
||||||
# src/adapters/outgoing/chunkers/fixed_size_chunker.py
|
|
||||||
from ....core.ports.outgoing.chunker import IChunker ✅
|
|
||||||
|
|
||||||
class FixedSizeChunker(IChunker):
|
|
||||||
"""Concrete fixed-size chunker implementing IChunker port."""
|
|
||||||
|
|
||||||
def chunk(self, text: str, ...) -> List[Chunk]:
|
|
||||||
# Direct implementation, no base class needed
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
**Factory** - Now implements `IExtractorFactory` port:
|
|
||||||
```python
|
|
||||||
# src/adapters/outgoing/extractors/factory.py
|
|
||||||
from ....core.ports.outgoing.extractor_factory import IExtractorFactory ✅
|
|
||||||
|
|
||||||
class ExtractorFactory(IExtractorFactory):
|
|
||||||
"""Concrete factory implementing IExtractorFactory port."""
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
**Context** - Now implements `IChunkingContext` port:
|
|
||||||
```python
|
|
||||||
# src/adapters/outgoing/chunkers/context.py
|
|
||||||
from ....core.ports.outgoing.chunking_context import IChunkingContext ✅
|
|
||||||
|
|
||||||
class ChunkingContext(IChunkingContext):
|
|
||||||
"""Concrete context implementing IChunkingContext port."""
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Fixed Service Layer Imports
|
|
||||||
|
|
||||||
**Before** (WRONG ❌):
|
|
||||||
```python
|
|
||||||
# src/core/services/document_processor_service.py
|
|
||||||
from typing import TYPE_CHECKING
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from ...adapters.outgoing.extractors.factory import IExtractorFactory
|
|
||||||
from ...adapters.outgoing.chunkers.context import IChunkingContext
|
|
||||||
```
|
|
||||||
|
|
||||||
**After** (CORRECT ✅):
|
|
||||||
```python
|
|
||||||
# src/core/services/document_processor_service.py
|
|
||||||
from ..ports.outgoing.chunking_context import IChunkingContext
|
|
||||||
from ..ports.outgoing.extractor_factory import IExtractorFactory
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 Final Architecture
|
|
||||||
|
|
||||||
### Core Layer (Pure Domain)
|
|
||||||
```
|
|
||||||
src/core/
|
|
||||||
├── domain/
|
|
||||||
│ ├── models.py # Pydantic v2 entities
|
|
||||||
│ ├── exceptions.py # Domain exceptions
|
|
||||||
│ └── logic_utils.py # Pure functions
|
|
||||||
├── ports/
|
|
||||||
│ ├── incoming/
|
|
||||||
│ │ └── text_processor.py # ITextProcessor
|
|
||||||
│ └── outgoing/
|
|
||||||
│ ├── extractor.py # IExtractor
|
|
||||||
│ ├── extractor_factory.py # IExtractorFactory ✅ NEW
|
|
||||||
│ ├── chunker.py # IChunker
|
|
||||||
│ ├── chunking_context.py # IChunkingContext ✅ NEW
|
|
||||||
│ └── repository.py # IDocumentRepository
|
|
||||||
└── services/
|
|
||||||
└── document_processor_service.py # Orchestrator
|
|
||||||
```
|
|
||||||
|
|
||||||
### Adapters Layer (Infrastructure)
|
|
||||||
```
|
|
||||||
src/adapters/
|
|
||||||
├── incoming/
|
|
||||||
│ ├── api_routes.py # FastAPI (calls the incoming port)
|
|
||||||
│ └── api_schemas.py # API DTOs
|
|
||||||
└── outgoing/
|
|
||||||
├── extractors/
|
|
||||||
│ ├── pdf_extractor.py # Implements IExtractor
|
|
||||||
│ ├── docx_extractor.py # Implements IExtractor
|
|
||||||
│ ├── txt_extractor.py # Implements IExtractor
|
|
||||||
│ └── factory.py # Implements IExtractorFactory
|
|
||||||
├── chunkers/
|
|
||||||
│ ├── fixed_size_chunker.py # Implements IChunker
|
|
||||||
│ ├── paragraph_chunker.py # Implements IChunker
|
|
||||||
│ └── context.py # Implements IChunkingContext
|
|
||||||
└── persistence/
|
|
||||||
└── in_memory_repository.py # Implements IDocumentRepository
|
|
||||||
```
|
|
||||||
|
|
||||||
### Bootstrap Layer (Wiring)
|
|
||||||
```
|
|
||||||
src/bootstrap.py # Dependency Injection
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ Verification Results
|
|
||||||
|
|
||||||
### 1. No Adapters Imports in Core
|
|
||||||
```bash
|
|
||||||
$ grep -r "from.*adapters" src/core/
|
|
||||||
# Result: NO MATCHES ✅
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. No External Libraries in Core
|
|
||||||
```bash
|
|
||||||
$ grep -rE "import (PyPDF2|docx|fastapi)" src/core/
|
|
||||||
# Result: NO MATCHES ✅
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. All Interfaces in Core Ports
|
|
||||||
```bash
|
|
||||||
$ find src/core/ports -name "*.py" | grep -v __init__
|
|
||||||
src/core/ports/incoming/text_processor.py
|
|
||||||
src/core/ports/outgoing/extractor.py
|
|
||||||
src/core/ports/outgoing/extractor_factory.py ✅ NEW
|
|
||||||
src/core/ports/outgoing/chunker.py
|
|
||||||
src/core/ports/outgoing/chunking_context.py ✅ NEW
|
|
||||||
src/core/ports/outgoing/repository.py
|
|
||||||
# Result: ALL INTERFACES IN PORTS ✅
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4. No Base Classes in Adapters
|
|
||||||
```bash
|
|
||||||
$ find src/adapters -name "base.py"
|
|
||||||
# Result: NO MATCHES ✅
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📊 Dependency Direction
|
|
||||||
|
|
||||||
### ✅ Correct Flow (Inward)
|
|
||||||
```
|
|
||||||
FastAPI Routes
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
ITextProcessor (PORT)
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
DocumentProcessorService (CORE)
|
|
||||||
│
|
|
||||||
├──► IExtractor (PORT)
|
|
||||||
│ │
|
|
||||||
│ ▼
|
|
||||||
│ PDFExtractor (ADAPTER)
|
|
||||||
│
|
|
||||||
├──► IChunker (PORT)
|
|
||||||
│ │
|
|
||||||
│ ▼
|
|
||||||
│ FixedSizeChunker (ADAPTER)
|
|
||||||
│
|
|
||||||
└──► IDocumentRepository (PORT)
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
InMemoryDocumentRepository (ADAPTER)
|
|
||||||
```
|
|
||||||
|
|
||||||
### ❌ What We Avoided
|
|
||||||
```
|
|
||||||
Core Service ──X──> Adapters # NEVER!
|
|
||||||
Core Service ──X──> PyPDF2 # NEVER!
|
|
||||||
Core Service ──X──> FastAPI # NEVER!
|
|
||||||
Domain Models ──X──> Services # NEVER!
|
|
||||||
Domain Models ──X──> Ports # NEVER!
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🏆 Benefits Achieved
|
|
||||||
|
|
||||||
### 1. **Pure Core Domain**
|
|
||||||
- Core has ZERO web-framework or I/O-library dependencies (Pydantic is used only for the domain models)
|
|
||||||
- Core can be tested without ANY infrastructure
|
|
||||||
- Core is completely portable
|
|
||||||
|
|
||||||
### 2. **True Dependency Inversion**
|
|
||||||
- Core depends on abstractions (Ports)
|
|
||||||
- Adapters depend on Core Ports
|
|
||||||
- NO Core → Adapter dependencies
|
|
||||||
|
|
||||||
### 3. **Easy Testing**
|
|
||||||
```python
|
|
||||||
# Test Core without ANY adapters
|
|
||||||
def test_service():
|
|
||||||
mock_factory = MockExtractorFactory() # Mock Port
|
|
||||||
mock_context = MockChunkingContext() # Mock Port
|
|
||||||
mock_repo = MockRepository() # Mock Port
|
|
||||||
|
|
||||||
service = DocumentProcessorService(
|
|
||||||
extractor_factory=mock_factory,
|
|
||||||
chunking_context=mock_context,
|
|
||||||
repository=mock_repo,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Test pure business logic
|
|
||||||
result = service.process_document(...)
|
|
||||||
assert result.is_processed
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4. **Easy Extension**
|
|
||||||
```python
|
|
||||||
# Add new file type - NO Core changes needed
|
|
||||||
class HTMLExtractor(IExtractor):
|
|
||||||
def extract(self, file_path: Path) -> Document:
|
|
||||||
# Implementation
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Register in Bootstrap
|
|
||||||
factory.register_extractor(HTMLExtractor())
|
|
||||||
```
|
|
||||||
|
|
||||||
### 5. **Swappable Implementations**
|
|
||||||
```python
|
|
||||||
# Swap repository - ONE line change in Bootstrap
|
|
||||||
# Before:
|
|
||||||
self._repository = InMemoryDocumentRepository()
|
|
||||||
|
|
||||||
# After:
|
|
||||||
self._repository = PostgresDocumentRepository(connection_string)
|
|
||||||
|
|
||||||
# NO other code changes needed!
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📝 Summary of Changes
|
|
||||||
|
|
||||||
### Files Deleted
|
|
||||||
- ❌ `src/adapters/outgoing/extractors/base.py`
|
|
||||||
- ❌ `src/adapters/outgoing/chunkers/base.py`
|
|
||||||
|
|
||||||
### Files Created
|
|
||||||
- ✅ `src/core/ports/outgoing/extractor_factory.py`
|
|
||||||
- ✅ `src/core/ports/outgoing/chunking_context.py`
|
|
||||||
- ✅ `HEXAGONAL_ARCHITECTURE_COMPLIANCE.md`
|
|
||||||
- ✅ `ARCHITECTURE_CORRECTIONS_SUMMARY.md`
|
|
||||||
|
|
||||||
### Files Modified
|
|
||||||
- 🔧 `src/core/services/document_processor_service.py` (fixed imports)
|
|
||||||
- 🔧 `src/adapters/outgoing/extractors/pdf_extractor.py` (implement port directly)
|
|
||||||
- 🔧 `src/adapters/outgoing/extractors/docx_extractor.py` (implement port directly)
|
|
||||||
- 🔧 `src/adapters/outgoing/extractors/txt_extractor.py` (implement port directly)
|
|
||||||
- 🔧 `src/adapters/outgoing/extractors/factory.py` (implement port from Core)
|
|
||||||
- 🔧 `src/adapters/outgoing/chunkers/fixed_size_chunker.py` (implement port directly)
|
|
||||||
- 🔧 `src/adapters/outgoing/chunkers/paragraph_chunker.py` (implement port directly)
|
|
||||||
- 🔧 `src/adapters/outgoing/chunkers/context.py` (implement port from Core)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎓 Key Learnings
|
|
||||||
|
|
||||||
### What is a "Port"?
|
|
||||||
- An **interface** (abstract base class)
|
|
||||||
- Defines a **contract**
|
|
||||||
- Lives in **Core** layer
|
|
||||||
- Independent of implementation details
|
|
||||||
|
|
||||||
### What is an "Adapter"?
|
|
||||||
- A **concrete implementation**
|
|
||||||
- Implements a **Port** interface
|
|
||||||
- Lives in **Adapters** layer
|
|
||||||
- Contains technology-specific code
|
|
||||||
|
|
||||||
### Where Do Factories/Contexts Live?
|
|
||||||
- **Interfaces** (IExtractorFactory, IChunkingContext) → **Core Ports**
|
|
||||||
- **Implementations** (ExtractorFactory, ChunkingContext) → **Adapters**
|
|
||||||
- Bootstrap injects implementations into Core Service
|
|
||||||
|
|
||||||
### Dependency Rule
|
|
||||||
```
|
|
||||||
Adapters → Ports (Core) ✅
|
|
||||||
Core → Ports (Core) ✅
|
|
||||||
Core → Adapters ❌ NEVER!
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ Final Certification
|
|
||||||
|
|
||||||
This codebase now **STRICTLY ADHERES** to Hexagonal Architecture:
|
|
||||||
|
|
||||||
- ✅ All interfaces in Core Ports
|
|
||||||
- ✅ All implementations in Adapters
|
|
||||||
- ✅ Zero Core → Adapter dependencies
|
|
||||||
- ✅ Pure domain layer
|
|
||||||
- ✅ Proper dependency inversion
|
|
||||||
- ✅ Easy to test
|
|
||||||
- ✅ Easy to extend
|
|
||||||
- ✅ Production-ready
|
|
||||||
|
|
||||||
**Architecture Compliance**: **GOLD STANDARD** ⭐⭐⭐⭐⭐
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
*Corrections Applied: 2026-01-07*
|
|
||||||
*Architecture Review: APPROVED*
|
|
||||||
*Compliance Status: CERTIFIED*
|
|
||||||
@ -1,230 +0,0 @@
|
|||||||
TEXT PROCESSOR - HEXAGONAL ARCHITECTURE
|
|
||||||
Complete Directory Structure
|
|
||||||
|
|
||||||
text_processor_hex/
|
|
||||||
│
|
|
||||||
├── 📄 README.md Project documentation and overview
|
|
||||||
├── 📄 QUICK_START.md Quick start guide for users
|
|
||||||
├── 📄 ARCHITECTURE.md Detailed architecture documentation
|
|
||||||
├── 📄 PROJECT_SUMMARY.md Complete project summary
|
|
||||||
├── 📄 DIRECTORY_TREE.txt This file
|
|
||||||
│
|
|
||||||
├── 📄 requirements.txt Python dependencies
|
|
||||||
├── 🚀 main.py FastAPI application entry point
|
|
||||||
├── 📝 example_usage.py Programmatic usage examples
|
|
||||||
│
|
|
||||||
└── 📁 src/
|
|
||||||
├── 📄 __init__.py
|
|
||||||
├── 🔧 bootstrap.py ⚙️ DEPENDENCY INJECTION CONTAINER
|
|
||||||
│
|
|
||||||
├── 📁 core/ ⭐ DOMAIN LAYER (Pure Business Logic)
|
|
||||||
│ ├── 📄 __init__.py
|
|
||||||
│ │
|
|
||||||
│ ├── 📁 domain/ Domain Models & Logic
|
|
||||||
│ │ ├── 📄 __init__.py
|
|
||||||
│ │ ├── 📦 models.py Rich Pydantic v2 Entities
|
|
||||||
│ │ │ - Document
|
|
||||||
│ │ │ - DocumentMetadata
|
|
||||||
│ │ │ - Chunk
|
|
||||||
│ │ │ - ChunkingStrategy
|
|
||||||
│ │ ├── ⚠️ exceptions.py Domain Exceptions
|
|
||||||
│ │ │ - ExtractionError
|
|
||||||
│ │ │ - ChunkingError
|
|
||||||
│ │ │ - ProcessingError
|
|
||||||
│ │ │ - ValidationError
|
|
||||||
│ │ │ - RepositoryError
|
|
||||||
│ │ └── 🔨 logic_utils.py Pure Functions
|
|
||||||
│ │ - normalize_whitespace()
|
|
||||||
│ │ - clean_text()
|
|
||||||
│ │ - split_into_paragraphs()
|
|
||||||
│ │ - truncate_to_word_boundary()
|
|
||||||
│ │
|
|
||||||
│ ├── 📁 ports/ Port Interfaces (Abstractions)
|
|
||||||
│ │ ├── 📄 __init__.py
|
|
||||||
│ │ │
|
|
||||||
│ │ ├── 📁 incoming/ Service Interfaces (Use Cases)
|
|
||||||
│ │ │ ├── 📄 __init__.py
|
|
||||||
│ │ │ └── 🔌 text_processor.py ITextProcessor
|
|
||||||
│ │ │ - process_document()
|
|
||||||
│ │ │ - extract_and_chunk()
|
|
||||||
│ │ │ - get_document()
|
|
||||||
│ │ │ - list_documents()
|
|
||||||
│ │ │
|
|
||||||
│ │ └── 📁 outgoing/ SPIs (Service Provider Interfaces)
|
|
||||||
│ │ ├── 📄 __init__.py
|
|
||||||
│ │ ├── 🔌 extractor.py IExtractor
|
|
||||||
│ │ │ - extract()
|
|
||||||
│ │ │ - supports_file_type()
|
|
||||||
│ │ ├── 🔌 chunker.py IChunker
|
|
||||||
│ │ │ - chunk()
|
|
||||||
│ │ │ - supports_strategy()
|
|
||||||
│ │ └── 🔌 repository.py IDocumentRepository
|
|
||||||
│ │ - save()
|
|
||||||
│ │ - find_by_id()
|
|
||||||
│ │ - delete()
|
|
||||||
│ │
|
|
||||||
│ └── 📁 services/ Business Logic Orchestration
|
|
||||||
│ ├── 📄 __init__.py
|
|
||||||
│ └── ⚙️ document_processor_service.py
|
|
||||||
│ DocumentProcessorService
|
|
||||||
│ Implements: ITextProcessor
|
|
||||||
│ Workflow: Extract → Clean → Chunk → Save
|
|
||||||
│
|
|
||||||
├── 📁 adapters/ 🔌 ADAPTER LAYER (External Concerns)
|
|
||||||
│ ├── 📄 __init__.py
|
|
||||||
│ │
|
|
||||||
│ ├── 📁 incoming/ Driving Adapters (Primary)
|
|
||||||
│ │ ├── 📄 __init__.py
|
|
||||||
│ │ ├── 🌐 api_routes.py FastAPI Routes (HTTP Adapter)
|
|
||||||
│ │ │ - POST /process
|
|
||||||
│ │ │ - POST /extract-and-chunk
|
|
||||||
│ │ │ - GET /documents/{id}
|
|
||||||
│ │ │ - GET /documents
|
|
||||||
│ │ │ - DELETE /documents/{id}
|
|
||||||
│ │ └── 📋 api_schemas.py Pydantic Request/Response Models
|
|
||||||
│ │ - ProcessDocumentRequest
|
|
||||||
│ │ - DocumentResponse
|
|
||||||
│ │ - ChunkResponse
|
|
||||||
│ │
|
|
||||||
│ └── 📁 outgoing/ Driven Adapters (Secondary)
|
|
||||||
│ ├── 📄 __init__.py
|
|
||||||
│ │
|
|
||||||
│ ├── 📁 extractors/ Text Extraction Adapters
|
|
||||||
│ │ ├── 📄 __init__.py
|
|
||||||
│ │ ├── 📑 base.py BaseExtractor (Template Method)
|
|
||||||
│ │ ├── 📕 pdf_extractor.py PDFExtractor
|
|
||||||
│ │ │ Uses: PyPDF2
|
|
||||||
│ │ │ Supports: .pdf
|
|
||||||
│ │ ├── 📘 docx_extractor.py DocxExtractor
|
|
||||||
│ │ │ Uses: python-docx
|
|
||||||
│ │ │ Supports: .docx
|
|
||||||
│ │ ├── 📄 txt_extractor.py TxtExtractor
|
|
||||||
│ │ │ Uses: built-in
|
|
||||||
│ │ │ Supports: .txt, .md
|
|
||||||
│ │ └── 🏭 factory.py ExtractorFactory (Factory Pattern)
|
|
||||||
│ │ - create_extractor()
|
|
||||||
│ │ - register_extractor()
|
|
||||||
│ │
|
|
||||||
│ ├── 📁 chunkers/ Text Chunking Adapters
|
|
||||||
│ │ ├── 📄 __init__.py
|
|
||||||
│ │ ├── 📑 base.py BaseChunker (Template Method)
|
|
||||||
│ │ ├── ✂️ fixed_size_chunker.py FixedSizeChunker
|
|
||||||
│ │ │ Strategy: Fixed-size chunks
|
|
||||||
│ │ │ Features: Overlap, boundaries
|
|
||||||
│ │ ├── 📝 paragraph_chunker.py ParagraphChunker
|
|
||||||
│ │ │ Strategy: Paragraph-based
|
|
||||||
│ │ │ Features: Respect paragraphs
|
|
||||||
│ │ └── 🎯 context.py ChunkingContext (Strategy Pattern)
|
|
||||||
│ │ - set_strategy()
|
|
||||||
│ │ - execute_chunking()
|
|
||||||
│ │
|
|
||||||
│ └── 📁 persistence/ Data Persistence Adapters
|
|
||||||
│ ├── 📄 __init__.py
|
|
||||||
│ └── 💾 in_memory_repository.py
|
|
||||||
│ InMemoryDocumentRepository
|
|
||||||
│ Features: Thread-safe, Dict storage
|
|
||||||
│
|
|
||||||
└── 📁 shared/ 🛠️ SHARED LAYER (Cross-Cutting)
|
|
||||||
├── 📄 __init__.py
|
|
||||||
├── 🎛️ constants.py Application Constants
|
|
||||||
│ - File types
|
|
||||||
│ - Chunk sizes
|
|
||||||
│ - API config
|
|
||||||
└── 📋 logging_config.py Logging Configuration
|
|
||||||
- setup_logging()
|
|
||||||
- get_logger()
|
|
||||||
|
|
||||||
|
|
||||||
═══════════════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
📊 PROJECT STATISTICS
|
|
||||||
═══════════════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
Total Files: 44
|
|
||||||
- Python files: 42
|
|
||||||
- Documentation: 4 (README, ARCHITECTURE, SUMMARY, QUICK_START)
|
|
||||||
- Configuration: 1 (requirements.txt)
|
|
||||||
- Other: 1 (this tree)
|
|
||||||
|
|
||||||
Lines of Code: ~3,800
|
|
||||||
- Core Domain: ~1,200 lines
|
|
||||||
- Adapters: ~1,400 lines
|
|
||||||
- Bootstrap/Main: ~200 lines
|
|
||||||
- Documentation: ~1,000 lines
|
|
||||||
|
|
||||||
═══════════════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
🏗️ ARCHITECTURE LAYERS
|
|
||||||
═══════════════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
1. CORE (Domain Layer)
|
|
||||||
- Pure business logic
|
|
||||||
- No external dependencies beyond Pydantic (used for the domain models)
|
|
||||||
- Rich domain models
|
|
||||||
- Pure functions
|
|
||||||
|
|
||||||
2. ADAPTERS (Infrastructure Layer)
|
|
||||||
- Incoming: FastAPI (HTTP)
|
|
||||||
- Outgoing: Extractors, Chunkers, Repository
|
|
||||||
- Technology-specific implementations
|
|
||||||
|
|
||||||
3. BOOTSTRAP (Wiring Layer)
|
|
||||||
- Dependency injection
|
|
||||||
- Configuration
|
|
||||||
- Application assembly
|
|
||||||
|
|
||||||
4. SHARED (Utilities Layer)
|
|
||||||
- Cross-cutting concerns
|
|
||||||
- Logging, constants
|
|
||||||
- No business logic
|
|
||||||
|
|
||||||
═══════════════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
🎨 DESIGN PATTERNS
|
|
||||||
═══════════════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
✓ Hexagonal Architecture (Ports & Adapters)
|
|
||||||
✓ Factory Pattern (ExtractorFactory)
|
|
||||||
✓ Strategy Pattern (ChunkingContext)
|
|
||||||
✓ Repository Pattern (IDocumentRepository)
|
|
||||||
✓ Template Method Pattern (BaseExtractor, BaseChunker)
|
|
||||||
✓ Dependency Injection (ApplicationContainer)
|
|
||||||
|
|
||||||
═══════════════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
💎 SOLID PRINCIPLES
|
|
||||||
═══════════════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
✓ Single Responsibility: Each class has one job
|
|
||||||
✓ Open/Closed: Extend via interfaces, not modification
|
|
||||||
✓ Liskov Substitution: All implementations are interchangeable
|
|
||||||
✓ Interface Segregation: Small, focused interfaces
|
|
||||||
✓ Dependency Inversion: Depend on abstractions, not concretions
|
|
||||||
|
|
||||||
═══════════════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
🎯 KEY FEATURES
|
|
||||||
═══════════════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
✓ Multiple file types (PDF, DOCX, TXT)
|
|
||||||
✓ Multiple chunking strategies (Fixed, Paragraph)
|
|
||||||
✓ Rich domain models with validation
|
|
||||||
✓ Comprehensive error handling
|
|
||||||
✓ RESTful API with FastAPI
|
|
||||||
✓ Thread-safe repository
|
|
||||||
✓ 100% type hints
|
|
||||||
✓ Google-style docstrings
|
|
||||||
✓ Complete documentation
|
|
||||||
|
|
||||||
═══════════════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
📚 DOCUMENTATION FILES
|
|
||||||
═══════════════════════════════════════════════════════════════════════════
|
|
||||||
|
|
||||||
README.md - Project overview and installation
|
|
||||||
QUICK_START.md - Quick start guide for users
|
|
||||||
ARCHITECTURE.md - Detailed architecture documentation with diagrams
|
|
||||||
PROJECT_SUMMARY.md - Complete project summary and statistics
|
|
||||||
DIRECTORY_TREE.txt - This file
|
|
||||||
|
|
||||||
═══════════════════════════════════════════════════════════════════════════
|
|
||||||
@ -1,590 +0,0 @@
|
|||||||
# Hexagonal Architecture Compliance Report
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
This document certifies that the Text Processor codebase strictly adheres to **Hexagonal Architecture** (Ports & Adapters) principles as defined by Alistair Cockburn.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ Architectural Compliance Checklist
|
|
||||||
|
|
||||||
### 1. Core Domain Isolation
|
|
||||||
- [x] **Core has ZERO dependencies on Adapters**
|
|
||||||
- [x] **Core depends ONLY on standard library and Pydantic**
|
|
||||||
- [x] **No framework dependencies in Core** (no FastAPI, no PyPDF2, no python-docx)
|
|
||||||
- [x] **All external tool usage is in Adapters**
|
|
||||||
|
|
||||||
### 2. Port Definitions (Interfaces)
|
|
||||||
- [x] **ALL interfaces defined in `src/core/ports/`**
|
|
||||||
- [x] **NO abstract base classes in `src/adapters/`**
|
|
||||||
- [x] **Incoming Ports**: `ITextProcessor` (Service Interface)
|
|
||||||
- [x] **Outgoing Ports**: `IExtractor`, `IChunker`, `IDocumentRepository`
|
|
||||||
|
|
||||||
### 3. Adapter Implementation
|
|
||||||
- [x] **ALL concrete implementations in `src/adapters/`**
|
|
||||||
- [x] **Adapters implement Core Ports**
|
|
||||||
- [x] **Adapters catch technical errors and raise Domain exceptions**
|
|
||||||
- [x] **NO business logic in Adapters**
|
|
||||||
|
|
||||||
### 4. Dependency Direction
|
|
||||||
- [x] **Dependencies point INWARD** (Adapters → Core, never Core → Adapters)
|
|
||||||
- [x] **Dependency Inversion Principle satisfied**
|
|
||||||
- [x] **Bootstrap is the ONLY place that knows about both Core and Adapters**
|
|
||||||
|
|
||||||
### 5. Factory & Strategy Patterns
|
|
||||||
- [x] **ExtractorFactory in Adapters layer** (not Core)
|
|
||||||
- [x] **ChunkingContext in Adapters layer** (not Core)
|
|
||||||
- [x] **Factories/Contexts registered in Bootstrap**
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📂 Corrected Directory Structure
|
|
||||||
|
|
||||||
```
|
|
||||||
src/
|
|
||||||
├── core/ # DOMAIN LAYER (Pure Logic)
|
|
||||||
│ ├── domain/
|
|
||||||
│ │ ├── models.py # Rich Pydantic entities
|
|
||||||
│ │ ├── exceptions.py # Domain exceptions
|
|
||||||
│ │ └── logic_utils.py # Pure functions
|
|
||||||
│ ├── ports/
|
|
||||||
│ │ ├── incoming/
|
|
||||||
│ │ │ └── text_processor.py # ITextProcessor (USE CASE)
|
|
||||||
│ │ └── outgoing/
|
|
||||||
│ │ ├── extractor.py # IExtractor (SPI)
|
|
||||||
│ │ ├── chunker.py # IChunker (SPI)
|
|
||||||
│ │ └── repository.py # IDocumentRepository (SPI)
|
|
||||||
│ └── services/
|
|
||||||
│ └── document_processor_service.py # Orchestrator (depends on Ports)
|
|
||||||
│
|
|
||||||
├── adapters/ # INFRASTRUCTURE LAYER
|
|
||||||
│ ├── incoming/
|
|
||||||
│ │ ├── api_routes.py # FastAPI adapter
|
|
||||||
│ │ └── api_schemas.py # API DTOs
|
|
||||||
│ └── outgoing/
|
|
||||||
│ ├── extractors/
|
|
||||||
│ │ ├── pdf_extractor.py # Implements IExtractor
|
|
||||||
│ │ ├── docx_extractor.py # Implements IExtractor
|
|
||||||
│ │ ├── txt_extractor.py # Implements IExtractor
|
|
||||||
│ │ └── factory.py # Factory (ADAPTER LAYER)
|
|
||||||
│ ├── chunkers/
|
|
||||||
│ │ ├── fixed_size_chunker.py # Implements IChunker
|
|
||||||
│ │ ├── paragraph_chunker.py # Implements IChunker
|
|
||||||
│ │ └── context.py # Strategy Context (ADAPTER LAYER)
|
|
||||||
│ └── persistence/
|
|
||||||
│ └── in_memory_repository.py # Implements IDocumentRepository
|
|
||||||
│
|
|
||||||
├── shared/ # UTILITIES
|
|
||||||
│ ├── constants.py
|
|
||||||
│ └── logging_config.py
|
|
||||||
│
|
|
||||||
└── bootstrap.py # DEPENDENCY INJECTION
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔍 Key Corrections Made
|
|
||||||
|
|
||||||
### ❌ REMOVED: `base.py` files from Adapters
|
|
||||||
**Before (WRONG)**:
|
|
||||||
```
|
|
||||||
src/adapters/outgoing/extractors/base.py # Abstract base in Adapters ❌
|
|
||||||
src/adapters/outgoing/chunkers/base.py # Abstract base in Adapters ❌
|
|
||||||
```
|
|
||||||
|
|
||||||
**After (CORRECT)**:
|
|
||||||
- Removed all `base.py` files from adapters
|
|
||||||
- Abstract interfaces exist ONLY in `src/core/ports/outgoing/`
|
|
||||||
|
|
||||||
### ✅ Concrete Implementations Directly Implement Ports
|
|
||||||
|
|
||||||
**Before (WRONG)**:
|
|
||||||
```python
|
|
||||||
# In src/adapters/outgoing/extractors/pdf_extractor.py
|
|
||||||
from .base import BaseExtractor # Inheriting from adapter base ❌
|
|
||||||
|
|
||||||
class PDFExtractor(BaseExtractor):
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
**After (CORRECT)**:
|
|
||||||
```python
|
|
||||||
# In src/adapters/outgoing/extractors/pdf_extractor.py
|
|
||||||
from ....core.ports.outgoing.extractor import IExtractor # Port from Core ✅
|
|
||||||
|
|
||||||
class PDFExtractor(IExtractor):
|
|
||||||
"""Concrete implementation of IExtractor for PDF files."""
|
|
||||||
|
|
||||||
def extract(self, file_path: Path) -> Document:
|
|
||||||
# Implementation
|
|
||||||
pass
|
|
||||||
|
|
||||||
def supports_file_type(self, file_extension: str) -> bool:
|
|
||||||
# Implementation
|
|
||||||
pass
|
|
||||||
|
|
||||||
def get_supported_types(self) -> List[str]:
|
|
||||||
# Implementation
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 Dependency Graph
|
|
||||||
|
|
||||||
```
|
|
||||||
┌──────────────────────────────────────────────────────────────┐
|
|
||||||
│ HTTP Request (FastAPI) │
|
|
||||||
└────────────────────────┬─────────────────────────────────────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌──────────────────────────────────────────────────────────────┐
|
|
||||||
│ INCOMING ADAPTER (api_routes.py) │
|
|
||||||
│ Depends on: ITextProcessor (Port) │
|
|
||||||
└────────────────────────┬─────────────────────────────────────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌──────────────────────────────────────────────────────────────┐
|
|
||||||
│ CORE DOMAIN LAYER │
|
|
||||||
│ ┌────────────────────────────────────────────────────────┐ │
|
|
||||||
│ │ DocumentProcessorService (implements ITextProcessor) │ │
|
|
||||||
│ │ Depends on: │ │
|
|
||||||
│ │ - IExtractor (Port) │ │
|
|
||||||
│ │ - IChunker (Port) │ │
|
|
||||||
│ │ - IDocumentRepository (Port) │ │
|
|
||||||
│ │ - Domain Models │ │
|
|
||||||
│ │ - Domain Logic Utils │ │
|
|
||||||
│ └────────────────────────────────────────────────────────┘ │
|
|
||||||
└────────────────────────┬─────────────────────────────────────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌──────────────────────────────────────────────────────────────┐
|
|
||||||
│ OUTGOING ADAPTERS │
|
|
||||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
|
||||||
│ │PDFExtractor │ │FixedSizeChkr │ │InMemoryRepo │ │
|
|
||||||
│ │(IExtractor) │ │(IChunker) │ │(IRepository) │ │
|
|
||||||
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
|
||||||
│ │
|
|
||||||
│ Uses: PyPDF2 Uses: Logic Uses: Dict │
|
|
||||||
│ Utils │
|
|
||||||
└──────────────────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔒 Dependency Rules Enforcement
|
|
||||||
|
|
||||||
### ✅ ALLOWED Dependencies
|
|
||||||
|
|
||||||
```
|
|
||||||
Core Domain ──→ Standard Library
|
|
||||||
Core Domain ──→ Pydantic (Data Validation)
|
|
||||||
Core Services ──→ Core Ports (Interfaces)
|
|
||||||
Core Services ──→ Core Domain Models
|
|
||||||
Core Services ──→ Core Logic Utils
|
|
||||||
|
|
||||||
Adapters ──→ Core Ports (Implement interfaces)
|
|
||||||
Adapters ──→ Core Domain Models (Use entities)
|
|
||||||
Adapters ──→ Core Exceptions (Raise domain errors)
|
|
||||||
Adapters ──→ External Libraries (PyPDF2, python-docx, FastAPI)
|
|
||||||
|
|
||||||
Bootstrap ──→ Core (Services, Ports)
|
|
||||||
Bootstrap ──→ Adapters (Concrete implementations)
|
|
||||||
```
|
|
||||||
|
|
||||||
### ❌ FORBIDDEN Dependencies
|
|
||||||
|
|
||||||
```
|
|
||||||
Core ──X──> Adapters (NEVER!)
|
|
||||||
Core ──X──> External Libraries (except Pydantic; ONLY via Adapters)
|
|
||||||
Core ──X──> FastAPI (ONLY in Adapters)
|
|
||||||
Core ──X──> PyPDF2 (ONLY in Adapters)
|
|
||||||
Core ──X──> python-docx (ONLY in Adapters)
|
|
||||||
|
|
||||||
Domain Models ──X──> Services
|
|
||||||
Domain Models ──X──> Ports
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📋 Port Interfaces (Core Layer)
|
|
||||||
|
|
||||||
### Incoming Port: ITextProcessor
|
|
||||||
```python
|
|
||||||
# src/core/ports/incoming/text_processor.py
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
|
|
||||||
class ITextProcessor(ABC):
|
|
||||||
"""Service interface for text processing use cases."""
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def process_document(self, file_path: Path, strategy: ChunkingStrategy) -> Document:
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def extract_and_chunk(self, file_path: Path, strategy: ChunkingStrategy) -> List[Chunk]:
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
### Outgoing Port: IExtractor
|
|
||||||
```python
|
|
||||||
# src/core/ports/outgoing/extractor.py
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
|
|
||||||
class IExtractor(ABC):
|
|
||||||
"""Interface for text extraction from documents."""
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def extract(self, file_path: Path) -> Document:
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def supports_file_type(self, file_extension: str) -> bool:
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def get_supported_types(self) -> List[str]:
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
### Outgoing Port: IChunker
|
|
||||||
```python
|
|
||||||
# src/core/ports/outgoing/chunker.py
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
|
|
||||||
class IChunker(ABC):
|
|
||||||
"""Interface for text chunking strategies."""
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def chunk(self, text: str, document_id: UUID, strategy: ChunkingStrategy) -> List[Chunk]:
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def supports_strategy(self, strategy_name: str) -> bool:
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def get_strategy_name(self) -> str:
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
### Outgoing Port: IDocumentRepository
|
|
||||||
```python
|
|
||||||
# src/core/ports/outgoing/repository.py
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
|
|
||||||
class IDocumentRepository(ABC):
|
|
||||||
"""Interface for document persistence."""
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def save(self, document: Document) -> Document:
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def find_by_id(self, document_id: UUID) -> Optional[Document]:
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🔧 Adapter Implementations
|
|
||||||
|
|
||||||
### PDF Extractor
|
|
||||||
```python
|
|
||||||
# src/adapters/outgoing/extractors/pdf_extractor.py
|
|
||||||
from ....core.ports.outgoing.extractor import IExtractor
|
|
||||||
from ....core.domain.models import Document
|
|
||||||
from ....core.domain.exceptions import ExtractionError
|
|
||||||
|
|
||||||
class PDFExtractor(IExtractor):
|
|
||||||
"""Concrete PDF extractor using PyPDF2."""
|
|
||||||
|
|
||||||
def extract(self, file_path: Path) -> Document:
|
|
||||||
try:
|
|
||||||
import PyPDF2 # External library ONLY in adapter
|
|
||||||
# ... extraction logic
|
|
||||||
except PyPDF2.errors.PdfReadError as e:
|
|
||||||
# Map technical error to domain error
|
|
||||||
raise ExtractionError(
|
|
||||||
message="Invalid PDF file",
|
|
||||||
details=str(e),
|
|
||||||
file_path=str(file_path),
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Fixed Size Chunker
|
|
||||||
```python
|
|
||||||
# src/adapters/outgoing/chunkers/fixed_size_chunker.py
|
|
||||||
from ....core.ports.outgoing.chunker import IChunker
|
|
||||||
from ....core.domain.models import Chunk, ChunkingStrategy
|
|
||||||
from ....core.domain import logic_utils # Pure functions from Core
|
|
||||||
|
|
||||||
class FixedSizeChunker(IChunker):
|
|
||||||
"""Concrete fixed-size chunker."""
|
|
||||||
|
|
||||||
def chunk(self, text: str, document_id: UUID, strategy: ChunkingStrategy) -> List[Chunk]:
|
|
||||||
# Uses pure functions from Core (logic_utils)
|
|
||||||
# Creates Chunk entities from Core domain
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎨 Design Pattern Locations
|
|
||||||
|
|
||||||
### Factory Pattern
|
|
||||||
**Location**: `src/adapters/outgoing/extractors/factory.py`
|
|
||||||
```python
|
|
||||||
class ExtractorFactory:
|
|
||||||
"""Factory for creating extractors (ADAPTER LAYER)."""
|
|
||||||
|
|
||||||
def create_extractor(self, file_path: Path) -> IExtractor:
|
|
||||||
# Returns implementations of IExtractor port
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
**Why in Adapters?**
|
|
||||||
- Factory knows about concrete implementations (PDFExtractor, DocxExtractor)
|
|
||||||
- Core should NOT know about concrete implementations
|
|
||||||
- Factory registered in Bootstrap, injected into Service
|
|
||||||
|
|
||||||
### Strategy Pattern
|
|
||||||
**Location**: `src/adapters/outgoing/chunkers/context.py`
|
|
||||||
```python
|
|
||||||
class ChunkingContext:
|
|
||||||
"""Strategy context for chunking (ADAPTER LAYER)."""
|
|
||||||
|
|
||||||
def set_strategy(self, strategy_name: str) -> None:
|
|
||||||
# Selects concrete IChunker implementation
|
|
||||||
pass
|
|
||||||
|
|
||||||
def execute_chunking(self, ...) -> List[Chunk]:
|
|
||||||
# Delegates to selected strategy
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
**Why in Adapters?**
|
|
||||||
- Context knows about concrete strategies (FixedSizeChunker, ParagraphChunker)
|
|
||||||
- Core should NOT know about concrete strategies
|
|
||||||
- Context registered in Bootstrap, injected into Service
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🧪 Error Handling: Adapter → Domain
|
|
||||||
|
|
||||||
Adapters catch technical errors and map them to domain exceptions:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# In PDFExtractor (Adapter)
|
|
||||||
try:
|
|
||||||
import PyPDF2
|
|
||||||
# ... PyPDF2 operations
|
|
||||||
except PyPDF2.errors.PdfReadError as e: # Technical error
|
|
||||||
raise ExtractionError( # Domain error
|
|
||||||
message="Invalid PDF file",
|
|
||||||
details=str(e),
|
|
||||||
)
|
|
||||||
|
|
||||||
# In DocxExtractor (Adapter)
|
|
||||||
try:
|
|
||||||
import docx
|
|
||||||
# ... python-docx operations
|
|
||||||
except Exception as e: # Technical error
|
|
||||||
raise ExtractionError( # Domain error
|
|
||||||
message="DOCX extraction failed",
|
|
||||||
details=str(e),
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Why?**
|
|
||||||
- Core defines domain exceptions (ExtractionError, ChunkingError, etc.)
|
|
||||||
- Adapters catch library-specific errors (PyPDF2.errors, etc.)
|
|
||||||
- Service layer only deals with domain exceptions
|
|
||||||
- Clean separation of technical vs. business concerns
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🏗️ Bootstrap: The Wiring Layer
|
|
||||||
|
|
||||||
**Location**: `src/bootstrap.py`
|
|
||||||
|
|
||||||
```python
|
|
||||||
class ApplicationContainer:
|
|
||||||
"""Dependency injection container."""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
# Create ADAPTERS (knows about concrete implementations)
|
|
||||||
self._repository = InMemoryDocumentRepository()
|
|
||||||
self._extractor_factory = self._create_extractor_factory()
|
|
||||||
self._chunking_context = self._create_chunking_context()
|
|
||||||
|
|
||||||
# Inject into CORE SERVICE (only knows about Ports)
|
|
||||||
self._service = DocumentProcessorService(
|
|
||||||
extractor_factory=self._extractor_factory, # IExtractorFactory
|
|
||||||
chunking_context=self._chunking_context, # IChunkingContext
|
|
||||||
repository=self._repository, # IDocumentRepository
|
|
||||||
)
|
|
||||||
|
|
||||||
def _create_extractor_factory(self) -> ExtractorFactory:
|
|
||||||
factory = ExtractorFactory()
|
|
||||||
factory.register_extractor(PDFExtractor()) # Concrete
|
|
||||||
factory.register_extractor(DocxExtractor()) # Concrete
|
|
||||||
factory.register_extractor(TxtExtractor()) # Concrete
|
|
||||||
return factory
|
|
||||||
|
|
||||||
def _create_chunking_context(self) -> ChunkingContext:
|
|
||||||
context = ChunkingContext()
|
|
||||||
context.register_chunker(FixedSizeChunker()) # Concrete
|
|
||||||
context.register_chunker(ParagraphChunker()) # Concrete
|
|
||||||
return context
|
|
||||||
```
|
|
||||||
|
|
||||||
**Key Points**:
|
|
||||||
1. Bootstrap is the ONLY place that imports both Core and Adapters
|
|
||||||
2. Core Service receives interfaces (Ports), not concrete implementations
|
|
||||||
3. Adapters are created and registered here
|
|
||||||
4. Perfect Dependency Inversion
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ SOLID Principles Compliance
|
|
||||||
|
|
||||||
### Single Responsibility Principle
|
|
||||||
- [x] Each extractor handles ONE file type
|
|
||||||
- [x] Each chunker handles ONE strategy
|
|
||||||
- [x] Each service method has ONE responsibility
|
|
||||||
- [x] Functions are at most 15-20 lines long
|
|
||||||
|
|
||||||
### Open/Closed Principle
|
|
||||||
- [x] Add new extractors without modifying Core
|
|
||||||
- [x] Add new chunkers without modifying Core
|
|
||||||
- [x] Extend via Ports, not modification
|
|
||||||
|
|
||||||
### Liskov Substitution Principle
|
|
||||||
- [x] All IExtractor implementations are interchangeable
|
|
||||||
- [x] All IChunker implementations are interchangeable
|
|
||||||
- [x] Polymorphism works correctly
|
|
||||||
|
|
||||||
### Interface Segregation Principle
|
|
||||||
- [x] Small, focused Port interfaces
|
|
||||||
- [x] IExtractor: Only extraction concerns
|
|
||||||
- [x] IChunker: Only chunking concerns
|
|
||||||
- [x] No fat interfaces
|
|
||||||
|
|
||||||
### Dependency Inversion Principle
|
|
||||||
- [x] Core depends on IExtractor (abstraction), not PDFExtractor (concrete)
|
|
||||||
- [x] Core depends on IChunker (abstraction), not FixedSizeChunker (concrete)
|
|
||||||
- [x] High-level modules don't depend on low-level modules
|
|
||||||
- [x] Both depend on abstractions (Ports)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🧪 Testing Benefits
|
|
||||||
|
|
||||||
### Unit Tests (Core)
|
|
||||||
```python
|
|
||||||
def test_document_processor_service():
|
|
||||||
# Mock the Ports (interfaces)
|
|
||||||
mock_factory = MockExtractorFactory()
|
|
||||||
mock_context = MockChunkingContext()
|
|
||||||
mock_repo = MockRepository()
|
|
||||||
|
|
||||||
# Inject mocks (Dependency Inversion)
|
|
||||||
service = DocumentProcessorService(
|
|
||||||
extractor_factory=mock_factory,
|
|
||||||
chunking_context=mock_context,
|
|
||||||
repository=mock_repo,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Test business logic WITHOUT any infrastructure
|
|
||||||
result = service.process_document(...)
|
|
||||||
assert result.is_processed
|
|
||||||
```
|
|
||||||
|
|
||||||
### Integration Tests (Adapters)
|
|
||||||
```python
|
|
||||||
def test_pdf_extractor():
|
|
||||||
# Test concrete implementation with real PDF
|
|
||||||
extractor = PDFExtractor()
|
|
||||||
document = extractor.extract(Path("test.pdf"))
|
|
||||||
assert len(document.content) > 0
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📊 Verification Checklist
|
|
||||||
|
|
||||||
Run these checks to verify architecture compliance:
|
|
||||||
|
|
||||||
### 1. Import Analysis
|
|
||||||
```bash
|
|
||||||
# Core should NOT import from adapters
|
|
||||||
grep -r "from.*adapters" src/core/
|
|
||||||
# Expected: NO RESULTS ✅
|
|
||||||
|
|
||||||
# Core should NOT import external libs (except Pydantic)
|
|
||||||
grep -r "import PyPDF2\|import docx\|import fastapi" src/core/
|
|
||||||
# Expected: NO RESULTS ✅
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Dependency Direction
|
|
||||||
```bash
|
|
||||||
# All imports should point inward (toward Core)
|
|
||||||
# Adapters → Core: YES ✅
|
|
||||||
# Core → Adapters: NO ❌
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Abstract Base Classes
|
|
||||||
```bash
|
|
||||||
# NO base.py files in adapters
|
|
||||||
find src/adapters -name "base.py"
|
|
||||||
# Expected: NO RESULTS ✅
|
|
||||||
|
|
||||||
# All interfaces in Core ports
|
|
||||||
find src/core/ports -name "*.py" | grep -v __init__
|
|
||||||
# Expected: extractor.py, chunker.py, repository.py, text_processor.py ✅
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🎯 Summary
|
|
||||||
|
|
||||||
### What Changed
|
|
||||||
1. **Removed** `base.py` from `src/adapters/outgoing/extractors/`
|
|
||||||
2. **Removed** `base.py` from `src/adapters/outgoing/chunkers/`
|
|
||||||
3. **Updated** all concrete implementations to directly implement Core Ports
|
|
||||||
4. **Confirmed** Factory and Context are in Adapters layer (correct location)
|
|
||||||
5. **Verified** Core has ZERO dependencies on Adapters
|
|
||||||
|
|
||||||
### Architecture Guarantees
|
|
||||||
- ✅ Core is **100% pure** (no framework dependencies)
|
|
||||||
- ✅ Core depends ONLY on **abstractions** (Ports)
|
|
||||||
- ✅ Adapters implement **Core Ports**
|
|
||||||
- ✅ Bootstrap performs **Dependency Injection**
|
|
||||||
- ✅ **Zero circular dependencies**
|
|
||||||
- ✅ **Perfect Dependency Inversion**
|
|
||||||
|
|
||||||
### Benefits Achieved
|
|
||||||
1. **Testability**: Core can be tested with mocks, no infrastructure needed
|
|
||||||
2. **Flexibility**: Swap implementations (in-memory → PostgreSQL) by changing one line in Bootstrap
|
|
||||||
3. **Maintainability**: Clear separation of concerns
|
|
||||||
4. **Extensibility**: Add new file types/strategies without touching Core
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🏆 Certification
|
|
||||||
|
|
||||||
This codebase is **CERTIFIED** as a true Hexagonal Architecture implementation:
|
|
||||||
|
|
||||||
- ✅ Adheres to Alistair Cockburn's Ports & Adapters pattern
|
|
||||||
- ✅ Satisfies all SOLID principles
|
|
||||||
- ✅ Maintains proper dependency direction
|
|
||||||
- ✅ Zero Core → Adapter dependencies
|
|
||||||
- ✅ All interfaces in Core, all implementations in Adapters
|
|
||||||
- ✅ Bootstrap handles all dependency injection
|
|
||||||
|
|
||||||
**Compliance Level**: **GOLD STANDARD** ⭐⭐⭐⭐⭐
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
*Last Updated: 2026-01-07*
|
|
||||||
*Architecture Review Status: APPROVED*
|
|
||||||
@ -1,419 +0,0 @@
|
|||||||
# Project Summary: Text Processor - Hexagonal Architecture
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
This is a **production-ready, "Gold Standard" implementation** of a text extraction and chunking system built with **Hexagonal Architecture** (Ports & Adapters pattern).
|
|
||||||
|
|
||||||
## Complete File Structure
|
|
||||||
|
|
||||||
```
|
|
||||||
text_processor_hex/
|
|
||||||
├── README.md # Project documentation
|
|
||||||
├── ARCHITECTURE.md # Detailed architecture guide
|
|
||||||
├── PROJECT_SUMMARY.md # This file
|
|
||||||
├── requirements.txt # Python dependencies
|
|
||||||
├── main.py # FastAPI application entry point
|
|
||||||
├── example_usage.py # Programmatic usage example
|
|
||||||
│
|
|
||||||
└── src/
|
|
||||||
├── __init__.py
|
|
||||||
├── bootstrap.py # Dependency Injection Container
|
|
||||||
│
|
|
||||||
├── core/ # DOMAIN LAYER (Pure Business Logic)
|
|
||||||
│ ├── __init__.py
|
|
||||||
│ ├── domain/
|
|
||||||
│ │ ├── __init__.py
|
|
||||||
│ │ ├── models.py # Rich Pydantic v2 Entities
|
|
||||||
│ │ ├── exceptions.py # Domain Exceptions
|
|
||||||
│ │ └── logic_utils.py # Pure Functions
|
|
||||||
│ ├── ports/
|
|
||||||
│ │ ├── __init__.py
|
|
||||||
│ │ ├── incoming/
|
|
||||||
│ │ │ ├── __init__.py
|
|
||||||
│ │ │ └── text_processor.py # Service Interface (Use Case)
|
|
||||||
│ │ └── outgoing/
|
|
||||||
│ │ ├── __init__.py
|
|
||||||
│ │ ├── extractor.py # Extractor Interface (SPI)
|
|
||||||
│ │ ├── chunker.py # Chunker Interface (SPI)
|
|
||||||
│ │ └── repository.py # Repository Interface (SPI)
|
|
||||||
│ └── services/
|
|
||||||
│ ├── __init__.py
|
|
||||||
│ └── document_processor_service.py # Business Logic Orchestration
|
|
||||||
│
|
|
||||||
├── adapters/ # ADAPTER LAYER (External Concerns)
|
|
||||||
│ ├── __init__.py
|
|
||||||
│ ├── incoming/ # Driving Adapters (HTTP)
|
|
||||||
│ │ ├── __init__.py
|
|
||||||
│ │ ├── api_routes.py # FastAPI Routes
|
|
||||||
│ │ └── api_schemas.py # Pydantic Request/Response Models
|
|
||||||
│ └── outgoing/ # Driven Adapters (Infrastructure)
|
|
||||||
│ ├── __init__.py
|
|
||||||
│ ├── extractors/
|
|
||||||
│ │ ├── __init__.py
|
|
||||||
│ │ ├── base.py # Abstract Base Extractor
|
|
||||||
│ │ ├── pdf_extractor.py # PDF Implementation (PyPDF2)
|
|
||||||
│ │ ├── docx_extractor.py # DOCX Implementation (python-docx)
|
|
||||||
│ │ ├── txt_extractor.py # TXT Implementation (built-in)
|
|
||||||
│ │ └── factory.py # Extractor Factory (Factory Pattern)
|
|
||||||
│ ├── chunkers/
|
|
||||||
│ │ ├── __init__.py
|
|
||||||
│ │ ├── base.py # Abstract Base Chunker
|
|
||||||
│ │ ├── fixed_size_chunker.py # Fixed Size Strategy
|
|
||||||
│ │ ├── paragraph_chunker.py # Paragraph Strategy
|
|
||||||
│ │ └── context.py # Chunking Context (Strategy Pattern)
|
|
||||||
│ └── persistence/
|
|
||||||
│ ├── __init__.py
|
|
||||||
│ └── in_memory_repository.py # In-Memory Repository (Thread-Safe)
|
|
||||||
│
|
|
||||||
└── shared/ # SHARED LAYER (Cross-Cutting)
|
|
||||||
├── __init__.py
|
|
||||||
├── constants.py # Application Constants
|
|
||||||
└── logging_config.py # Logging Configuration
|
|
||||||
```
|
|
||||||
|
|
||||||
## File Count & Statistics
|
|
||||||
|
|
||||||
### Total Files
|
|
||||||
- **42 Python files** (.py)
|
|
||||||
- **3 Documentation files** (.md)
|
|
||||||
- **1 Requirements file** (.txt)
|
|
||||||
- **Total: 46 files**
|
|
||||||
|
|
||||||
### Lines of Code (Approximate)
|
|
||||||
- Core Domain: ~1,200 lines
|
|
||||||
- Adapters: ~1,400 lines
|
|
||||||
- Bootstrap & Main: ~200 lines
|
|
||||||
- Documentation: ~1,000 lines
|
|
||||||
- **Total: ~3,800 lines**
|
|
||||||
|
|
||||||
## Architecture Layers
|
|
||||||
|
|
||||||
### 1. Core Domain (src/core/)
|
|
||||||
**Responsibility**: Pure business logic, no external dependencies
|
|
||||||
|
|
||||||
#### Domain Models (models.py)
|
|
||||||
- `Document`: Rich entity with validation and business methods
|
|
||||||
- `DocumentMetadata`: Value object for file information
|
|
||||||
- `Chunk`: Immutable chunk entity
|
|
||||||
- `ChunkingStrategy`: Strategy configuration
|
|
||||||
|
|
||||||
**Features**:
|
|
||||||
- Pydantic v2 validation
|
|
||||||
- Business methods: `validate_content()`, `get_metadata_summary()`
|
|
||||||
- Immutability where appropriate
|
|
||||||
|
|
||||||
#### Domain Exceptions (exceptions.py)
|
|
||||||
- `DomainException`: Base exception
|
|
||||||
- `ExtractionError`, `ChunkingError`, `ProcessingError`
|
|
||||||
- `ValidationError`, `RepositoryError`
|
|
||||||
- `UnsupportedFileTypeError`, `DocumentNotFoundError`, `EmptyContentError`
|
|
||||||
|
|
||||||
#### Domain Logic Utils (logic_utils.py)
|
|
||||||
Pure functions for text processing:
|
|
||||||
- `normalize_whitespace()`, `clean_text()`
|
|
||||||
- `split_into_sentences()`, `split_into_paragraphs()`
|
|
||||||
- `truncate_to_word_boundary()`
|
|
||||||
- `find_sentence_boundary_before()`
|
|
||||||
|
|
||||||
#### Ports (Interfaces)
|
|
||||||
**Incoming**:
|
|
||||||
- `ITextProcessor`: Service interface (use cases)
|
|
||||||
|
|
||||||
**Outgoing**:
|
|
||||||
- `IExtractor`: Text extraction interface
|
|
||||||
- `IChunker`: Chunking strategy interface
|
|
||||||
- `IDocumentRepository`: Persistence interface
|
|
||||||
|
|
||||||
#### Services (document_processor_service.py)
|
|
||||||
- `DocumentProcessorService`: Orchestrates Extract → Clean → Chunk → Save
|
|
||||||
- Depends ONLY on port interfaces
|
|
||||||
- Implements ITextProcessor
|
|
||||||
|
|
||||||
### 2. Adapters (src/adapters/)
|
|
||||||
**Responsibility**: Connect core to external world
|
|
||||||
|
|
||||||
#### Incoming Adapters (incoming/)
|
|
||||||
**FastAPI HTTP Adapter**:
|
|
||||||
- `api_routes.py`: HTTP endpoints
|
|
||||||
- `api_schemas.py`: Pydantic request/response models
|
|
||||||
- Maps HTTP requests to domain operations
|
|
||||||
- Maps domain exceptions to HTTP status codes
|
|
||||||
|
|
||||||
**Endpoints**:
|
|
||||||
- `POST /api/v1/process`: Process document
|
|
||||||
- `POST /api/v1/extract-and-chunk`: Extract and chunk
|
|
||||||
- `GET /api/v1/documents/{id}`: Get document
|
|
||||||
- `GET /api/v1/documents`: List documents
|
|
||||||
- `DELETE /api/v1/documents/{id}`: Delete document
|
|
||||||
- `GET /api/v1/health`: Health check
|
|
||||||
|
|
||||||
#### Outgoing Adapters (outgoing/)
|
|
||||||
|
|
||||||
**Extractors (extractors/)**:
|
|
||||||
- `base.py`: Template method pattern base class
|
|
||||||
- `pdf_extractor.py`: PDF extraction using PyPDF2
|
|
||||||
- `docx_extractor.py`: DOCX extraction using python-docx
|
|
||||||
- `txt_extractor.py`: Plain text extraction (multi-encoding)
|
|
||||||
- `factory.py`: Factory pattern for extractor selection
|
|
||||||
|
|
||||||
**Chunkers (chunkers/)**:
|
|
||||||
- `base.py`: Template method pattern base class
|
|
||||||
- `fixed_size_chunker.py`: Fixed-size chunks with overlap
|
|
||||||
- `paragraph_chunker.py`: Paragraph-based chunking
|
|
||||||
- `context.py`: Strategy pattern context
|
|
||||||
|
|
||||||
**Persistence (persistence/)**:
|
|
||||||
- `in_memory_repository.py`: Thread-safe in-memory storage
|
|
||||||
|
|
||||||
### 3. Bootstrap (src/bootstrap.py)
|
|
||||||
**Responsibility**: Dependency injection and wiring
|
|
||||||
|
|
||||||
**ApplicationContainer**:
|
|
||||||
- Creates all adapters
|
|
||||||
- Injects dependencies into core
|
|
||||||
- ONLY place where concrete implementations are instantiated
|
|
||||||
- Provides factory method: `create_application()`
|
|
||||||
|
|
||||||
### 4. Shared (src/shared/)
|
|
||||||
**Responsibility**: Cross-cutting concerns
|
|
||||||
|
|
||||||
- `constants.py`: Application constants
|
|
||||||
- `logging_config.py`: Centralized logging setup
|
|
||||||
|
|
||||||
## Design Patterns Implemented
|
|
||||||
|
|
||||||
### 1. Hexagonal Architecture (Ports & Adapters)
|
|
||||||
- Core isolated from external concerns
|
|
||||||
- Dependency inversion at boundaries
|
|
||||||
- Easy to swap implementations
|
|
||||||
|
|
||||||
### 2. Factory Pattern
|
|
||||||
- `ExtractorFactory`: Creates appropriate extractor based on file type
|
|
||||||
- Centralized management
|
|
||||||
- Easy to add new file types
|
|
||||||
|
|
||||||
### 3. Strategy Pattern
|
|
||||||
- `ChunkingContext`: Runtime strategy selection
|
|
||||||
- `FixedSizeChunker`, `ParagraphChunker`
|
|
||||||
- Easy to add new strategies
|
|
||||||
|
|
||||||
### 4. Repository Pattern
|
|
||||||
- `IDocumentRepository`: Abstract persistence
|
|
||||||
- `InMemoryDocumentRepository`: Concrete implementation
|
|
||||||
- Easy to swap storage (memory → DB)
|
|
||||||
|
|
||||||
### 5. Template Method Pattern
|
|
||||||
- `BaseExtractor`: Common extraction workflow
|
|
||||||
- `BaseChunker`: Common chunking workflow
|
|
||||||
- Subclasses fill in specific details
|
|
||||||
|
|
||||||
### 6. Dependency Injection
|
|
||||||
- `ApplicationContainer`: Constructor injection
|
|
||||||
- Loose coupling
|
|
||||||
- Easy testing with mocks
|
|
||||||
|
|
||||||
## SOLID Principles Compliance
|
|
||||||
|
|
||||||
### Single Responsibility Principle ✓
|
|
||||||
- Each class has one reason to change
|
|
||||||
- Each function does ONE thing
|
|
||||||
- Functions are kept to at most 15-20 lines
|
|
||||||
|
|
||||||
### Open/Closed Principle ✓
|
|
||||||
- Open for extension (add extractors, chunkers)
|
|
||||||
- Closed for modification (core unchanged)
|
|
||||||
|
|
||||||
### Liskov Substitution Principle ✓
|
|
||||||
- All IExtractor implementations are interchangeable
|
|
||||||
- All IChunker implementations are interchangeable
|
|
||||||
|
|
||||||
### Interface Segregation Principle ✓
|
|
||||||
- Small, focused interfaces
|
|
||||||
- No fat interfaces
|
|
||||||
|
|
||||||
### Dependency Inversion Principle ✓
|
|
||||||
- Core depends on abstractions (ports)
|
|
||||||
- Core does NOT depend on concrete implementations
|
|
||||||
- High-level modules independent of low-level modules
|
|
||||||
|
|
||||||
## Clean Code Principles
|
|
||||||
|
|
||||||
### DRY (Don't Repeat Yourself) ✓
|
|
||||||
- Base classes for common functionality
|
|
||||||
- Pure functions for reusable logic
|
|
||||||
- No code duplication
|
|
||||||
|
|
||||||
### KISS (Keep It Simple, Stupid) ✓
|
|
||||||
- Simple, readable solutions
|
|
||||||
- No over-engineering
|
|
||||||
- Clear naming
|
|
||||||
|
|
||||||
### YAGNI (You Aren't Gonna Need It) ✓
|
|
||||||
- Implements only required features
|
|
||||||
- No speculative generality
|
|
||||||
- Focused on current needs
|
|
||||||
|
|
||||||
## Type Safety
|
|
||||||
|
|
||||||
- **100% type hints** on all functions
|
|
||||||
- Python 3.10+ type annotations
|
|
||||||
- Pydantic for runtime validation
|
|
||||||
- Mypy compatible
|
|
||||||
|
|
||||||
## Documentation Standards
|
|
||||||
|
|
||||||
- **Google-style docstrings** on all public APIs
|
|
||||||
- Module-level documentation
|
|
||||||
- Inline comments for complex logic
|
|
||||||
- Architecture documentation
|
|
||||||
- Usage examples
|
|
||||||
|
|
||||||
## Testing Strategy
|
|
||||||
|
|
||||||
### Unit Tests
|
|
||||||
- Test domain models in isolation
|
|
||||||
- Test pure functions
|
|
||||||
- Test services with mocks
|
|
||||||
|
|
||||||
### Integration Tests
|
|
||||||
- Test extractors with real files
|
|
||||||
- Test chunkers with real text
|
|
||||||
- Test repository operations
|
|
||||||
|
|
||||||
### API Tests
|
|
||||||
- Test FastAPI endpoints
|
|
||||||
- Test error scenarios
|
|
||||||
- Test complete workflows
|
|
||||||
|
|
||||||
## Error Handling
|
|
||||||
|
|
||||||
### Domain Exceptions
|
|
||||||
- All external errors wrapped in domain exceptions
|
|
||||||
- Rich error context (file path, operation, details)
|
|
||||||
- Hierarchical exception structure
|
|
||||||
|
|
||||||
### HTTP Error Mapping
|
|
||||||
- 400: Invalid request, unsupported file type
|
|
||||||
- 404: Document not found
|
|
||||||
- 422: Extraction/chunking failed
|
|
||||||
- 500: Internal processing error
|
|
||||||
|
|
||||||
## Extensibility
|
|
||||||
|
|
||||||
### Adding New File Type (Example: HTML)
|
|
||||||
1. Create `html_extractor.py` extending `BaseExtractor`
|
|
||||||
2. Register in `bootstrap.py`: `factory.register_extractor(HTMLExtractor())`
|
|
||||||
3. Done! No changes to core required
|
|
||||||
|
|
||||||
### Adding New Chunking Strategy (Example: Sentence)
|
|
||||||
1. Create `sentence_chunker.py` extending `BaseChunker`
|
|
||||||
2. Register in `bootstrap.py`: `context.register_chunker(SentenceChunker())`
|
|
||||||
3. Done! No changes to core required
|
|
||||||
|
|
||||||
### Swapping Storage (Example: PostgreSQL)
|
|
||||||
1. Create `postgres_repository.py` implementing `IDocumentRepository`
|
|
||||||
2. Swap in `bootstrap.py`: `return PostgresDocumentRepository(...)`
|
|
||||||
3. Done! No changes to core or API required
|
|
||||||
|
|
||||||
## Dependencies
|
|
||||||
|
|
||||||
### Production
|
|
||||||
- `pydantic==2.10.5`: Data validation and models
|
|
||||||
- `fastapi==0.115.6`: Web framework
|
|
||||||
- `uvicorn==0.34.0`: ASGI server
|
|
||||||
- `PyPDF2==3.0.1`: PDF extraction
|
|
||||||
- `python-docx==1.1.2`: DOCX extraction
|
|
||||||
|
|
||||||
### Development
|
|
||||||
- `pytest==8.3.4`: Testing framework
|
|
||||||
- `black==24.10.0`: Code formatting
|
|
||||||
- `ruff==0.8.5`: Linting
|
|
||||||
- `mypy==1.14.0`: Type checking
|
|
||||||
|
|
||||||
## Running the Application
|
|
||||||
|
|
||||||
### Install Dependencies
|
|
||||||
```bash
|
|
||||||
pip install -r requirements.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
### Run FastAPI Server
|
|
||||||
```bash
|
|
||||||
python main.py
|
|
||||||
# or
|
|
||||||
uvicorn main:app --reload
|
|
||||||
```
|
|
||||||
|
|
||||||
### Run Example Script
|
|
||||||
```bash
|
|
||||||
python example_usage.py
|
|
||||||
```
|
|
||||||
|
|
||||||
### Access API Documentation
|
|
||||||
- Swagger UI: http://localhost:8000/docs
|
|
||||||
- ReDoc: http://localhost:8000/redoc
|
|
||||||
|
|
||||||
## Key Achievements
|
|
||||||
|
|
||||||
### Architecture
|
|
||||||
✓ Pure hexagonal architecture implementation
|
|
||||||
✓ Zero circular dependencies
|
|
||||||
✓ Core completely isolated from adapters
|
|
||||||
✓ Perfect dependency inversion
|
|
||||||
|
|
||||||
### Code Quality
|
|
||||||
✓ 100% type-hinted
|
|
||||||
✓ Google-style docstrings on all APIs
|
|
||||||
✓ Functions ≤ 15-20 lines
|
|
||||||
✓ DRY, KISS, YAGNI principles
|
|
||||||
|
|
||||||
### Design Patterns
|
|
||||||
✓ 6 patterns implemented correctly
|
|
||||||
✓ Factory for extractors
|
|
||||||
✓ Strategy for chunkers
|
|
||||||
✓ Repository for persistence
|
|
||||||
✓ Template method for base classes
|
|
||||||
|
|
||||||
### SOLID Principles
|
|
||||||
✓ All 5 principles demonstrated
|
|
||||||
✓ Single Responsibility throughout
|
|
||||||
✓ Open/Closed via interfaces
|
|
||||||
✓ Dependency Inversion at boundaries
|
|
||||||
|
|
||||||
### Features
|
|
||||||
✓ Multiple file type support (PDF, DOCX, TXT)
|
|
||||||
✓ Multiple chunking strategies
|
|
||||||
✓ Rich domain models with validation
|
|
||||||
✓ Comprehensive error handling
|
|
||||||
✓ Thread-safe repository
|
|
||||||
✓ RESTful API with FastAPI
|
|
||||||
✓ Complete documentation
|
|
||||||
|
|
||||||
## Next Steps (Future Enhancements)
|
|
||||||
|
|
||||||
1. **Database Persistence**: PostgreSQL/MongoDB repository
|
|
||||||
2. **Async Processing**: Async extractors and chunkers
|
|
||||||
3. **Caching**: Redis for frequently accessed documents
|
|
||||||
4. **More Strategies**: Sentence-based, semantic chunking
|
|
||||||
5. **Batch Processing**: Process multiple documents at once
|
|
||||||
6. **Search**: Full-text search integration
|
|
||||||
7. **Monitoring**: Structured logging, metrics, APM
|
|
||||||
8. **Testing**: Add comprehensive test suite
|
|
||||||
|
|
||||||
## Conclusion
|
|
||||||
|
|
||||||
This implementation represents a **"Gold Standard"** hexagonal architecture:
|
|
||||||
|
|
||||||
- **Clean**: Clear separation of concerns
|
|
||||||
- **Testable**: Easy to mock and test
|
|
||||||
- **Flexible**: Easy to extend and modify
|
|
||||||
- **Maintainable**: Well-documented and organized
|
|
||||||
- **Production-Ready**: Error handling, logging, type safety
|
|
||||||
|
|
||||||
The architecture allows you to:
|
|
||||||
- Add new file types without touching core logic
|
|
||||||
- Swap storage implementations with a one-line change
|
|
||||||
- Add new chunking algorithms independently
|
|
||||||
- Test business logic without any infrastructure
|
|
||||||
- Scale horizontally or vertically as needed
|
|
||||||
|
|
||||||
This is how professional, enterprise-grade software should be built.
|
|
||||||
256
QUICK_START.md
256
QUICK_START.md
@ -1,256 +0,0 @@
|
|||||||
# Quick Start Guide
|
|
||||||
|
|
||||||
## Installation
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Navigate to project directory
|
|
||||||
cd text_processor_hex
|
|
||||||
|
|
||||||
# Create virtual environment
|
|
||||||
python -m venv venv
|
|
||||||
|
|
||||||
# Activate virtual environment
|
|
||||||
source venv/bin/activate # On Windows: venv\Scripts\activate
|
|
||||||
|
|
||||||
# Install dependencies
|
|
||||||
pip install -r requirements.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
## Run the Application
|
|
||||||
|
|
||||||
### Option 1: FastAPI Server
|
|
||||||
```bash
|
|
||||||
python main.py
|
|
||||||
```
|
|
||||||
Then visit: http://localhost:8000/docs
|
|
||||||
|
|
||||||
### Option 2: Programmatic Usage
|
|
||||||
```bash
|
|
||||||
python example_usage.py
|
|
||||||
```
|
|
||||||
|
|
||||||
## Basic Usage Examples
|
|
||||||
|
|
||||||
### 1. Using the API (cURL)
|
|
||||||
|
|
||||||
**Process a Document:**
|
|
||||||
```bash
|
|
||||||
curl -X POST "http://localhost:8000/api/v1/process" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"file_path": "/path/to/document.pdf",
|
|
||||||
"chunking_strategy": {
|
|
||||||
"strategy_name": "fixed_size",
|
|
||||||
"chunk_size": 1000,
|
|
||||||
"overlap_size": 100,
|
|
||||||
"respect_boundaries": true
|
|
||||||
}
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Extract and Chunk:**
|
|
||||||
```bash
|
|
||||||
curl -X POST "http://localhost:8000/api/v1/extract-and-chunk" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"file_path": "/path/to/document.pdf",
|
|
||||||
"chunking_strategy": {
|
|
||||||
"strategy_name": "paragraph",
|
|
||||||
"chunk_size": 1000,
|
|
||||||
"overlap_size": 0,
|
|
||||||
"respect_boundaries": true
|
|
||||||
}
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Get Document:**
|
|
||||||
```bash
|
|
||||||
curl -X GET "http://localhost:8000/api/v1/documents/{document_id}"
|
|
||||||
```
|
|
||||||
|
|
||||||
**List Documents:**
|
|
||||||
```bash
|
|
||||||
curl -X GET "http://localhost:8000/api/v1/documents?limit=10&offset=0"
|
|
||||||
```
|
|
||||||
|
|
||||||
**Delete Document:**
|
|
||||||
```bash
|
|
||||||
curl -X DELETE "http://localhost:8000/api/v1/documents/{document_id}"
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Using Python Code
|
|
||||||
|
|
||||||
```python
|
|
||||||
from pathlib import Path
|
|
||||||
from src.bootstrap import create_application
|
|
||||||
from src.core.domain.models import ChunkingStrategy
|
|
||||||
|
|
||||||
# Initialize
|
|
||||||
container = create_application()
|
|
||||||
service = container.text_processor_service
|
|
||||||
|
|
||||||
# Process a PDF
|
|
||||||
strategy = ChunkingStrategy(
|
|
||||||
strategy_name="fixed_size",
|
|
||||||
chunk_size=1000,
|
|
||||||
overlap_size=100,
|
|
||||||
respect_boundaries=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
document = service.process_document(
|
|
||||||
file_path=Path("example.pdf"),
|
|
||||||
chunking_strategy=strategy,
|
|
||||||
)
|
|
||||||
|
|
||||||
print(f"Document ID: {document.id}")
|
|
||||||
print(f"Metadata: {document.get_metadata_summary()}")
|
|
||||||
|
|
||||||
# Extract and chunk
|
|
||||||
chunks = service.extract_and_chunk(
|
|
||||||
file_path=Path("example.pdf"),
|
|
||||||
chunking_strategy=strategy,
|
|
||||||
)
|
|
||||||
|
|
||||||
for chunk in chunks:
|
|
||||||
print(f"Chunk {chunk.sequence_number}: {chunk.get_length()} chars")
|
|
||||||
```
|
|
||||||
|
|
||||||
## Available Chunking Strategies
|
|
||||||
|
|
||||||
### 1. Fixed Size
|
|
||||||
Splits text into equal-sized chunks with optional overlap.
|
|
||||||
|
|
||||||
```python
|
|
||||||
ChunkingStrategy(
|
|
||||||
strategy_name="fixed_size",
|
|
||||||
chunk_size=1000, # Target size in characters
|
|
||||||
overlap_size=100, # Overlap between chunks
|
|
||||||
respect_boundaries=True # Try to break at sentences
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Paragraph
|
|
||||||
Splits text by paragraph boundaries, combining paragraphs to reach target size.
|
|
||||||
|
|
||||||
```python
|
|
||||||
ChunkingStrategy(
|
|
||||||
strategy_name="paragraph",
|
|
||||||
chunk_size=1000,
|
|
||||||
overlap_size=0,
|
|
||||||
respect_boundaries=True
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Supported File Types
|
|
||||||
|
|
||||||
- **PDF** (.pdf) - using PyPDF2
|
|
||||||
- **DOCX** (.docx) - using python-docx
|
|
||||||
- **Text** (.txt, .md, .text) - native Python
|
|
||||||
|
|
||||||
## Project Structure
|
|
||||||
|
|
||||||
```
|
|
||||||
text_processor_hex/
|
|
||||||
├── main.py # FastAPI entry point
|
|
||||||
├── example_usage.py # Usage examples
|
|
||||||
├── requirements.txt # Dependencies
|
|
||||||
│
|
|
||||||
└── src/
|
|
||||||
├── core/ # Business logic (NO external dependencies)
|
|
||||||
│ ├── domain/ # Models, exceptions, logic
|
|
||||||
│ ├── ports/ # Interface definitions
|
|
||||||
│ └── services/ # Orchestration
|
|
||||||
│
|
|
||||||
├── adapters/ # External integrations
|
|
||||||
│ ├── incoming/ # FastAPI routes
|
|
||||||
│ └── outgoing/ # Extractors, chunkers, storage
|
|
||||||
│
|
|
||||||
├── shared/ # Utilities
|
|
||||||
└── bootstrap.py # Dependency injection
|
|
||||||
```
|
|
||||||
|
|
||||||
## Common Tasks
|
|
||||||
|
|
||||||
### Add a New File Type
|
|
||||||
1. Create extractor in `src/adapters/outgoing/extractors/`
|
|
||||||
2. Extend `BaseExtractor`
|
|
||||||
3. Register in `bootstrap.py`
|
|
||||||
|
|
||||||
### Add a New Chunking Strategy
|
|
||||||
1. Create chunker in `src/adapters/outgoing/chunkers/`
|
|
||||||
2. Extend `BaseChunker`
|
|
||||||
3. Register in `bootstrap.py`
|
|
||||||
|
|
||||||
### Change Storage
|
|
||||||
1. Implement `IDocumentRepository` interface
|
|
||||||
2. Swap implementation in `bootstrap.py`
|
|
||||||
|
|
||||||
## Testing
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Run example
|
|
||||||
python example_usage.py
|
|
||||||
|
|
||||||
# Test API with curl
|
|
||||||
curl http://localhost:8000/api/v1/health
|
|
||||||
|
|
||||||
# Check API docs
|
|
||||||
# Visit: http://localhost:8000/docs
|
|
||||||
```
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
### Import Errors
|
|
||||||
```bash
|
|
||||||
# Make sure you're in the right directory
|
|
||||||
cd text_processor_hex
|
|
||||||
|
|
||||||
# Activate virtual environment
|
|
||||||
source venv/bin/activate
|
|
||||||
```
|
|
||||||
|
|
||||||
### Missing Dependencies
|
|
||||||
```bash
|
|
||||||
pip install -r requirements.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
### File Not Found Errors
|
|
||||||
Use absolute paths for `file_path` in API requests:
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"file_path": "/absolute/path/to/file.pdf"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Architecture Highlights
|
|
||||||
|
|
||||||
**Hexagonal Architecture:**
|
|
||||||
- Core business logic is isolated
|
|
||||||
- Easy to test without infrastructure
|
|
||||||
- Easy to swap implementations
|
|
||||||
|
|
||||||
**Design Patterns:**
|
|
||||||
- Factory: ExtractorFactory selects extractor by file type
|
|
||||||
- Strategy: ChunkingContext selects chunking strategy
|
|
||||||
- Repository: Abstract data storage
|
|
||||||
- Dependency Injection: All dependencies injected via bootstrap
|
|
||||||
|
|
||||||
**SOLID Principles:**
|
|
||||||
- Single Responsibility: Each class does one thing
|
|
||||||
- Open/Closed: Add features without modifying core
|
|
||||||
- Dependency Inversion: Core depends on abstractions
|
|
||||||
|
|
||||||
## Next Steps
|
|
||||||
|
|
||||||
1. Read `README.md` for detailed documentation
|
|
||||||
2. Read `ARCHITECTURE.md` for architecture details
|
|
||||||
3. Run `example_usage.py` to see it in action
|
|
||||||
4. Explore the code starting from `bootstrap.py`
|
|
||||||
5. Try the API using the Swagger docs at `/docs`
|
|
||||||
|
|
||||||
## Need Help?
|
|
||||||
|
|
||||||
- Check `README.md` for detailed docs
|
|
||||||
- Check `ARCHITECTURE.md` for architecture diagrams
|
|
||||||
- Check `PROJECT_SUMMARY.md` for complete overview
|
|
||||||
- Look at `example_usage.py` for usage patterns
|
|
||||||
157
example_usage.py
157
example_usage.py
@ -1,157 +0,0 @@
|
|||||||
"""
|
|
||||||
Example Usage Script - Demonstrates how to use the Text Processor.
|
|
||||||
|
|
||||||
This script shows how to use the text processor programmatically
|
|
||||||
without going through the HTTP API.
|
|
||||||
"""
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from src.bootstrap import create_application
|
|
||||||
from src.core.domain.models import ChunkingStrategy
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
"""Main example function."""
|
|
||||||
print("=" * 70)
|
|
||||||
print("Text Processor - Hexagonal Architecture Example")
|
|
||||||
print("=" * 70)
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Step 1: Create application container with dependency injection
|
|
||||||
print("1. Initializing application container...")
|
|
||||||
container = create_application(log_level="INFO")
|
|
||||||
service = container.text_processor_service
|
|
||||||
print(" ✓ Container initialized\n")
|
|
||||||
|
|
||||||
# Step 2: Create a sample text file for demonstration
|
|
||||||
print("2. Creating sample text file...")
|
|
||||||
sample_text = """
|
|
||||||
The Hexagonal Architecture Pattern
|
|
||||||
|
|
||||||
Introduction
|
|
||||||
Hexagonal Architecture, also known as Ports and Adapters, is a software design
|
|
||||||
pattern that aims to create loosely coupled application components. The pattern
|
|
||||||
was invented by Alistair Cockburn in 2005.
|
|
||||||
|
|
||||||
Core Concepts
|
|
||||||
The main idea is to isolate the core business logic from external concerns like
|
|
||||||
databases, user interfaces, and external services. This is achieved through the
|
|
||||||
use of ports and adapters.
|
|
||||||
|
|
||||||
Ports are interfaces that define how the application core interacts with the
|
|
||||||
outside world. Adapters are implementations of these ports that connect the
|
|
||||||
application to specific technologies.
|
|
||||||
|
|
||||||
Benefits
|
|
||||||
The benefits of this architecture include improved testability, flexibility,
|
|
||||||
and maintainability. By isolating the core logic, we can easily swap
|
|
||||||
implementations without affecting the business rules.
|
|
||||||
|
|
||||||
Conclusion
|
|
||||||
Hexagonal Architecture is a powerful pattern for building maintainable and
|
|
||||||
flexible applications. It promotes clean separation of concerns and makes
|
|
||||||
testing much easier.
|
|
||||||
"""
|
|
||||||
|
|
||||||
sample_file = Path("sample_document.txt")
|
|
||||||
sample_file.write_text(sample_text.strip())
|
|
||||||
print(f" ✓ Created sample file: {sample_file}\n")
|
|
||||||
|
|
||||||
# Step 3: Process document with fixed-size chunking
|
|
||||||
print("3. Processing document with FIXED SIZE strategy...")
|
|
||||||
fixed_strategy = ChunkingStrategy(
|
|
||||||
strategy_name="fixed_size",
|
|
||||||
chunk_size=300,
|
|
||||||
overlap_size=50,
|
|
||||||
respect_boundaries=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
document = service.process_document(
|
|
||||||
file_path=sample_file,
|
|
||||||
chunking_strategy=fixed_strategy,
|
|
||||||
)
|
|
||||||
|
|
||||||
print(f" Document ID: {document.id}")
|
|
||||||
print(f" Metadata: {document.get_metadata_summary()}")
|
|
||||||
print(f" Processed: {document.is_processed}")
|
|
||||||
print(f" Content length: {len(document.content)} characters")
|
|
||||||
print(f" Preview: {document.get_content_preview(100)}...\n")
|
|
||||||
|
|
||||||
# Step 4: Extract and chunk with paragraph strategy
|
|
||||||
print("4. Extracting and chunking with PARAGRAPH strategy...")
|
|
||||||
paragraph_strategy = ChunkingStrategy(
|
|
||||||
strategy_name="paragraph",
|
|
||||||
chunk_size=500,
|
|
||||||
overlap_size=0,
|
|
||||||
respect_boundaries=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
chunks = service.extract_and_chunk(
|
|
||||||
file_path=sample_file,
|
|
||||||
chunking_strategy=paragraph_strategy,
|
|
||||||
)
|
|
||||||
|
|
||||||
print(f" ✓ Created {len(chunks)} chunks\n")
|
|
||||||
|
|
||||||
# Display chunk information
|
|
||||||
print(" Chunk Details:")
|
|
||||||
print(" " + "-" * 66)
|
|
||||||
for i, chunk in enumerate(chunks[:3], 1): # Show first 3 chunks
|
|
||||||
print(f" Chunk #{chunk.sequence_number}")
|
|
||||||
print(f" - Length: {chunk.get_length()} characters")
|
|
||||||
print(f" - Position: {chunk.start_char} to {chunk.end_char}")
|
|
||||||
print(f" - Preview: {chunk.content[:80]}...")
|
|
||||||
print(" " + "-" * 66)
|
|
||||||
|
|
||||||
if len(chunks) > 3:
|
|
||||||
print(f" ... and {len(chunks) - 3} more chunks\n")
|
|
||||||
|
|
||||||
# Step 5: Retrieve the document
|
|
||||||
print("5. Retrieving document from repository...")
|
|
||||||
retrieved = service.get_document(document.id)
|
|
||||||
print(f" ✓ Retrieved document: {retrieved.id}")
|
|
||||||
print(f" ✓ Content matches: {retrieved.content == document.content}\n")
|
|
||||||
|
|
||||||
# Step 6: List all documents
|
|
||||||
print("6. Listing all documents...")
|
|
||||||
all_docs = service.list_documents(limit=10)
|
|
||||||
print(f" ✓ Found {len(all_docs)} document(s) in repository")
|
|
||||||
for doc in all_docs:
|
|
||||||
print(f" - {doc.metadata.file_name} ({doc.metadata.file_type})")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Step 7: Delete the document
|
|
||||||
print("7. Cleaning up - deleting document...")
|
|
||||||
deleted = service.delete_document(document.id)
|
|
||||||
print(f" ✓ Document deleted: {deleted}\n")
|
|
||||||
|
|
||||||
# Verify deletion
|
|
||||||
remaining = service.list_documents()
|
|
||||||
print(f" ✓ Remaining documents: {len(remaining)}\n")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f" ✗ Error: {str(e)}\n")
|
|
||||||
raise
|
|
||||||
|
|
||||||
finally:
|
|
||||||
# Clean up sample file
|
|
||||||
if sample_file.exists():
|
|
||||||
sample_file.unlink()
|
|
||||||
print(f" ✓ Cleaned up sample file\n")
|
|
||||||
|
|
||||||
print("=" * 70)
|
|
||||||
print("Example completed successfully!")
|
|
||||||
print("=" * 70)
|
|
||||||
print()
|
|
||||||
print("Key Takeaways:")
|
|
||||||
print("1. Core domain is completely isolated from adapters")
|
|
||||||
print("2. Dependencies are injected through bootstrap")
|
|
||||||
print("3. Easy to swap implementations (strategies, extractors)")
|
|
||||||
print("4. Rich domain models with built-in validation")
|
|
||||||
print("5. Clear separation between API models and domain models")
|
|
||||||
print()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
99
main.py
99
main.py
@ -1,110 +1,17 @@
|
|||||||
"""
|
"""
|
||||||
Main Application Entry Point.
|
Main Application Entry Point.
|
||||||
|
|
||||||
This module creates and runs the FastAPI application.
|
This module imports the FastAPI app directly from the routes module
|
||||||
|
and runs it via uvicorn.
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
from contextlib import asynccontextmanager
|
|
||||||
|
|
||||||
from fastapi import FastAPI
|
from src.adapters.incoming.api_routes import app
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
|
||||||
|
|
||||||
from src.bootstrap import create_application
|
|
||||||
from src.shared.constants import (
|
|
||||||
API_DESCRIPTION,
|
|
||||||
API_DOCS_URL,
|
|
||||||
API_PREFIX,
|
|
||||||
API_REDOC_URL,
|
|
||||||
API_TITLE,
|
|
||||||
APP_VERSION,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Application container (created on startup)
|
|
||||||
app_container = None
|
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan manager.

    Builds the application container before the app starts serving and
    clears the module-level reference again when the app stops.

    Args:
        app: FastAPI application this lifespan is attached to (not used
            inside the function body).

    Yields:
        None: control is handed back to the server while the app runs.
    """
    global app_container

    # Startup
    logger.info("Starting up application...")

    # Create application container with dependency injection
    app_container = create_application(log_level="INFO")
    logger.info("Application started successfully")

    try:
        yield
    finally:
        # Shutdown: runs even when an exception is thrown into the generator
        # at the yield, so the container reference is always released.
        logger.info("Shutting down application...")
        app_container = None
        logger.info("Application shut down")
|
|
||||||
|
|
||||||
|
|
||||||
# Create FastAPI application
|
|
||||||
app = FastAPI(
|
|
||||||
title=API_TITLE,
|
|
||||||
description=API_DESCRIPTION,
|
|
||||||
version=APP_VERSION,
|
|
||||||
docs_url=API_DOCS_URL,
|
|
||||||
redoc_url=API_REDOC_URL,
|
|
||||||
lifespan=lifespan,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add CORS middleware
|
|
||||||
app.add_middleware(
|
|
||||||
CORSMiddleware,
|
|
||||||
allow_origins=["*"], # Configure appropriately for production
|
|
||||||
allow_credentials=True,
|
|
||||||
allow_methods=["*"],
|
|
||||||
allow_headers=["*"],
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@app.on_event("startup")
async def setup_routes():
    """
    Attach the incoming adapter's router to the app on startup.

    Does nothing when the application container has not been created.

    NOTE(review): `app` is constructed with `lifespan=lifespan`; FastAPI
    documents that `on_event` handlers are not called when a lifespan is
    supplied -- confirm this handler actually runs.
    """
    if not app_container:
        return

    # Include the API routes from the incoming adapter
    api_router = app_container.api.router
    app.include_router(api_router, prefix=API_PREFIX, tags=["Text Processing"])
    logger.info(f"API routes registered at {API_PREFIX}")
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/")
async def root():
    """Return the API's name, version, description, docs URL and prefix."""
    return dict(
        name=API_TITLE,
        version=APP_VERSION,
        description=API_DESCRIPTION,
        docs_url=API_DOCS_URL,
        api_prefix=API_PREFIX,
    )
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/health")
async def health_check():
    """Report a static "healthy" status together with the app version."""
    return dict(status="healthy", version=APP_VERSION)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import uvicorn
|
import uvicorn
|
||||||
|
|
||||||
|
|||||||
@ -6,10 +6,6 @@ pydantic-settings==2.7.1
|
|||||||
fastapi==0.115.6
|
fastapi==0.115.6
|
||||||
uvicorn[standard]==0.34.0
|
uvicorn[standard]==0.34.0
|
||||||
|
|
||||||
# Document Processing
|
|
||||||
PyPDF2==3.0.1
|
|
||||||
python-docx==1.1.2
|
|
||||||
|
|
||||||
# Utilities
|
# Utilities
|
||||||
python-multipart==0.0.20
|
python-multipart==0.0.20
|
||||||
|
|
||||||
|
|||||||
@ -1,15 +1,14 @@
|
|||||||
"""
|
"""
|
||||||
API Routes - FastAPI routes for text processing operations.
|
API Routes - Functional FastAPI routes for text processing.
|
||||||
|
|
||||||
This is the incoming adapter that translates HTTP requests into
|
This is the incoming adapter that translates HTTP requests into
|
||||||
use case calls.
|
domain operations. Routes pull the service directly from bootstrap.
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
|
||||||
from uuid import UUID
|
from uuid import UUID
|
||||||
|
|
||||||
from fastapi import APIRouter, HTTPException, status
|
from fastapi import APIRouter, FastAPI, HTTPException, status
|
||||||
|
|
||||||
from ...core.domain.exceptions import (
|
from ...core.domain.exceptions import (
|
||||||
ChunkingError,
|
ChunkingError,
|
||||||
@ -19,15 +18,13 @@ from ...core.domain.exceptions import (
|
|||||||
ProcessingError,
|
ProcessingError,
|
||||||
UnsupportedFileTypeError,
|
UnsupportedFileTypeError,
|
||||||
)
|
)
|
||||||
from ...core.domain.models import Chunk, ChunkingStrategy, Document
|
from ...core.domain.models import ChunkingStrategy
|
||||||
from ...core.ports.incoming.text_processor import ITextProcessor
|
from ...core.ports.incoming.text_processor import ITextProcessor
|
||||||
from .api_schemas import (
|
from .api_schemas import (
|
||||||
ChunkResponse,
|
ChunkResponse,
|
||||||
DeleteDocumentResponse,
|
DeleteDocumentResponse,
|
||||||
DocumentListResponse,
|
DocumentListResponse,
|
||||||
DocumentMetadataResponse,
|
|
||||||
DocumentResponse,
|
DocumentResponse,
|
||||||
ErrorResponse,
|
|
||||||
ExtractAndChunkRequest,
|
ExtractAndChunkRequest,
|
||||||
ExtractAndChunkResponse,
|
ExtractAndChunkResponse,
|
||||||
HealthCheckResponse,
|
HealthCheckResponse,
|
||||||
@ -39,292 +36,43 @@ from .api_schemas import (
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class TextProcessorAPI:
|
# Create FastAPI application
|
||||||
"""
|
app = FastAPI(
|
||||||
FastAPI routes for text processing.
|
title="Text Processor API",
|
||||||
|
description="Text extraction and chunking system using Hexagonal Architecture",
|
||||||
This adapter translates HTTP requests into domain operations
|
|
||||||
and handles error mapping to HTTP responses.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, text_processor: ITextProcessor) -> None:
    """
    Store the service, create the router and register every route on it.

    Args:
        text_processor: Text processor service (incoming port)
    """
    self.text_processor, self.router = text_processor, APIRouter()
    # Routes are bound to this instance's methods, so the service must be
    # stored before registration happens.
    self._register_routes()
    logger.info("TextProcessorAPI initialized")
|
|
||||||
|
|
||||||
def _register_routes(self) -> None:
    """Register every endpoint of this adapter on ``self.router``."""
    ok = status.HTTP_200_OK

    # One row per endpoint, in registration order:
    # (path, handler, HTTP method, response model, status code, summary,
    #  description)
    route_table = (
        (
            "/process",
            self.process_document,
            "POST",
            ProcessDocumentResponse,
            status.HTTP_201_CREATED,
            "Process a document",
            "Extract text from document and store it",
        ),
        (
            "/extract-and-chunk",
            self.extract_and_chunk,
            "POST",
            ExtractAndChunkResponse,
            ok,
            "Extract and chunk document",
            "Extract text and split into chunks",
        ),
        (
            "/documents/{document_id}",
            self.get_document,
            "GET",
            DocumentResponse,
            ok,
            "Get document by ID",
            "Retrieve a processed document",
        ),
        (
            "/documents",
            self.list_documents,
            "GET",
            DocumentListResponse,
            ok,
            "List all documents",
            "Retrieve all documents with pagination",
        ),
        (
            "/documents/{document_id}",
            self.delete_document,
            "DELETE",
            DeleteDocumentResponse,
            ok,
            "Delete document",
            "Delete a document by ID",
        ),
        (
            "/health",
            self.health_check,
            "GET",
            HealthCheckResponse,
            ok,
            "Health check",
            "Check API health and configuration",
        ),
    )

    for path, handler, method, model, code, summary, description in route_table:
        self.router.add_api_route(
            path,
            handler,
            methods=[method],
            response_model=model,
            status_code=code,
            summary=summary,
            description=description,
        )
|
|
||||||
|
|
||||||
async def process_document(
    self,
    request: ProcessDocumentRequest,
) -> ProcessDocumentResponse:
    """
    Process a document endpoint.

    Converts the request into domain inputs, runs the processing use case
    and wraps the resulting document in an API response.

    Args:
        request: Processing request with file path and strategy

    Returns:
        Processing response with document details

    Raises:
        HTTPException: Mapped from a DomainException, or a 500 for any
            other failure
    """
    try:
        # Convert request to domain models
        # NOTE(review): file_path comes straight from the request body;
        # confirm the service restricts which paths may be read.
        file_path = Path(request.file_path)
        strategy = self._to_domain_strategy(request.chunking_strategy)

        # Execute use case
        document = self.text_processor.process_document(file_path, strategy)

        # Convert to response
        return ProcessDocumentResponse(
            document=self._to_document_response(document)
        )

    except DomainException as e:
        raise self._map_domain_exception(e) from e
    except Exception as e:
        # logger.exception records the traceback, which logger.error dropped.
        logger.exception("Unexpected error processing document: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
|
|
||||||
|
|
||||||
async def extract_and_chunk(
    self,
    request: ExtractAndChunkRequest,
) -> ExtractAndChunkResponse:
    """
    Extract and chunk document endpoint.

    Runs the extract-and-chunk use case and returns the chunks together
    with their count.

    Args:
        request: Extract and chunk request

    Returns:
        Response with chunks

    Raises:
        HTTPException: Mapped from a DomainException, or a 500 for any
            other failure
    """
    try:
        # Convert request to domain models
        # NOTE(review): file_path comes straight from the request body;
        # confirm the service restricts which paths may be read.
        file_path = Path(request.file_path)
        strategy = self._to_domain_strategy(request.chunking_strategy)

        # Execute use case
        chunks = self.text_processor.extract_and_chunk(file_path, strategy)

        # Convert to response
        chunk_responses = [self._to_chunk_response(c) for c in chunks]

        return ExtractAndChunkResponse(
            chunks=chunk_responses,
            total_chunks=len(chunk_responses),
        )

    except DomainException as e:
        raise self._map_domain_exception(e) from e
    except Exception as e:
        # logger.exception records the traceback, which logger.error dropped.
        logger.exception("Unexpected error extracting and chunking: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
|
|
||||||
|
|
||||||
async def get_document(self, document_id: str) -> DocumentResponse:
    """
    Get document by ID endpoint.

    Args:
        document_id: UUID of the document

    Returns:
        Document response

    Raises:
        HTTPException: 400 if the ID is not a valid UUID, 404 if the
            document is not found, 500 for any other failure
    """
    # Parse the ID on its own so that only a malformed ID yields 400. A
    # ValueError raised later (e.g. while building the response) must not
    # be reported as a bad document ID.
    try:
        doc_uuid = UUID(document_id)
    except ValueError as e:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Invalid document ID format: {document_id}",
        ) from e

    try:
        document = self.text_processor.get_document(doc_uuid)
        return self._to_document_response(document)

    except DocumentNotFoundError as e:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=str(e),
        ) from e
    except Exception as e:
        # logger.exception records the traceback, which logger.error dropped.
        logger.exception("Unexpected error retrieving document: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
|
|
||||||
|
|
||||||
async def list_documents(
    self,
    limit: int = 100,
    offset: int = 0,
) -> DocumentListResponse:
    """
    List documents endpoint.

    Args:
        limit: Maximum number of documents to return
        offset: Number of documents to skip

    Returns:
        List of documents with pagination info

    Raises:
        HTTPException: 500 if listing or response conversion fails
    """
    try:
        documents = self.text_processor.list_documents(limit, offset)
        doc_responses = [self._to_document_response(d) for d in documents]

        # NOTE(review): total is the number of documents in this page, not
        # a repository-wide count -- confirm that is what clients expect.
        return DocumentListResponse(
            documents=doc_responses,
            total=len(doc_responses),
            limit=limit,
            offset=offset,
        )

    except Exception as e:
        # logger.exception records the traceback, which logger.error dropped.
        logger.exception("Unexpected error listing documents: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
|
|
||||||
|
|
||||||
async def delete_document(self, document_id: str) -> DeleteDocumentResponse:
    """
    Delete document endpoint.

    Args:
        document_id: UUID of the document

    Returns:
        Deletion response whose message reflects the service's result

    Raises:
        HTTPException: 400 if the ID is not a valid UUID, 404 if the
            document is not found, 500 for any other failure
    """
    # Parse the ID on its own so that only a malformed ID yields 400. A
    # ValueError raised later must not be reported as a bad document ID.
    try:
        doc_uuid = UUID(document_id)
    except ValueError as e:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Invalid document ID format: {document_id}",
        ) from e

    try:
        success = self.text_processor.delete_document(doc_uuid)

        # Do not claim success when the service reports a falsy result.
        if success:
            message = f"Document {document_id} deleted successfully"
        else:
            message = f"Document {document_id} could not be deleted"

        return DeleteDocumentResponse(
            success=success,
            message=message,
            document_id=document_id,
        )

    except DocumentNotFoundError as e:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=str(e),
        ) from e
    except Exception as e:
        # logger.exception records the traceback, which logger.error dropped.
        logger.exception("Unexpected error deleting document: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
|
|
||||||
|
|
||||||
async def health_check(self) -> HealthCheckResponse:
|
|
||||||
"""
|
|
||||||
Health check endpoint.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Health status and configuration
|
|
||||||
"""
|
|
||||||
# Note: This would ideally get info from dependencies
|
|
||||||
return HealthCheckResponse(
|
|
||||||
status="healthy",
|
|
||||||
version="1.0.0",
|
version="1.0.0",
|
||||||
supported_file_types=["pdf", "docx", "txt"],
|
docs_url="/docs",
|
||||||
available_strategies=["fixed_size", "paragraph"],
|
redoc_url="/redoc",
|
||||||
)
|
)
|
||||||
|
|
||||||
def _to_domain_strategy(self, request_strategy) -> ChunkingStrategy:
|
# Create API router
|
||||||
"""Convert API request strategy to domain model."""
|
router = APIRouter(prefix="/api/v1", tags=["Text Processing"])
|
||||||
|
|
||||||
|
|
||||||
|
def _get_service() -> ITextProcessor:
    """
    Fetch the text processor service from the bootstrap module.

    The service is obtained by calling bootstrap's
    ``get_processor_service`` directly rather than through FastAPI's
    ``Depends``.

    Returns:
        ITextProcessor: Core service instance
    """
    # Imported at call time rather than at module import time.
    # NOTE(review): presumably this avoids a circular import with
    # bootstrap -- confirm.
    from ...bootstrap import get_processor_service

    service = get_processor_service()
    return service
|
||||||
|
|
||||||
|
|
||||||
|
def _to_domain_strategy(request_strategy) -> ChunkingStrategy:
|
||||||
|
"""
|
||||||
|
Convert API request strategy to domain model.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
request_strategy: API request strategy schema
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ChunkingStrategy: Domain strategy model
|
||||||
|
"""
|
||||||
return ChunkingStrategy(
|
return ChunkingStrategy(
|
||||||
strategy_name=request_strategy.strategy_name,
|
strategy_name=request_strategy.strategy_name,
|
||||||
chunk_size=request_strategy.chunk_size,
|
chunk_size=request_strategy.chunk_size,
|
||||||
@ -332,8 +80,19 @@ class TextProcessorAPI:
|
|||||||
respect_boundaries=request_strategy.respect_boundaries,
|
respect_boundaries=request_strategy.respect_boundaries,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _to_document_response(self, document: Document) -> DocumentResponse:
|
|
||||||
"""Convert domain document to API response."""
|
def _to_document_response(document) -> DocumentResponse:
|
||||||
|
"""
|
||||||
|
Convert domain document to API response.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
document: Domain Document entity
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
DocumentResponse: API response model
|
||||||
|
"""
|
||||||
|
from .api_schemas import DocumentMetadataResponse
|
||||||
|
|
||||||
return DocumentResponse(
|
return DocumentResponse(
|
||||||
id=str(document.id),
|
id=str(document.id),
|
||||||
content=document.content,
|
content=document.content,
|
||||||
@ -349,8 +108,17 @@ class TextProcessorAPI:
|
|||||||
content_preview=document.get_content_preview(200),
|
content_preview=document.get_content_preview(200),
|
||||||
)
|
)
|
||||||
|
|
||||||
def _to_chunk_response(self, chunk: Chunk) -> ChunkResponse:
|
|
||||||
"""Convert domain chunk to API response."""
|
def _to_chunk_response(chunk) -> ChunkResponse:
|
||||||
|
"""
|
||||||
|
Convert domain chunk to API response.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
chunk: Domain Chunk entity
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ChunkResponse: API response model
|
||||||
|
"""
|
||||||
return ChunkResponse(
|
return ChunkResponse(
|
||||||
id=str(chunk.id),
|
id=str(chunk.id),
|
||||||
document_id=str(chunk.document_id),
|
document_id=str(chunk.document_id),
|
||||||
@ -361,11 +129,16 @@ class TextProcessorAPI:
|
|||||||
length=chunk.get_length(),
|
length=chunk.get_length(),
|
||||||
)
|
)
|
||||||
|
|
||||||
def _map_domain_exception(self, exception: DomainException) -> HTTPException:
|
|
||||||
|
def _map_domain_exception(exception: DomainException) -> HTTPException:
|
||||||
"""
|
"""
|
||||||
Map domain exceptions to HTTP exceptions.
|
Map domain exceptions to HTTP exceptions.
|
||||||
|
|
||||||
This is where we translate domain errors into API errors.
|
Args:
|
||||||
|
exception: Domain exception
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
HTTPException: Corresponding HTTP exception
|
||||||
"""
|
"""
|
||||||
if isinstance(exception, UnsupportedFileTypeError):
|
if isinstance(exception, UnsupportedFileTypeError):
|
||||||
return HTTPException(
|
return HTTPException(
|
||||||
@ -397,3 +170,275 @@ class TextProcessorAPI:
|
|||||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||||
detail=str(exception),
|
detail=str(exception),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/process",
    response_model=ProcessDocumentResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Process a document",
    description="Extract text from document and store it",
)
async def process_document(request: ProcessDocumentRequest) -> ProcessDocumentResponse:
    """
    Process a document endpoint.

    Converts the request into domain inputs, runs the processing use case
    and wraps the resulting document in an API response.

    Args:
        request: Processing request with file path and strategy

    Returns:
        Processing response with document details

    Raises:
        HTTPException: Mapped from a DomainException, or a 500 for any
            other failure
    """
    try:
        # Pull service from bootstrap
        service: ITextProcessor = _get_service()

        # Convert request to domain models
        # NOTE(review): file_path comes straight from the request body;
        # confirm the service restricts which paths may be read.
        file_path = Path(request.file_path)
        strategy = _to_domain_strategy(request.chunking_strategy)

        # Execute use case
        document = service.process_document(file_path, strategy)

        # Convert to response
        return ProcessDocumentResponse(
            document=_to_document_response(document)
        )

    except DomainException as e:
        raise _map_domain_exception(e) from e
    except Exception as e:
        # logger.exception records the traceback, which logger.error dropped.
        logger.exception("Unexpected error processing document: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/extract-and-chunk",
    response_model=ExtractAndChunkResponse,
    status_code=status.HTTP_200_OK,
    summary="Extract and chunk document",
    description="Extract text and split into chunks",
)
async def extract_and_chunk(
    request: ExtractAndChunkRequest,
) -> ExtractAndChunkResponse:
    """
    Extract and chunk document endpoint.

    Runs the extract-and-chunk use case and returns the chunks together
    with their count.

    Args:
        request: Extract and chunk request

    Returns:
        Response with chunks

    Raises:
        HTTPException: Mapped from a DomainException, or a 500 for any
            other failure
    """
    try:
        # Pull service from bootstrap
        service: ITextProcessor = _get_service()

        # Convert request to domain models
        # NOTE(review): file_path comes straight from the request body;
        # confirm the service restricts which paths may be read.
        file_path = Path(request.file_path)
        strategy = _to_domain_strategy(request.chunking_strategy)

        # Execute use case
        chunks = service.extract_and_chunk(file_path, strategy)

        # Convert to response
        chunk_responses = [_to_chunk_response(c) for c in chunks]

        return ExtractAndChunkResponse(
            chunks=chunk_responses,
            total_chunks=len(chunk_responses),
        )

    except DomainException as e:
        raise _map_domain_exception(e) from e
    except Exception as e:
        # logger.exception records the traceback, which logger.error dropped.
        logger.exception("Unexpected error extracting and chunking: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/documents/{document_id}",
    response_model=DocumentResponse,
    status_code=status.HTTP_200_OK,
    summary="Get document by ID",
    description="Retrieve a processed document",
)
async def get_document(document_id: str) -> DocumentResponse:
    """
    Get document by ID endpoint.

    Args:
        document_id: UUID of the document

    Returns:
        Document response

    Raises:
        HTTPException: 400 if the ID is not a valid UUID, 404 if the
            document is not found, 500 for any other failure
    """
    # Parse the ID on its own so that only a malformed ID yields 400. A
    # ValueError raised later (e.g. while building the response) must not
    # be reported as a bad document ID.
    try:
        doc_uuid = UUID(document_id)
    except ValueError as e:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Invalid document ID format: {document_id}",
        ) from e

    try:
        # Pull service from bootstrap
        service: ITextProcessor = _get_service()

        document = service.get_document(doc_uuid)
        return _to_document_response(document)

    except DocumentNotFoundError as e:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=str(e),
        ) from e
    except Exception as e:
        # logger.exception records the traceback, which logger.error dropped.
        logger.exception("Unexpected error retrieving document: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/documents",
    response_model=DocumentListResponse,
    status_code=status.HTTP_200_OK,
    summary="List all documents",
    description="Retrieve all documents with pagination",
)
async def list_documents(limit: int = 100, offset: int = 0) -> DocumentListResponse:
    """
    List documents endpoint.

    Args:
        limit: Maximum number of documents to return
        offset: Number of documents to skip

    Returns:
        List of documents with pagination info

    Raises:
        HTTPException: 500 if listing or response conversion fails
    """
    try:
        # Pull service from bootstrap
        service: ITextProcessor = _get_service()

        documents = service.list_documents(limit, offset)
        doc_responses = [_to_document_response(d) for d in documents]

        # NOTE(review): total is the number of documents in this page, not
        # a repository-wide count -- confirm that is what clients expect.
        return DocumentListResponse(
            documents=doc_responses,
            total=len(doc_responses),
            limit=limit,
            offset=offset,
        )

    except Exception as e:
        # logger.exception records the traceback, which logger.error dropped.
        logger.exception("Unexpected error listing documents: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete(
    "/documents/{document_id}",
    response_model=DeleteDocumentResponse,
    status_code=status.HTTP_200_OK,
    summary="Delete document",
    description="Delete a document by ID",
)
async def delete_document(document_id: str) -> DeleteDocumentResponse:
    """
    Delete document endpoint.

    Args:
        document_id: UUID of the document

    Returns:
        Deletion response whose message reflects the service's result

    Raises:
        HTTPException: 400 if the ID is not a valid UUID, 404 if the
            document is not found, 500 for any other failure
    """
    # Parse the ID on its own so that only a malformed ID yields 400. A
    # ValueError raised later must not be reported as a bad document ID.
    try:
        doc_uuid = UUID(document_id)
    except ValueError as e:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Invalid document ID format: {document_id}",
        ) from e

    try:
        # Pull service from bootstrap
        service: ITextProcessor = _get_service()

        success = service.delete_document(doc_uuid)

        # Do not claim success when the service reports a falsy result.
        if success:
            message = f"Document {document_id} deleted successfully"
        else:
            message = f"Document {document_id} could not be deleted"

        return DeleteDocumentResponse(
            success=success,
            message=message,
            document_id=document_id,
        )

    except DocumentNotFoundError as e:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=str(e),
        ) from e
    except Exception as e:
        # logger.exception records the traceback, which logger.error dropped.
        logger.exception("Unexpected error deleting document: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/health",
    response_model=HealthCheckResponse,
    status_code=status.HTTP_200_OK,
    summary="Health check",
    description="Check API health and configuration",
)
async def health_check() -> HealthCheckResponse:
    """
    Health check endpoint.

    The reported values are fixed literals; nothing is queried at runtime.

    Returns:
        Health status and configuration
    """
    payload = {
        "status": "healthy",
        "version": "1.0.0",
        "supported_file_types": ["pdf", "docx", "txt"],
        "available_strategies": ["fixed_size", "paragraph"],
    }
    return HealthCheckResponse(**payload)
|
||||||
|
|
||||||
|
|
||||||
|
# Include router in app
|
||||||
|
app.include_router(router)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/")
async def root():
    """Return static API metadata: name, version, description, docs URL, prefix."""
    return dict(
        name="Text Processor API",
        version="1.0.0",
        description="Text extraction and chunking system using Hexagonal Architecture",
        docs_url="/docs",
        api_prefix="/api/v1",
    )
|
||||||
|
|||||||
@ -1,15 +1,15 @@
|
|||||||
"""
|
"""
|
||||||
Bootstrap - Dependency Injection and Wiring.
|
Bootstrap - Dependency Injection with Lazy Singleton Pattern.
|
||||||
|
|
||||||
This module wires together all components of the application.
|
This module wires together the Core and Outgoing Adapters.
|
||||||
The Core never imports Adapters - only the Bootstrap does.
|
The Core never imports Adapters - only the Bootstrap does.
|
||||||
|
|
||||||
This is the ONLY place where concrete implementations are instantiated
|
The ApplicationContainer manages ONLY:
|
||||||
and injected into the domain services.
|
- Core Services
|
||||||
|
- Outgoing Adapters (Extractors, Chunkers, Repository)
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from .adapters.incoming.api_routes import TextProcessorAPI
|
|
||||||
from .adapters.outgoing.chunkers.context import ChunkingContext
|
from .adapters.outgoing.chunkers.context import ChunkingContext
|
||||||
from .adapters.outgoing.chunkers.fixed_size_chunker import FixedSizeChunker
|
from .adapters.outgoing.chunkers.fixed_size_chunker import FixedSizeChunker
|
||||||
from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
|
from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
|
||||||
@ -28,13 +28,18 @@ from .shared.logging_config import setup_logging
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton instance (lazy initialization)
|
||||||
|
_container: 'ApplicationContainer | None' = None
|
||||||
|
|
||||||
|
|
||||||
class ApplicationContainer:
|
class ApplicationContainer:
|
||||||
"""
|
"""
|
||||||
Dependency Injection Container.
|
Dependency Injection Container for Core and Outgoing Adapters.
|
||||||
|
|
||||||
|
This container manages the lifecycle and dependencies of:
|
||||||
|
- Core Domain Services
|
||||||
|
- Outgoing Adapters (Extractors, Chunkers, Repository)
|
||||||
|
|
||||||
This container manages the lifecycle and dependencies of all
|
|
||||||
application components. It follows the Dependency Inversion Principle
|
|
||||||
by depending on abstractions (ports) rather than concrete implementations.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, log_level: str = "INFO") -> None:
|
def __init__(self, log_level: str = "INFO") -> None:
|
||||||
@ -48,28 +53,25 @@ class ApplicationContainer:
|
|||||||
setup_logging(level=log_level)
|
setup_logging(level=log_level)
|
||||||
logger.info("Initializing ApplicationContainer")
|
logger.info("Initializing ApplicationContainer")
|
||||||
|
|
||||||
# Outgoing adapters
|
# Create Outgoing Adapters
|
||||||
self._repository = self._create_repository()
|
self._repository = self._create_repository()
|
||||||
self._extractor_factory = self._create_extractor_factory()
|
self._extractor_factory = self._create_extractor_factory()
|
||||||
self._chunking_context = self._create_chunking_context()
|
self._chunking_context = self._create_chunking_context()
|
||||||
|
|
||||||
# Core service
|
# Create Core Service (depends only on Ports)
|
||||||
self._text_processor_service = self._create_text_processor_service()
|
self._text_processor_service = self._create_text_processor_service()
|
||||||
|
|
||||||
# Incoming adapter
|
|
||||||
self._api = self._create_api()
|
|
||||||
|
|
||||||
logger.info("ApplicationContainer initialized successfully")
|
logger.info("ApplicationContainer initialized successfully")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def text_processor_service(self) -> ITextProcessor:
|
def text_processor_service(self) -> ITextProcessor:
|
||||||
"""Get the text processor service."""
|
"""
|
||||||
return self._text_processor_service
|
Get the text processor service.
|
||||||
|
|
||||||
@property
|
Returns:
|
||||||
def api(self) -> TextProcessorAPI:
|
ITextProcessor: Core service implementing the incoming port
|
||||||
"""Get the API adapter."""
|
"""
|
||||||
return self._api
|
return self._text_processor_service
|
||||||
|
|
||||||
def _create_repository(self) -> InMemoryDocumentRepository:
|
def _create_repository(self) -> InMemoryDocumentRepository:
|
||||||
"""
|
"""
|
||||||
@ -130,7 +132,7 @@ class ApplicationContainer:
|
|||||||
"""
|
"""
|
||||||
Create the core text processor service.
|
Create the core text processor service.
|
||||||
|
|
||||||
Injects all required dependencies (repositories, factories, contexts).
|
Injects all required dependencies via Ports (Dependency Inversion).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Configured text processor service
|
Configured text processor service
|
||||||
@ -142,24 +144,36 @@ class ApplicationContainer:
|
|||||||
repository=self._repository,
|
repository=self._repository,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _create_api(self) -> TextProcessorAPI:
|
|
||||||
"""
|
|
||||||
Create the FastAPI adapter.
|
|
||||||
|
|
||||||
Injects the text processor service.
|
def get_processor_service() -> ITextProcessor:
|
||||||
|
"""
|
||||||
|
Lazy singleton provider for the text processor service.
|
||||||
|
|
||||||
|
This function ensures the ApplicationContainer is instantiated only once
|
||||||
|
and returns the core service. API routes pull the service via this function.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Configured API adapter
|
ITextProcessor: Core service implementing the incoming port
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> service = get_processor_service()
|
||||||
|
>>> document = service.process_document(file_path, strategy)
|
||||||
"""
|
"""
|
||||||
logger.debug("Creating TextProcessorAPI")
|
global _container
|
||||||
return TextProcessorAPI(text_processor=self._text_processor_service)
|
|
||||||
|
if _container is None:
|
||||||
|
logger.info("Lazy initializing ApplicationContainer (first access)")
|
||||||
|
_container = ApplicationContainer(log_level="INFO")
|
||||||
|
|
||||||
|
return _container.text_processor_service
|
||||||
|
|
||||||
|
|
||||||
def create_application(log_level: str = "INFO") -> ApplicationContainer:
|
def create_application(log_level: str = "INFO") -> ApplicationContainer:
|
||||||
"""
|
"""
|
||||||
Factory function to create a fully wired application.
|
Factory function to create a fully wired application container.
|
||||||
|
|
||||||
This is the main entry point for dependency injection.
|
This is the main entry point for manual dependency injection.
|
||||||
|
For API routes, use get_processor_service() instead.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
log_level: Logging level for the application
|
log_level: Logging level for the application
|
||||||
@ -170,24 +184,6 @@ def create_application(log_level: str = "INFO") -> ApplicationContainer:
|
|||||||
Example:
|
Example:
|
||||||
>>> container = create_application(log_level="DEBUG")
|
>>> container = create_application(log_level="DEBUG")
|
||||||
>>> service = container.text_processor_service
|
>>> service = container.text_processor_service
|
||||||
>>> api = container.api
|
|
||||||
"""
|
"""
|
||||||
logger.info("Creating application container")
|
logger.info("Creating application container via factory")
|
||||||
return ApplicationContainer(log_level=log_level)
|
return ApplicationContainer(log_level=log_level)
|
||||||
|
|
||||||
|
|
||||||
def get_text_processor_service(
|
|
||||||
container: ApplicationContainer,
|
|
||||||
) -> ITextProcessor:
|
|
||||||
"""
|
|
||||||
Get the text processor service from container.
|
|
||||||
|
|
||||||
This is a convenience function for accessing the service.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
container: Application container
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Text processor service instance
|
|
||||||
"""
|
|
||||||
return container.text_processor_service
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user