"""
Example Usage Script - Demonstrates how to use the Text Processor.

This script shows how to use the text processor programmatically
without going through the HTTP API.
"""
|
|
import tempfile
from pathlib import Path

from src.bootstrap import create_application
from src.core.domain.models import ChunkingStrategy
|
|
|
|
|
|
# Text of the throw-away document processed by the walkthrough. It is
# stripped of leading/trailing whitespace before being written to disk.
_SAMPLE_TEXT = """
The Hexagonal Architecture Pattern

Introduction
Hexagonal Architecture, also known as Ports and Adapters, is a software design
pattern that aims to create loosely coupled application components. The pattern
was invented by Alistair Cockburn in 2005.

Core Concepts
The main idea is to isolate the core business logic from external concerns like
databases, user interfaces, and external services. This is achieved through the
use of ports and adapters.

Ports are interfaces that define how the application core interacts with the
outside world. Adapters are implementations of these ports that connect the
application to specific technologies.

Benefits
The benefits of this architecture include improved testability, flexibility,
and maintainability. By isolating the core logic, we can easily swap
implementations without affecting the business rules.

Conclusion
Hexagonal Architecture is a powerful pattern for building maintainable and
flexible applications. It promotes clean separation of concerns and makes
testing much easier.
"""


def _print_chunk_details(chunks, limit=3):
    """Print a summary of the first *limit* chunks and how many were omitted."""
    print(" Chunk Details:")
    print(" " + "-" * 66)
    for chunk in chunks[:limit]:
        print(f" Chunk #{chunk.sequence_number}")
        print(f" - Length: {chunk.get_length()} characters")
        print(f" - Position: {chunk.start_char} to {chunk.end_char}")
        print(f" - Preview: {chunk.content[:80]}...")
        print(" " + "-" * 66)

    if len(chunks) > limit:
        print(f" ... and {len(chunks) - limit} more chunks\n")


def _run_demo(service, sample_file):
    """Run steps 3-7 of the walkthrough against *service*.

    Processes ``sample_file`` with a fixed-size strategy, chunks it again with
    a paragraph strategy, then retrieves, lists and deletes the processed
    document, printing the outcome of every step.

    Args:
        service: The ``text_processor_service`` taken from the application
            container.
        sample_file: Path of the text file to process.
    """
    # Step 3: Process document with fixed-size chunking
    print("3. Processing document with FIXED SIZE strategy...")
    fixed_strategy = ChunkingStrategy(
        strategy_name="fixed_size",
        chunk_size=300,
        overlap_size=50,
        respect_boundaries=True,
    )
    document = service.process_document(
        file_path=sample_file,
        chunking_strategy=fixed_strategy,
    )

    print(f" Document ID: {document.id}")
    print(f" Metadata: {document.get_metadata_summary()}")
    print(f" Processed: {document.is_processed}")
    print(f" Content length: {len(document.content)} characters")
    print(f" Preview: {document.get_content_preview(100)}...\n")

    # Step 4: Extract and chunk with paragraph strategy
    print("4. Extracting and chunking with PARAGRAPH strategy...")
    paragraph_strategy = ChunkingStrategy(
        strategy_name="paragraph",
        chunk_size=500,
        overlap_size=0,
        respect_boundaries=True,
    )
    chunks = service.extract_and_chunk(
        file_path=sample_file,
        chunking_strategy=paragraph_strategy,
    )
    print(f" ✓ Created {len(chunks)} chunks\n")

    # Display chunk information (first 3 chunks only)
    _print_chunk_details(chunks, limit=3)

    # Step 5: Retrieve the document
    print("5. Retrieving document from repository...")
    retrieved = service.get_document(document.id)
    print(f" ✓ Retrieved document: {retrieved.id}")
    print(f" ✓ Content matches: {retrieved.content == document.content}\n")

    # Step 6: List all documents
    print("6. Listing all documents...")
    all_docs = service.list_documents(limit=10)
    print(f" ✓ Found {len(all_docs)} document(s) in repository")
    for doc in all_docs:
        print(f" - {doc.metadata.file_name} ({doc.metadata.file_type})")
    print()

    # Step 7: Delete the document
    print("7. Cleaning up - deleting document...")
    deleted = service.delete_document(document.id)
    print(f" ✓ Document deleted: {deleted}\n")

    # Verify deletion
    remaining = service.list_documents()
    print(f" ✓ Remaining documents: {len(remaining)}\n")


def _print_key_takeaways():
    """Print the closing banner and the list of key takeaways."""
    print("=" * 70)
    print("Example completed successfully!")
    print("=" * 70)
    print()
    print("Key Takeaways:")
    print("1. Core domain is completely isolated from adapters")
    print("2. Dependencies are injected through bootstrap")
    print("3. Easy to swap implementations (strategies, extractors)")
    print("4. Rich domain models with built-in validation")
    print("5. Clear separation between API models and domain models")
    print()


def main() -> None:
    """Main example function.

    Builds the application container, writes a sample document into a
    temporary directory, runs the text processor walkthrough against it and
    prints a summary. Progress is reported on stdout.

    Raises:
        Exception: Any error raised during the walkthrough is printed and then
            re-raised unchanged.
    """
    print("=" * 70)
    print("Text Processor - Hexagonal Architecture Example")
    print("=" * 70)
    print()

    # Step 1: Create application container with dependency injection
    print("1. Initializing application container...")
    container = create_application(log_level="INFO")
    service = container.text_processor_service
    print(" ✓ Container initialized\n")

    # Step 2: Create a sample text file for demonstration
    print("2. Creating sample text file...")
    # Use a private temporary directory so an existing "sample_document.txt"
    # in the current working directory is never overwritten or deleted.
    with tempfile.TemporaryDirectory() as scratch_dir:
        sample_file = Path(scratch_dir) / "sample_document.txt"
        # Explicit encoding: do not depend on the platform's locale default.
        sample_file.write_text(_SAMPLE_TEXT.strip(), encoding="utf-8")
        print(f" ✓ Created sample file: {sample_file}\n")

        try:
            _run_demo(service, sample_file)
        except Exception as e:
            # Top-level boundary: report the failure, then let it propagate.
            print(f" ✗ Error: {e}\n")
            raise
        finally:
            # Clean up sample file
            if sample_file.exists():
                sample_file.unlink()
                print(" ✓ Cleaned up sample file\n")

    # Only reached on success, because the handler above re-raises.
    _print_key_takeaways()
|
|
|
|
|
|
# Run the walkthrough only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|