add s3 storage

2026-01-20 12:46:47 +03:30 · 2026-01-20 12:46:47 +03:30 · 91f8035043
commit 91f8035043
parent 0c09c79a2e
12 changed files with 607 additions and 18 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -0,0 +1,3 @@
 # Default ignored files
 /shelf/
 /workspace.xml
--- a/requirements.txt
+++ b/requirements.txt
@ -8,10 +8,22 @@ marko==2.1.2
 # Web Framework
 fastapi==0.115.6
 uvicorn[standard]==0.34.0
 # Utilities
 python-multipart==0.0.20
 # Document Processing - Extractors
 PyPDF2==3.0.1          # PDF extraction
 python-docx==1.1.2     # DOCX extraction
 # Cloud Storage
 boto3==1.35.94         # AWS S3 integration
 botocore==1.35.94      # AWS SDK core (installed with boto3)
 # Environment Variables
 python-dotenv==1.0.1   # Load .env files
 # HTTP Client (for testing)
 requests==2.32.3       # API testing scripts
 # Development Dependencies (optional)
 pytest==8.3.4
 pytest-asyncio==0.24.0
@ -19,3 +31,7 @@ httpx==0.28.1
 black==24.10.0
 ruff==0.8.5
 mypy==1.14.0
 # Type Stubs for Development
 types-requests==2.32.0.20241016
 boto3-stubs[s3]==1.35.94
--- a/src/adapters/incoming/api_routes.py
+++ b/src/adapters/incoming/api_routes.py
@ -20,6 +20,7 @@ from typing import Iterator, List, Optional
 from fastapi import APIRouter, Depends, FastAPI, File, Form, HTTPException, UploadFile, status
 from fastapi.responses import JSONResponse
 from ...core.config import get_settings
 from ...core.domain.exceptions import (
    ChunkingError,
    DomainException,
@ -45,6 +46,9 @@ logger = logging.getLogger(__name__)
 # Application Setup
 # =============================================================================
 # Load settings
 settings = get_settings()
 app = FastAPI(
    title="Text Processor API",
    description="Text extraction and chunking system using Hexagonal Architecture",
@ -194,6 +198,7 @@ def to_document_response(document: Document) -> DocumentResponse:
        ),
        is_processed=document.is_processed,
        content_preview=document.get_content_preview(200),
        download_url=document.download_url,
    )
--- a/src/adapters/incoming/api_schemas.py
+++ b/src/adapters/incoming/api_schemas.py
@ -88,6 +88,10 @@ class DocumentResponse(BaseModel):
        ...,
        description="Preview of content (first 200 chars)",
    )
    download_url: Optional[str] = Field(
        None,
        description="Presigned URL for downloading the markdown file (expires in 1 hour)",
    )
 class ChunkResponse(BaseModel):
--- a/src/adapters/outgoing/extractors/markdown_extractor.py
+++ b/src/adapters/outgoing/extractors/markdown_extractor.py
@ -0,0 +1,186 @@
 """
 Markdown Extractor - Concrete implementation for Markdown file extraction.
 This adapter implements the IExtractor port for .md files.
 """
 import logging
 from pathlib import Path
 from typing import List
 from ....core.domain.exceptions import (
    EmptyContentError,
    ExtractionError,
 )
 from ....core.domain.models import Document, DocumentMetadata, SourceType
 from ....core.ports.outgoing.extractor import IExtractor
 logger = logging.getLogger(__name__)
 class MarkdownExtractor(IExtractor):
    """
    Concrete Markdown extractor for .md files.
    This adapter:
    1. Reads .md files directly
    2. Handles multiple encodings
    3. Returns Document with raw markdown content
    """
    def __init__(self) -> None:
        """Initialize Markdown extractor."""
        self._supported_extensions = ['md', 'markdown']
        self._encodings = ['utf-8', 'utf-16', 'latin-1', 'cp1252']
        logger.debug("MarkdownExtractor initialized")
    def extract(self, file_path: Path) -> Document:
        """
        Extract text content from Markdown file.
        Args:
            file_path: Path to the .md file
        Returns:
            Document entity with raw markdown and metadata
        Raises:
            ExtractionError: If extraction fails
            EmptyContentError: If file is empty
        """
        try:
            logger.info(f"Extracting markdown file: {file_path}")
            # Validate file
            self._validate_file(file_path)
            # Read markdown content
            markdown_text = self._read_file(file_path)
            # Validate content
            if not markdown_text or not markdown_text.strip():
                raise EmptyContentError(file_path=str(file_path))
            # Create metadata
            metadata = self._create_metadata(file_path)
            # Build document with raw_markdown
            document = Document(raw_markdown=markdown_text, metadata=metadata)
            logger.info(
                f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
            )
            return document
        except EmptyContentError:
            raise
        except ExtractionError:
            raise
        except Exception as e:
            logger.error(f"Markdown extraction failed for {file_path}: {str(e)}")
            raise ExtractionError(
                message=f"Failed to extract markdown from {file_path.name}",
                details=str(e),
                file_path=str(file_path),
            )
    def supports_file_type(self, file_extension: str) -> bool:
        """
        Check if this extractor supports Markdown files.
        Args:
            file_extension: File extension (e.g., 'md', 'markdown')
        Returns:
            True if Markdown files are supported
        """
        return file_extension.lower() in self._supported_extensions
    def get_supported_types(self) -> List[str]:
        """
        Get list of supported file extensions.
        Returns:
            List containing 'md' and 'markdown'
        """
        return self._supported_extensions.copy()
    def _validate_file(self, file_path: Path) -> None:
        """
        Validate file exists and is readable.
        Args:
            file_path: Path to validate
        Raises:
            ExtractionError: If file is invalid
        """
        if not file_path.exists():
            raise ExtractionError(
                message=f"File not found: {file_path}",
                file_path=str(file_path),
            )
        if not file_path.is_file():
            raise ExtractionError(
                message=f"Path is not a file: {file_path}",
                file_path=str(file_path),
            )
        if file_path.stat().st_size == 0:
            raise EmptyContentError(file_path=str(file_path))
    def _read_file(self, file_path: Path) -> str:
        """
        Read file content with encoding detection.
        Args:
            file_path: Path to markdown file
        Returns:
            File content as string
        Raises:
            ExtractionError: If reading fails
        """
        # Try multiple encodings
        for encoding in self._encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    text = f.read()
                logger.debug(f"Successfully read {file_path.name} with {encoding}")
                return text
            except UnicodeDecodeError:
                continue
            except Exception as e:
                logger.error(f"Error reading {file_path.name}: {str(e)}")
                raise ExtractionError(
                    message=f"Failed to read file: {file_path.name}",
                    details=str(e),
                    file_path=str(file_path),
                )
        # If all encodings fail
        raise ExtractionError(
            message=f"Failed to decode {file_path.name} with any supported encoding",
            file_path=str(file_path),
        )
    def _create_metadata(self, file_path: Path) -> DocumentMetadata:
        """
        Create document metadata from markdown file.
        Args:
            file_path: Path to the markdown file
        Returns:
            DocumentMetadata entity
        """
        stat = file_path.stat()
        return DocumentMetadata(
            source_id=str(file_path.absolute()),
            source_type=SourceType.FILE,
            display_name=file_path.name,
            size_bytes=stat.st_size,
        )
--- a/src/adapters/outgoing/storage/init.py
+++ b/src/adapters/outgoing/storage/init.py
@ -0,0 +1 @@
 """Outgoing storage adapters."""
--- a/src/adapters/outgoing/storage/s3_storage_adapter.py
+++ b/src/adapters/outgoing/storage/s3_storage_adapter.py
@ -0,0 +1,216 @@
 """
 S3 Storage Adapter - Concrete implementation using AWS S3.
 This adapter implements the IFileStorage port using boto3 for AWS S3 operations.
 """
 import logging
 from typing import Optional
 import boto3
 from botocore.exceptions import ClientError
 from ....core.config import Settings
 from ....core.domain.exceptions import ProcessingError
 from ....core.ports.outgoing.file_storage import IFileStorage
 logger = logging.getLogger(__name__)
 class S3StorageAdapter(IFileStorage):
    """
    Concrete S3 storage adapter implementation.
    This adapter:
    1. Uploads content to AWS S3
    2. Generates presigned URLs for secure downloads
    3. Handles S3 operations with proper error handling
    """
    def __init__(self, settings: Settings) -> None:
        """
        Initialize S3 storage adapter.
        Args:
            settings: Application settings with S3 configuration
        Raises:
            ValueError: If S3 bucket is not configured
        """
        if not settings.S3_BUCKET:
            raise ValueError("S3_BUCKET must be configured in settings")
        self.bucket_name = settings.S3_BUCKET
        self.region = settings.S3_REGION
        self.expiration_seconds = settings.S3_PRESIGNED_URL_EXPIRATION
        self.upload_path_prefix = settings.S3_UPLOAD_PATH_PREFIX
        # Build boto3 client config
        client_kwargs = {"region_name": self.region}
        # Add custom endpoint (for MinIO)
        if settings.S3_ENDPOINT_URL:
            client_kwargs["endpoint_url"] = settings.S3_ENDPOINT_URL
            logger.debug(f"Using custom S3 endpoint: {settings.S3_ENDPOINT_URL}")
        # Add credentials if provided (otherwise boto3 uses default credential chain)
        if settings.S3_ACCESS_KEY and settings.S3_SECRET_KEY:
            client_kwargs["aws_access_key_id"] = settings.S3_ACCESS_KEY
            client_kwargs["aws_secret_access_key"] = settings.S3_SECRET_KEY
            logger.debug("Using explicit S3 credentials from settings")
        else:
            logger.debug("Using default AWS credential chain (IAM role, AWS CLI, etc.)")
        # Initialize S3 client
        self.s3_client = boto3.client("s3", **client_kwargs)
        logger.info(
            f"S3StorageAdapter initialized: bucket={self.bucket_name}, "
            f"region={self.region}, expiration={self.expiration_seconds}s"
        )
    def upload_content(self, content: str, destination_path: str) -> str:
        """
        Upload text content to S3 and return presigned download URL.
        Args:
            content: Text content to upload (markdown)
            destination_path: S3 object key (e.g., "extractions/doc123.md")
        Returns:
            Presigned download URL valid for 1 hour
        Raises:
            ProcessingError: If upload fails
        """
        try:
            logger.info(f"Uploading content to S3: {destination_path}")
            # Upload content to S3
            self.s3_client.put_object(
                Bucket=self.bucket_name,
                Key=destination_path,
                Body=content.encode("utf-8"),
                ContentType="text/markdown",
                ContentDisposition=f'attachment; filename="{self._get_filename(destination_path)}"',
            )
            logger.info(f"Successfully uploaded to S3: s3://{self.bucket_name}/{destination_path}")
            # Generate presigned URL
            presigned_url = self._generate_presigned_url(destination_path)
            logger.info(f"Generated presigned URL (expires in {self.expiration_seconds}s)")
            return presigned_url
        except ClientError as e:
            error_code = e.response.get("Error", {}).get("Code", "Unknown")
            logger.error(f"S3 upload failed: {error_code} - {str(e)}")
            raise ProcessingError(
                message=f"Failed to upload to S3: {error_code}",
                details=str(e),
            )
        except Exception as e:
            logger.error(f"Unexpected error during S3 upload: {str(e)}")
            raise ProcessingError(
                message="Failed to upload to S3",
                details=str(e),
            )
    def delete_file(self, file_path: str) -> bool:
        """
        Delete a file from S3.
        Args:
            file_path: S3 object key to delete
        Returns:
            True if deleted successfully, False otherwise
        """
        try:
            logger.info(f"Deleting file from S3: {file_path}")
            self.s3_client.delete_object(
                Bucket=self.bucket_name,
                Key=file_path,
            )
            logger.info(f"Successfully deleted from S3: {file_path}")
            return True
        except ClientError as e:
            logger.error(f"S3 delete failed: {str(e)}")
            return False
        except Exception as e:
            logger.error(f"Unexpected error during S3 delete: {str(e)}")
            return False
    def file_exists(self, file_path: str) -> bool:
        """
        Check if a file exists in S3.
        Args:
            file_path: S3 object key to check
        Returns:
            True if file exists, False otherwise
        """
        try:
            self.s3_client.head_object(
                Bucket=self.bucket_name,
                Key=file_path,
            )
            return True
        except ClientError as e:
            if e.response["Error"]["Code"] == "404":
                return False
            logger.error(f"Error checking S3 file existence: {str(e)}")
            return False
        except Exception as e:
            logger.error(f"Unexpected error checking S3 file: {str(e)}")
            return False
    def _generate_presigned_url(self, object_key: str) -> str:
        """
        Generate presigned URL for S3 object.
        Args:
            object_key: S3 object key
        Returns:
            Presigned download URL
        Raises:
            ProcessingError: If URL generation fails
        """
        try:
            presigned_url = self.s3_client.generate_presigned_url(
                "get_object",
                Params={
                    "Bucket": self.bucket_name,
                    "Key": object_key,
                },
                ExpiresIn=self.expiration_seconds,
            )
            return presigned_url
        except ClientError as e:
            logger.error(f"Failed to generate presigned URL: {str(e)}")
            raise ProcessingError(
                message="Failed to generate download URL",
                details=str(e),
            )
    def _get_filename(self, path: str) -> str:
        """
        Extract filename from path.
        Args:
            path: Full path (e.g., "extractions/doc123.md")
        Returns:
            Filename only (e.g., "doc123.md")
        """
        return path.split("/")[-1] if "/" in path else path
--- a/src/bootstrap.py
+++ b/src/bootstrap.py
@ -7,6 +7,7 @@ The Core never imports Adapters - only the Bootstrap does.
 The ApplicationContainer manages ONLY:
 - Core Services
 - Outgoing Adapters (Extractors, Chunkers, Repository)
 - Configuration (Settings)
 """
 import logging
@ -22,6 +23,8 @@ from .adapters.outgoing.extractors.zip_extractor import ZipExtractor
 from .adapters.outgoing.persistence.in_memory_repository import (
    InMemoryDocumentRepository,
 )
 from .adapters.outgoing.storage.s3_storage_adapter import S3StorageAdapter
 from .core.config import Settings, get_settings
 from .core.ports.incoming.text_processor import ITextProcessor
 from .core.services.document_processor_service import DocumentProcessorService
 from .shared.logging_config import setup_logging
@ -39,32 +42,47 @@ class ApplicationContainer:
    Dependency Injection Container for Core and Outgoing Adapters.
    This container manages the lifecycle and dependencies of:
    - Configuration (Settings)
    - Core Domain Services
-    - Outgoing Adapters (Extractors, Chunkers, Repository)
+    - Outgoing Adapters (Extractors, Chunkers, Repository, Storage)
    """
-    def __init__(self, log_level: str = "INFO") -> None:
+    def __init__(self, settings: Settings | None = None) -> None:
        """
        Initialize the application container.
        Args:
-            log_level: Logging level for the application
+            settings: Application settings (uses singleton if not provided)
        """
-        # Setup logging first
+        # Load settings (singleton)
-        setup_logging(level=log_level)
+        self._settings = settings or get_settings()
        # Setup logging
        setup_logging(level=self._settings.LOG_LEVEL)
        logger.info("Initializing ApplicationContainer")
        logger.debug(f"Configuration: bucket={self._settings.S3_BUCKET}, region={self._settings.S3_REGION}")
        # Create Outgoing Adapters
        self._repository = self._create_repository()
        self._extractor_factory = self._create_extractor_factory()
        self._chunking_context = self._create_chunking_context()
        self._file_storage = self._create_file_storage()
        # Create Core Service (depends only on Ports)
        self._text_processor_service = self._create_text_processor_service()
        logger.info("ApplicationContainer initialized successfully")
    @property
    def settings(self) -> Settings:
        """
        Get application settings.
        Returns:
            Settings: Application configuration
        """
        return self._settings
    @property
    def text_processor_service(self) -> ITextProcessor:
        """
@ -132,6 +150,16 @@ class ApplicationContainer:
        return context
    def _create_file_storage(self) -> S3StorageAdapter:
        """
        Create and configure the file storage adapter.
        Returns:
            Configured S3 storage adapter
        """
        logger.debug("Creating S3StorageAdapter")
        return S3StorageAdapter(settings=self._settings)
    def _create_text_processor_service(self) -> DocumentProcessorService:
        """
        Create the core text processor service.
@ -146,6 +174,8 @@ class ApplicationContainer:
            extractor_factory=self._extractor_factory,
            chunking_context=self._chunking_context,
            repository=self._repository,
            file_storage=self._file_storage,
            settings=self._settings,
        )
@ -167,12 +197,12 @@ def get_processor_service() -> ITextProcessor:
    if _container is None:
        logger.info("Lazy initializing ApplicationContainer (first access)")
-        _container = ApplicationContainer(log_level="INFO")
+        _container = ApplicationContainer()
    return _container.text_processor_service
-def create_application(log_level: str = "INFO") -> ApplicationContainer:
+def create_application(settings: Settings | None = None) -> ApplicationContainer:
    """
    Factory function to create a fully wired application container.
@ -180,14 +210,14 @@ def create_application(log_level: str = "INFO") -> ApplicationContainer:
    For API routes, use get_processor_service() instead.
    Args:
-        log_level: Logging level for the application
+        settings: Application settings (uses singleton if not provided)
    Returns:
        Configured application container
    Example:
-        >>> container = create_application(log_level="DEBUG")
+        >>> container = create_application()
        >>> service = container.text_processor_service
    """
    logger.info("Creating application container via factory")
-    return ApplicationContainer(log_level=log_level)
+    return ApplicationContainer(settings=settings)
--- a/src/core/config.py
+++ b/src/core/config.py
@ -0,0 +1,41 @@
 """S3 Configuration."""
 from typing import Optional
 from pydantic_settings import BaseSettings, SettingsConfigDict
 class Settings(BaseSettings):
    """S3/MinIO settings loaded from environment variables or .env file."""
    S3_BUCKET: str = "bi-chatbot"
    S3_REGION: str = "us-east-1"
    S3_ACCESS_KEY: Optional[str] = "bi-chatbot"
    S3_SECRET_KEY: Optional[str] = "9ixloSaqtYTkfmrJzE"
    S3_ENDPOINT_URL: Optional[str] = "https://cdn.d.aiengines.ir"
    S3_PRESIGNED_URL_EXPIRATION: int = 3600
    S3_UPLOAD_PATH_PREFIX: str = "extractions"
    LOG_LEVEL: str = "INFO"
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=True,
        extra="ignore",
    )
    def get_s3_path(self, document_id: str) -> str:
        """Generate S3 path for a document."""
        return f"{self.S3_UPLOAD_PATH_PREFIX}/{document_id}.md"
 _settings_instance: Optional[Settings] = None
 def get_settings() -> Settings:
    """Get or create singleton settings instance."""
    global _settings_instance
    if _settings_instance is None:
        _settings_instance = Settings()
    return _settings_instance
--- a/src/core/domain/models.py
+++ b/src/core/domain/models.py
@ -231,6 +231,8 @@ class Document(BaseModel):
        sections: Parsed structured sections from Markdown
        metadata: Associated metadata
        is_processed: Flag indicating if document has been processed
        download_url: Optional presigned URL for downloading the markdown file
    """
    id: UUID = Field(default_factory=uuid4, description="Unique document ID")
    raw_markdown: str = Field(..., description="Raw Markdown content")
@ -240,6 +242,7 @@ class Document(BaseModel):
    )
    metadata: DocumentMetadata = Field(..., description="Document metadata")
    is_processed: bool = Field(default=False, description="Processing status")
    download_url: Optional[str] = Field(None, description="Presigned download URL")
    model_config = {
        "frozen": False,  # Allow mutation for processing status
--- a/src/core/ports/outgoing/file_storage.py
+++ b/src/core/ports/outgoing/file_storage.py
@ -0,0 +1,59 @@
 """
 Outgoing Port - File Storage Interface.
 This port defines the contract for storing files and generating download URLs.
 Implementations can use S3, Azure Blob, GCS, or local filesystem.
 """
 from abc import ABC, abstractmethod
 class IFileStorage(ABC):
    """
    Port interface for file storage operations.
    This abstraction allows the core domain to store files without
    depending on specific storage implementations (S3, Azure, etc.).
    """
    @abstractmethod
    def upload_content(self, content: str, destination_path: str) -> str:
        """
        Upload text content to storage and return a download URL.
        Args:
            content: Text content to upload (e.g., markdown)
            destination_path: Destination path in storage (e.g., "extractions/doc123.md")
        Returns:
            Secure download URL (presigned URL for cloud storage)
        Raises:
            StorageError: If upload fails
        """
        pass
    @abstractmethod
    def delete_file(self, file_path: str) -> bool:
        """
        Delete a file from storage.
        Args:
            file_path: Path to file in storage
        Returns:
            True if deleted successfully, False otherwise
        """
        pass
    @abstractmethod
    def file_exists(self, file_path: str) -> bool:
        """
        Check if a file exists in storage.
        Args:
            file_path: Path to file in storage
        Returns:
            True if file exists, False otherwise
        """
        pass
--- a/src/core/services/document_processor_service.py
+++ b/src/core/services/document_processor_service.py
@ -9,6 +9,7 @@ from pathlib import Path
 from typing import List
 from uuid import UUID
 from ..config import Settings
 from ..domain import logic_utils
 from ..domain.exceptions import (
    DocumentNotFoundError,
@ -20,6 +21,7 @@ from ..domain.models import Chunk, ChunkingStrategy, Document, SourceFile
 from ..ports.incoming.text_processor import ITextProcessor
 from ..ports.outgoing.chunking_context import IChunkingContext
 from ..ports.outgoing.extractor_factory import IExtractorFactory
 from ..ports.outgoing.file_storage import IFileStorage
 from ..ports.outgoing.repository import IDocumentRepository
@ -39,6 +41,8 @@ class DocumentProcessorService(ITextProcessor):
        extractor_factory: IExtractorFactory,
        chunking_context: IChunkingContext,
        repository: IDocumentRepository,
        file_storage: IFileStorage,
        settings: Settings,
    ) -> None:
        """
        Initialize the document processor service.
@ -47,10 +51,14 @@ class DocumentProcessorService(ITextProcessor):
            extractor_factory: Factory for creating appropriate extractors
            chunking_context: Context for managing chunking strategies
            repository: Repository for document persistence
            file_storage: File storage for uploading extracted content
            settings: Application settings for configuration
        """
        self._extractor_factory = extractor_factory
        self._chunking_context = chunking_context
        self._repository = repository
        self._file_storage = file_storage
        self._settings = settings
        logger.info("DocumentProcessorService initialized")
    def extract_and_chunk(
@ -102,28 +110,45 @@ class DocumentProcessorService(ITextProcessor):
    def extract_document(self, file_path: Path) -> Document:
        """
-        Extract text content from document without parsing or chunking.
+        Extract text content from document and upload to S3.
-        This method only performs extraction:
+        This method:
        1. Extracts raw text content from file
-        2. Creates Document entity with metadata
+        2. Uploads markdown to S3
-        3. Returns Document with raw_markdown (no sections)
+        3. Generates presigned download URL
        4. Returns Document with raw_markdown and download_url
        Args:
            file_path: Path to the document file
        Returns:
-            Document entity with raw markdown
+            Document entity with raw markdown and download URL
        Raises:
            ExtractionError: If text extraction fails
            UnsupportedFileTypeError: If file type is not supported
            ProcessingError: If S3 upload fails
        """
        try:
            logger.info(f"Extracting document: {file_path}")
            # Extract document
            document = self._extract_document(file_path)
            logger.info(f"Successfully extracted {len(document.raw_markdown)} characters")
            # Upload to S3 and get download URL
            destination_path = self._settings.get_s3_path(str(document.id))
            download_url = self._file_storage.upload_content(
                content=document.raw_markdown,
                destination_path=destination_path,
            )
            # Update document with download URL
            document = document.model_copy(update={"download_url": download_url})
            logger.info(f"Uploaded to S3 and generated download URL")
            return document
        except Exception as e:
            logger.error(f"Failed to extract document: {str(e)}")
            raise