add text api

m.dabbagh 2026-01-18 19:38:53 +03:30
parent 13b887260f
commit 90c10c79fa
3 changed files with 238 additions and 177 deletions

View File

@@ -5,10 +5,12 @@ This is the incoming adapter that translates HTTP requests into
 domain operations. Routes pull the service directly from bootstrap.
 """
 import logging
+import shutil
+import tempfile
 from pathlib import Path
 from uuid import UUID
 
-from fastapi import APIRouter, FastAPI, HTTPException, status
+from fastapi import APIRouter, FastAPI, File, Form, HTTPException, UploadFile, status
 
 from ...core.domain.exceptions import (
     ChunkingError,
@@ -93,16 +95,20 @@ def _to_document_response(document) -> DocumentResponse:
     """
     from .api_schemas import DocumentMetadataResponse
 
+    # Extract file type from display_name or source_id
+    display_name = document.metadata.display_name
+    file_type = Path(display_name).suffix.lstrip('.') if '.' in display_name else 'unknown'
+
     return DocumentResponse(
         id=str(document.id),
         content=document.content,
         metadata=DocumentMetadataResponse(
-            file_name=document.metadata.file_name,
-            file_type=document.metadata.file_type,
-            file_size_bytes=document.metadata.file_size_bytes,
+            file_name=document.metadata.display_name,
+            file_type=file_type,
+            file_size_bytes=document.metadata.size_bytes,
             created_at=document.metadata.created_at.isoformat(),
             author=document.metadata.author,
-            page_count=document.metadata.page_count,
+            page_count=None,  # Not available in new metadata model
         ),
         is_processed=document.is_processed,
         content_preview=document.get_content_preview(200),
@@ -173,66 +179,35 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
 @router.post(
-    "/process",
-    response_model=ProcessDocumentResponse,
-    status_code=status.HTTP_201_CREATED,
-    summary="Process a document",
-    description="Extract text from document and store it",
-)
-async def process_document(request: ProcessDocumentRequest) -> ProcessDocumentResponse:
-    """
-    Process a document endpoint.
-
-    Args:
-        request: Processing request with file path and strategy
-
-    Returns:
-        Processing response with document details
-
-    Raises:
-        HTTPException: If processing fails
-    """
-    try:
-        # Pull service from bootstrap
-        service: ITextProcessor = _get_service()
-
-        # Convert request to domain models
-        file_path = Path(request.file_path)
-        strategy = _to_domain_strategy(request.chunking_strategy)
-
-        # Execute use case
-        document = service.process_document(file_path, strategy)
-
-        # Convert to response
-        return ProcessDocumentResponse(
-            document=_to_document_response(document)
-        )
-    except DomainException as e:
-        raise _map_domain_exception(e)
-    except Exception as e:
-        logger.error(f"Unexpected error processing document: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Internal server error: {str(e)}",
-        )
-
-@router.post(
-    "/extract-and-chunk",
+    "/process-file",
     response_model=ExtractAndChunkResponse,
     status_code=status.HTTP_200_OK,
-    summary="Extract and chunk document",
-    description="Extract text and split into chunks",
+    summary="Process uploaded file (extraction to chunking)",
+    description="Upload a file, extract text, parse markdown, and return chunks",
 )
-async def extract_and_chunk(
-    request: ExtractAndChunkRequest,
+async def process_file(
+    file: UploadFile = File(..., description="Document file to process (pdf, docx, txt, zip)"),
+    strategy_name: str = Form(..., description="Chunking strategy name", examples=["fixed_size", "paragraph"]),
+    chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
+    overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
+    respect_boundaries: bool = Form(True, description="Respect text boundaries"),
 ) -> ExtractAndChunkResponse:
     """
-    Extract and chunk document endpoint.
+    Complete file processing pipeline: Upload → Extract → Parse → Chunk.
+
+    This endpoint handles the full document processing workflow:
+    1. Accepts file upload (PDF, DOCX, TXT, ZIP)
+    2. Extracts text content using appropriate extractor
+    3. Parses markdown structure into sections
+    4. Chunks content according to strategy
+    5. Returns chunks with metadata
 
     Args:
-        request: Extract and chunk request
+        file: Uploaded file
+        strategy_name: Name of chunking strategy
+        chunk_size: Target chunk size
+        overlap_size: Overlap between chunks
+        respect_boundaries: Whether to respect boundaries
 
     Returns:
         Response with chunks
@@ -240,20 +215,38 @@ async def extract_and_chunk(
     Raises:
         HTTPException: If extraction or chunking fails
     """
+    temp_file_path = None
     try:
         # Pull service from bootstrap
         service: ITextProcessor = _get_service()
 
-        # Convert request to domain models
-        file_path = Path(request.file_path)
-        strategy = _to_domain_strategy(request.chunking_strategy)
-
-        # Execute use case
-        chunks = service.extract_and_chunk(file_path, strategy)
+        # Create temporary file with appropriate suffix
+        suffix = Path(file.filename).suffix if file.filename else ".tmp"
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
+        temp_file_path = Path(temp_file.name)
+
+        # Copy uploaded file to temporary location
+        logger.info(f"Processing uploaded file: {file.filename}")
+        with temp_file:
+            shutil.copyfileobj(file.file, temp_file)
+
+        # Create chunking strategy
+        strategy = ChunkingStrategy(
+            strategy_name=strategy_name,
+            chunk_size=chunk_size,
+            overlap_size=overlap_size,
+            respect_boundaries=respect_boundaries,
+        )
+
+        # Execute complete pipeline: extract → parse → chunk
+        chunks = service.extract_and_chunk(temp_file_path, strategy)
 
         # Convert to response
         chunk_responses = [_to_chunk_response(c) for c in chunks]
+        logger.info(f"Successfully processed {file.filename}: {len(chunks)} chunks created")
+
         return ExtractAndChunkResponse(
             chunks=chunk_responses,
             total_chunks=len(chunk_responses),
@@ -262,149 +255,109 @@ async def extract_and_chunk
     except DomainException as e:
         raise _map_domain_exception(e)
     except Exception as e:
-        logger.error(f"Unexpected error extracting and chunking: {str(e)}")
+        logger.error(f"Unexpected error processing file: {str(e)}")
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail=f"Internal server error: {str(e)}",
         )
+    finally:
+        # Clean up temporary file
+        if temp_file_path and temp_file_path.exists():
+            try:
+                temp_file_path.unlink()
+                logger.debug(f"Cleaned up temporary file: {temp_file_path}")
+            except Exception as e:
+                logger.warning(f"Failed to delete temporary file {temp_file_path}: {str(e)}")
 
-@router.get(
-    "/documents/{document_id}",
-    response_model=DocumentResponse,
+@router.post(
+    "/process-text",
+    response_model=ExtractAndChunkResponse,
     status_code=status.HTTP_200_OK,
-    summary="Get document by ID",
-    description="Retrieve a processed document",
+    summary="Process markdown text (parse and chunk)",
+    description="Accept markdown text, parse structure, and return chunks",
 )
-async def get_document(document_id: str) -> DocumentResponse:
+async def process_text(
+    text: str = Form(..., description="Markdown text to process"),
+    strategy_name: str = Form(..., description="Chunking strategy name", examples=["fixed_size", "paragraph"]),
+    chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
+    overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
+    respect_boundaries: bool = Form(True, description="Respect text boundaries"),
+    title: str = Form("text_input", description="Optional title for the text document"),
+) -> ExtractAndChunkResponse:
     """
-    Get document by ID endpoint.
+    Process raw markdown text: Parse → Chunk.
+
+    This endpoint handles text processing workflow:
+    1. Accepts markdown text as string
+    2. Parses markdown structure into sections
+    3. Persists document to repository
+    4. Chunks content according to strategy
+    5. Returns chunks with metadata
 
     Args:
-        document_id: UUID of the document
+        text: Markdown text content
+        strategy_name: Name of chunking strategy
+        chunk_size: Target chunk size
+        overlap_size: Overlap between chunks
+        respect_boundaries: Whether to respect boundaries
+        title: Optional title for the document
 
     Returns:
-        Document response
+        Response with chunks
 
     Raises:
-        HTTPException: If document not found
+        HTTPException: If parsing or chunking fails
     """
     try:
-        # Pull service from bootstrap
+        # Basic validation at API boundary
+        if not text or not text.strip():
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Text content cannot be empty",
+            )
+
+        # Get service from bootstrap
         service: ITextProcessor = _get_service()
-        doc_uuid = UUID(document_id)
-        document = service.get_document(doc_uuid)
-        return _to_document_response(document)
-    except ValueError:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Invalid document ID format: {document_id}",
-        )
-    except DocumentNotFoundError as e:
-        raise HTTPException(
-            status_code=status.HTTP_404_NOT_FOUND,
-            detail=str(e),
-        )
+
+        # Create chunking strategy
+        strategy = ChunkingStrategy(
+            strategy_name=strategy_name,
+            chunk_size=chunk_size,
+            overlap_size=overlap_size,
+            respect_boundaries=respect_boundaries,
+        )
+
+        # Execute complete workflow through service
+        logger.info(f"Processing text input via service: {len(text)} characters")
+        chunks = service.process_text_to_chunks(
+            text=text,
+            chunking_strategy=strategy,
+            title=title,
+        )
+
+        # Convert to response
+        chunk_responses = [_to_chunk_response(c) for c in chunks]
+
+        logger.info(f"Successfully processed text: {len(chunks)} chunks created")
+        return ExtractAndChunkResponse(
+            chunks=chunk_responses,
+            total_chunks=len(chunk_responses),
+        )
+    except HTTPException:
+        raise
+    except DomainException as e:
+        raise _map_domain_exception(e)
     except Exception as e:
-        logger.error(f"Unexpected error retrieving document: {str(e)}")
+        logger.error(f"Unexpected error processing text: {str(e)}")
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail=f"Internal server error: {str(e)}",
         )
 
-@router.get(
-    "/documents",
-    response_model=DocumentListResponse,
-    status_code=status.HTTP_200_OK,
-    summary="List all documents",
-    description="Retrieve all documents with pagination",
-)
-async def list_documents(limit: int = 100, offset: int = 0) -> DocumentListResponse:
-    """
-    List documents endpoint.
-
-    Args:
-        limit: Maximum number of documents to return
-        offset: Number of documents to skip
-
-    Returns:
-        List of documents with pagination info
-    """
-    try:
-        # Pull service from bootstrap
-        service: ITextProcessor = _get_service()
-        documents = service.list_documents(limit, offset)
-        doc_responses = [_to_document_response(d) for d in documents]
-        return DocumentListResponse(
-            documents=doc_responses,
-            total=len(doc_responses),
-            limit=limit,
-            offset=offset,
-        )
-    except Exception as e:
-        logger.error(f"Unexpected error listing documents: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Internal server error: {str(e)}",
-        )
-
-@router.delete(
-    "/documents/{document_id}",
-    response_model=DeleteDocumentResponse,
-    status_code=status.HTTP_200_OK,
-    summary="Delete document",
-    description="Delete a document by ID",
-)
-async def delete_document(document_id: str) -> DeleteDocumentResponse:
-    """
-    Delete document endpoint.
-
-    Args:
-        document_id: UUID of the document
-
-    Returns:
-        Deletion response
-
-    Raises:
-        HTTPException: If document not found or deletion fails
-    """
-    try:
-        # Pull service from bootstrap
-        service: ITextProcessor = _get_service()
-        doc_uuid = UUID(document_id)
-        success = service.delete_document(doc_uuid)
-        return DeleteDocumentResponse(
-            success=success,
-            message=f"Document {document_id} deleted successfully",
-            document_id=document_id,
-        )
-    except ValueError:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Invalid document ID format: {document_id}",
-        )
-    except DocumentNotFoundError as e:
-        raise HTTPException(
-            status_code=status.HTTP_404_NOT_FOUND,
-            detail=str(e),
-        )
-    except Exception as e:
-        logger.error(f"Unexpected error deleting document: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Internal server error: {str(e)}",
-        )
 
 @router.get(
     "/health",

View File

@@ -112,3 +112,34 @@ class ITextProcessor(ABC):
             RepositoryError: If deletion fails
         """
         pass
+
+    @abstractmethod
+    def process_text_to_chunks(
+        self,
+        text: str,
+        chunking_strategy: ChunkingStrategy,
+        title: str = "text_input",
+    ) -> List[Chunk]:
+        """
+        Process raw markdown text into chunks.
+
+        This method handles the complete text processing workflow:
+        1. Parse markdown into structured sections
+        2. Create Document entity with metadata
+        3. Persist document to repository
+        4. Chunk the document according to strategy
+
+        Args:
+            text: Markdown text content to process
+            chunking_strategy: Strategy configuration for chunking
+            title: Optional title/identifier for the text input
+
+        Returns:
+            List of Chunk entities
+
+        Raises:
+            ValidationError: If text is empty or invalid
+            ChunkingError: If chunking fails
+            ProcessingError: If document processing fails
+        """
+        pass
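
Callers other than the HTTP adapter can use the new contract directly. A minimal sketch, assuming bootstrap exposes an accessor comparable to the routes' _get_service(); the import paths below are hypothetical, since this diff does not show the package layout.

# Sketch of calling the new interface method outside HTTP.
from app.bootstrap import get_service  # hypothetical import path
from app.core.domain.models import ChunkingStrategy  # hypothetical import path

service = get_service()  # returns an ITextProcessor implementation
strategy = ChunkingStrategy(
    strategy_name="fixed_size",
    chunk_size=800,
    overlap_size=80,
    respect_boundaries=True,
)
chunks = service.process_text_to_chunks(
    text="# Heading\n\nBody text to be chunked.",
    chunking_strategy=strategy,
    title="example",
)
print(f"{len(chunks)} chunks")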

View File

@@ -242,6 +242,83 @@ class DocumentProcessorService(ITextProcessor):
         extractor = self._extractor_factory.create_extractor(file_path)
         return extractor.extract(file_path)
 
+    def process_text_to_chunks(
+        self,
+        text: str,
+        chunking_strategy: ChunkingStrategy,
+        title: str = "text_input",
+    ) -> List[Chunk]:
+        """
+        Process raw markdown text into chunks.
+
+        This method handles the complete text processing workflow:
+        1. Parse markdown into structured sections
+        2. Create Document entity with metadata
+        3. Persist document to repository
+        4. Chunk the document according to strategy
+
+        Args:
+            text: Markdown text content to process
+            chunking_strategy: Strategy configuration for chunking
+            title: Optional title/identifier for the text input
+
+        Returns:
+            List of Chunk entities
+
+        Raises:
+            ValidationError: If text is empty or invalid
+            ChunkingError: If chunking fails
+            ProcessingError: If document processing fails
+        """
+        try:
+            logger.info(f"Processing text input: {len(text)} characters")
+
+            # Validate text content
+            if not text or not text.strip():
+                from ..domain.exceptions import ValidationError
+                raise ValidationError(
+                    message="Text content cannot be empty",
+                    field_name="text",
+                )
+
+            # Step 1: Parse markdown into sections
+            sections = parse_markdown(text)
+            logger.debug(f"Parsed {len(sections)} sections from text")
+
+            # Step 2: Create metadata for text input
+            from ..domain.models import DocumentMetadata, SourceType
+            metadata = DocumentMetadata(
+                source_id="text_input",
+                source_type=SourceType.WEB,  # Using WEB type for text input
+                display_name=f"{title}.md",
+                size_bytes=len(text.encode('utf-8')),
+            )
+
+            # Step 3: Create Document entity
+            document = Document(
+                raw_markdown=text,
+                sections=sections,
+                metadata=metadata,
+            )
+
+            # Validate document content
+            document.validate_content()
+
+            # Step 4: Persist document to repository
+            saved_document = self._repository.save(document)
+            logger.info(f"Text document saved with ID: {saved_document.id}")
+
+            # Step 5: Chunk the document
+            chunks = self._chunk_document(saved_document, chunking_strategy)
+
+            logger.info(f"Successfully processed text: {len(chunks)} chunks created")
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Failed to process text: {str(e)}")
+            raise
+
     def _chunk_document(
         self,
         document: Document,
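
The empty-text guard is the easiest part of this method to pin down in a test. A minimal pytest sketch; how DocumentProcessorService is constructed (repository, extractor factory, chunkers) is project wiring this diff does not show, so the construction helper below is an assumption.

# Hypothetical test sketch: whitespace-only input must raise ValidationError
# before any parsing or persistence happens.
import pytest

from app.core.domain.exceptions import ValidationError  # hypothetical import path
from app.core.domain.models import ChunkingStrategy  # hypothetical import path

def test_process_text_to_chunks_rejects_blank_text():
    service = make_service()  # hypothetical helper returning a wired DocumentProcessorService
    strategy = ChunkingStrategy(
        strategy_name="fixed_size",
        chunk_size=100,
        overlap_size=0,
        respect_boundaries=True,
    )
    with pytest.raises(ValidationError):
        service.process_text_to_chunks(text="   \n", chunking_strategy=strategy)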