add text api

2026-01-18 19:38:53 +03:30 · 2026-01-18 19:38:53 +03:30 · 90c10c79fa
commit 90c10c79fa
parent 13b887260f
3 changed files with 238 additions and 177 deletions
--- a/src/adapters/incoming/api_routes.py
+++ b/src/adapters/incoming/api_routes.py
@ -5,10 +5,12 @@ This is the incoming adapter that translates HTTP requests into
 domain operations. Routes pull the service directly from bootstrap.
 """
 import logging
+import shutil
+import tempfile
 from pathlib import Path
 from uuid import UUID

-from fastapi import APIRouter, FastAPI, HTTPException, status
+from fastapi import APIRouter, FastAPI, File, Form, HTTPException, UploadFile, status

 from ...core.domain.exceptions import (
    ChunkingError,
@ -93,16 +95,20 @@ def _to_document_response(document) -> DocumentResponse:
    """
    from .api_schemas import DocumentMetadataResponse

+    # Extract file type from the display_name extension ('unknown' when the name contains no dot)
+    display_name = document.metadata.display_name
+    file_type = Path(display_name).suffix.lstrip('.') if '.' in display_name else 'unknown'
+
    return DocumentResponse(
        id=str(document.id),
        content=document.content,
        metadata=DocumentMetadataResponse(
-            file_name=document.metadata.file_name,
-            file_type=document.metadata.file_type,
-            file_size_bytes=document.metadata.file_size_bytes,
+            file_name=document.metadata.display_name,
+            file_type=file_type,
+            file_size_bytes=document.metadata.size_bytes,
            created_at=document.metadata.created_at.isoformat(),
            author=document.metadata.author,
-            page_count=document.metadata.page_count,
+            page_count=None,  # Not available in new metadata model
        ),
        is_processed=document.is_processed,
        content_preview=document.get_content_preview(200),
@ -173,66 +179,35 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:


@router.post(
-    "/process",
-    response_model=ProcessDocumentResponse,
-    status_code=status.HTTP_201_CREATED,
-    summary="Process a document",
-    description="Extract text from document and store it",
-)
-async def process_document(request: ProcessDocumentRequest) -> ProcessDocumentResponse:
-    """
-    Process a document endpoint.
-
-    Args:
-        request: Processing request with file path and strategy
-
-    Returns:
-        Processing response with document details
-
-    Raises:
-        HTTPException: If processing fails
-    """
-    try:
-        # Pull service from bootstrap
-        service: ITextProcessor = _get_service()
-
-        # Convert request to domain models
-        file_path = Path(request.file_path)
-        strategy = _to_domain_strategy(request.chunking_strategy)
-
-        # Execute use case
-        document = service.process_document(file_path, strategy)
-
-        # Convert to response
-        return ProcessDocumentResponse(
-            document=_to_document_response(document)
-        )
-
-    except DomainException as e:
-        raise _map_domain_exception(e)
-    except Exception as e:
-        logger.error(f"Unexpected error processing document: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Internal server error: {str(e)}",
-        )
-
-
-@router.post(
-    "/extract-and-chunk",
+    "/process-file",
    response_model=ExtractAndChunkResponse,
    status_code=status.HTTP_200_OK,
-    summary="Extract and chunk document",
-    description="Extract text and split into chunks",
+    summary="Process uploaded file (extraction to chunking)",
+    description="Upload a file, extract text, parse markdown, and return chunks",
 )
-async def extract_and_chunk(
-    request: ExtractAndChunkRequest,
+async def process_file(
+    file: UploadFile = File(..., description="Document file to process (pdf, docx, txt, zip)"),
+    strategy_name: str = Form(..., description="Chunking strategy name", examples=["fixed_size", "paragraph"]),
+    chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
+    overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
+    respect_boundaries: bool = Form(True, description="Respect text boundaries"),
 ) -> ExtractAndChunkResponse:
    """
-    Extract and chunk document endpoint.
+    Complete file processing pipeline: Upload → Extract → Parse → Chunk.
+
+    This endpoint handles the full document processing workflow:
+    1. Accepts file upload (PDF, DOCX, TXT, ZIP)
+    2. Extracts text content using appropriate extractor
+    3. Parses markdown structure into sections
+    4. Chunks content according to strategy
+    5. Returns chunks with metadata

    Args:
-        request: Extract and chunk request
+        file: Uploaded file
+        strategy_name: Name of chunking strategy
+        chunk_size: Target chunk size
+        overlap_size: Overlap between chunks
+        respect_boundaries: Whether to respect boundaries

    Returns:
        Response with chunks
@ -240,20 +215,38 @@ async def extract_and_chunk(
    Raises:
        HTTPException: If extraction or chunking fails
    """
+    temp_file_path = None
+
    try:
        # Pull service from bootstrap
        service: ITextProcessor = _get_service()

-        # Convert request to domain models
-        file_path = Path(request.file_path)
-        strategy = _to_domain_strategy(request.chunking_strategy)
+        # Create temporary file with appropriate suffix
+        suffix = Path(file.filename).suffix if file.filename else ".tmp"
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
+        temp_file_path = Path(temp_file.name)

-        # Execute use case
-        chunks = service.extract_and_chunk(file_path, strategy)
+        # Copy uploaded file to temporary location
+        logger.info(f"Processing uploaded file: {file.filename}")
+        with temp_file:
+            shutil.copyfileobj(file.file, temp_file)
+
+        # Create chunking strategy
+        strategy = ChunkingStrategy(
+            strategy_name=strategy_name,
+            chunk_size=chunk_size,
+            overlap_size=overlap_size,
+            respect_boundaries=respect_boundaries,
+        )
+
+        # Execute complete pipeline: extract → parse → chunk
+        chunks = service.extract_and_chunk(temp_file_path, strategy)

        # Convert to response
        chunk_responses = [_to_chunk_response(c) for c in chunks]

+        logger.info(f"Successfully processed {file.filename}: {len(chunks)} chunks created")
+
        return ExtractAndChunkResponse(
            chunks=chunk_responses,
            total_chunks=len(chunk_responses),
@ -262,149 +255,109 @@ async def extract_and_chunk(
    except DomainException as e:
        raise _map_domain_exception(e)
    except Exception as e:
-        logger.error(f"Unexpected error extracting and chunking: {str(e)}")
+        logger.error(f"Unexpected error processing file: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        )
+    finally:
+        # Clean up temporary file
+        if temp_file_path and temp_file_path.exists():
+            try:
+                temp_file_path.unlink()
+                logger.debug(f"Cleaned up temporary file: {temp_file_path}")
+            except Exception as e:
+                logger.warning(f"Failed to delete temporary file {temp_file_path}: {str(e)}")


-@router.get(
-    "/documents/{document_id}",
-    response_model=DocumentResponse,
+@router.post(
+    "/process-text",
+    response_model=ExtractAndChunkResponse,
    status_code=status.HTTP_200_OK,
-    summary="Get document by ID",
-    description="Retrieve a processed document",
+    summary="Process markdown text (parse and chunk)",
+    description="Accept markdown text, parse structure, and return chunks",
 )
-async def get_document(document_id: str) -> DocumentResponse:
+async def process_text(
+    text: str = Form(..., description="Markdown text to process"),
+    strategy_name: str = Form(..., description="Chunking strategy name", examples=["fixed_size", "paragraph"]),
+    chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
+    overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
+    respect_boundaries: bool = Form(True, description="Respect text boundaries"),
+    title: str = Form("text_input", description="Optional title for the text document"),
+) -> ExtractAndChunkResponse:
    """
-    Get document by ID endpoint.
+    Process raw markdown text: Parse → Chunk.
+
+    This endpoint handles text processing workflow:
+    1. Accepts markdown text as string
+    2. Parses markdown structure into sections
+    3. Persists document to repository
+    4. Chunks content according to strategy
+    5. Returns chunks with metadata

    Args:
-        document_id: UUID of the document
+        text: Markdown text content
+        strategy_name: Name of chunking strategy
+        chunk_size: Target chunk size
+        overlap_size: Overlap between chunks
+        respect_boundaries: Whether to respect boundaries
+        title: Optional title for the document

    Returns:
-        Document response
+        Response with chunks

    Raises:
-        HTTPException: If document not found
+        HTTPException: If parsing or chunking fails
    """
    try:
-        # Pull service from bootstrap
-        service: ITextProcessor = _get_service()
-
-        doc_uuid = UUID(document_id)
-        document = service.get_document(doc_uuid)
-        return _to_document_response(document)
-
-    except ValueError:
+        # Basic validation at API boundary
+        if not text or not text.strip():
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Invalid document ID format: {document_id}",
-        )
-    except DocumentNotFoundError as e:
-        raise HTTPException(
-            status_code=status.HTTP_404_NOT_FOUND,
-            detail=str(e),
-        )
-    except Exception as e:
-        logger.error(f"Unexpected error retrieving document: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Internal server error: {str(e)}",
+                detail="Text content cannot be empty",
            )

-
-@router.get(
-    "/documents",
-    response_model=DocumentListResponse,
-    status_code=status.HTTP_200_OK,
-    summary="List all documents",
-    description="Retrieve all documents with pagination",
-)
-async def list_documents(limit: int = 100, offset: int = 0) -> DocumentListResponse:
-    """
-    List documents endpoint.
-
-    Args:
-        limit: Maximum number of documents to return
-        offset: Number of documents to skip
-
-    Returns:
-        List of documents with pagination info
-    """
-    try:
-        # Pull service from bootstrap
+        # Get service from bootstrap
        service: ITextProcessor = _get_service()

-        documents = service.list_documents(limit, offset)
-        doc_responses = [_to_document_response(d) for d in documents]
-
-        return DocumentListResponse(
-            documents=doc_responses,
-            total=len(doc_responses),
-            limit=limit,
-            offset=offset,
+        # Create chunking strategy
+        strategy = ChunkingStrategy(
+            strategy_name=strategy_name,
+            chunk_size=chunk_size,
+            overlap_size=overlap_size,
+            respect_boundaries=respect_boundaries,
        )

+        # Execute complete workflow through service
+        logger.info(f"Processing text input via service: {len(text)} characters")
+        chunks = service.process_text_to_chunks(
+            text=text,
+            chunking_strategy=strategy,
+            title=title,
+        )
+
+        # Convert to response
+        chunk_responses = [_to_chunk_response(c) for c in chunks]
+
+        logger.info(f"Successfully processed text: {len(chunks)} chunks created")
+
+        return ExtractAndChunkResponse(
+            chunks=chunk_responses,
+            total_chunks=len(chunk_responses),
+        )
+
+    except HTTPException:
+        raise
+    except DomainException as e:
+        raise _map_domain_exception(e)
    except Exception as e:
-        logger.error(f"Unexpected error listing documents: {str(e)}")
+        logger.error(f"Unexpected error processing text: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        )


-@router.delete(
-    "/documents/{document_id}",
-    response_model=DeleteDocumentResponse,
-    status_code=status.HTTP_200_OK,
-    summary="Delete document",
-    description="Delete a document by ID",
-)
-async def delete_document(document_id: str) -> DeleteDocumentResponse:
-    """
-    Delete document endpoint.
-
-    Args:
-        document_id: UUID of the document
-
-    Returns:
-        Deletion response
-
-    Raises:
-        HTTPException: If document not found or deletion fails
-    """
-    try:
-        # Pull service from bootstrap
-        service: ITextProcessor = _get_service()
-
-        doc_uuid = UUID(document_id)
-        success = service.delete_document(doc_uuid)
-
-        return DeleteDocumentResponse(
-            success=success,
-            message=f"Document {document_id} deleted successfully",
-            document_id=document_id,
-        )
-
-    except ValueError:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Invalid document ID format: {document_id}",
-        )
-    except DocumentNotFoundError as e:
-        raise HTTPException(
-            status_code=status.HTTP_404_NOT_FOUND,
-            detail=str(e),
-        )
-    except Exception as e:
-        logger.error(f"Unexpected error deleting document: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Internal server error: {str(e)}",
-        )
-

@router.get(
    "/health",
--- a/src/core/ports/incoming/text_processor.py
+++ b/src/core/ports/incoming/text_processor.py
@ -112,3 +112,34 @@ class ITextProcessor(ABC):
            RepositoryError: If deletion fails
        """
        pass
+
+    @abstractmethod
+    def process_text_to_chunks(
+        self,
+        text: str,
+        chunking_strategy: ChunkingStrategy,
+        title: str = "text_input",
+    ) -> List[Chunk]:
+        """
+        Process raw markdown text into chunks.
+
+        This method handles the complete text processing workflow:
+        1. Parse markdown into structured sections
+        2. Create Document entity with metadata
+        3. Persist document to repository
+        4. Chunk the document according to strategy
+
+        Args:
+            text: Markdown text content to process
+            chunking_strategy: Strategy configuration for chunking
+            title: Optional title/identifier for the text input
+
+        Returns:
+            List of Chunk entities
+
+        Raises:
+            ValidationError: If text is empty or invalid
+            ChunkingError: If chunking fails
+            ProcessingError: If document processing fails
+        """
+        pass
--- a/src/core/services/document_processor_service.py
+++ b/src/core/services/document_processor_service.py
@ -242,6 +242,83 @@ class DocumentProcessorService(ITextProcessor):
        extractor = self._extractor_factory.create_extractor(file_path)
        return extractor.extract(file_path)

+    def process_text_to_chunks(
+        self,
+        text: str,
+        chunking_strategy: ChunkingStrategy,
+        title: str = "text_input",
+    ) -> List[Chunk]:
+        """
+        Process raw markdown text into chunks.
+
+        This method handles the complete text processing workflow:
+        1. Parse markdown into structured sections
+        2. Create Document entity with metadata
+        3. Persist document to repository
+        4. Chunk the document according to strategy
+
+        Args:
+            text: Markdown text content to process
+            chunking_strategy: Strategy configuration for chunking
+            title: Title for the text input (default "text_input"); used as the document display name with ".md" appended
+
+        Returns:
+            List of Chunk entities
+
+        Raises:
+            ValidationError: If text is empty or invalid
+            ChunkingError: If chunking fails
+            ProcessingError: If document processing fails
+        """
+        try:
+            logger.info(f"Processing text input: {len(text)} characters")
+
+            # Validate text content
+            if not text or not text.strip():
+                from ..domain.exceptions import ValidationError
+                raise ValidationError(
+                    message="Text content cannot be empty",
+                    field_name="text",
+                )
+
+            # Step 1: Parse markdown into sections
+            sections = parse_markdown(text)
+            logger.debug(f"Parsed {len(sections)} sections from text")
+
+            # Step 2: Create metadata for text input
+            from ..domain.models import DocumentMetadata, SourceType
+
+            metadata = DocumentMetadata(
+                source_id="text_input",
+                source_type=SourceType.WEB,  # Using WEB type for text input
+                display_name=f"{title}.md",
+                size_bytes=len(text.encode('utf-8')),
+            )
+
+            # Step 3: Create Document entity
+            document = Document(
+                raw_markdown=text,
+                sections=sections,
+                metadata=metadata,
+            )
+
+            # Validate document content
+            document.validate_content()
+
+            # Step 4: Persist document to repository
+            saved_document = self._repository.save(document)
+            logger.info(f"Text document saved with ID: {saved_document.id}")
+
+            # Step 5: Chunk the document
+            chunks = self._chunk_document(saved_document, chunking_strategy)
+
+            logger.info(f"Successfully processed text: {len(chunks)} chunks created")
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Failed to process text: {str(e)}")
+            raise
+
    def _chunk_document(
        self,
        document: Document,