add /chunk route

2026-01-19 21:54:23 +03:30 · 2026-01-19 21:54:23 +03:30 · 6086ddf818
commit 6086ddf818
parent 2c4a59f84b
6 changed files with 155 additions and 96 deletions
--- a/src/adapters/incoming/api_routes.py
+++ b/src/adapters/incoming/api_routes.py
@ -8,6 +8,7 @@ import logging
 import shutil
 import tempfile
 from pathlib import Path
+from typing import Optional
 from uuid import UUID

 from fastapi import APIRouter, FastAPI, File, Form, HTTPException, UploadFile, status
@ -28,7 +29,7 @@ from .api_schemas import (
    DocumentListResponse,
    DocumentResponse,
    ExtractAndChunkRequest,
-    ExtractAndChunkResponse,
+    ChunkListResponse,
    HealthCheckResponse,
    ProcessDocumentRequest,
    ProcessDocumentResponse,
@ -160,6 +161,149 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
        )


+@router.post(
+    "/chunk",
+    response_model=ChunkListResponse,
+    status_code=status.HTTP_200_OK,
+    summary="Process Markdown from file upload or text input",
+    description="Unified endpoint: upload .md file or paste markdown text, then parse and chunk",
+)
+async def perform_chunking(
+    file: Optional[UploadFile] = File(None, description="Markdown file (.md) to upload"),
+    text: Optional[str] = Form(None, description="Markdown text to process", json_schema_extra={"x-textarea": True}),
+    strategy_name: ChunkingMethod = Form(..., description="Chunking method"),
+    chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
+    overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
+    respect_boundaries: bool = Form(True, description="Respect text boundaries"),
+    title: str = Form("markdown_input", description="Optional title for the document"),
+) -> ChunkListResponse:
+    """
+    Chunk Markdown supplied either as an uploaded .md file or as raw text.
+
+    Exactly one of ``file`` or ``text`` must be provided:
+    1. **File Upload**: the upload is written into a private temporary
+       directory and handed to ``service.extract_and_chunk``
+    2. **Text Input**: the text is passed to
+       ``service.process_text_to_chunks`` together with ``title``
+
+    Args:
+        file: Optional .md file upload
+        text: Optional markdown text input
+        strategy_name: Chunking method
+        chunk_size: Target chunk size in characters (1-10000)
+        overlap_size: Overlap between chunks (>= 0)
+        respect_boundaries: Whether to respect text boundaries
+        title: Title for the document; only used for text input
+
+    Returns:
+        ChunkListResponse holding the converted chunks and their count
+
+    Raises:
+        HTTPException: 400 when the source is missing, ambiguous, not a
+            .md file, or blank; the mapped error for a DomainException;
+            500 for any other failure
+    """
+    # Tracked separately from the file path so that cleanup still runs when
+    # the directory was created but writing the upload into it failed.
+    temp_dir: Optional[Path] = None
+
+    try:
+        # Validation: Ensure exactly one source is provided
+        if not file and not text:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Either 'file' or 'text' must be provided",
+            )
+
+        if file and text:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Provide either 'file' or 'text', not both",
+            )
+
+        # Get service from bootstrap
+        service: ITextProcessor = _get_service()
+
+        # Create chunking strategy
+        strategy = ChunkingStrategy(
+            strategy_name=strategy_name,
+            chunk_size=chunk_size,
+            overlap_size=overlap_size,
+            respect_boundaries=respect_boundaries,
+        )
+
+        # File Logic: Delegate to extract_and_chunk via MarkdownExtractor
+        if file is not None:
+            # Validate file extension
+            if not file.filename or not file.filename.lower().endswith('.md'):
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail="Unsupported file type. Only .md files are accepted",
+                )
+
+            # The client controls the filename: keep only its final path
+            # component so values such as "../../x.md" or "/abs/x.md" cannot
+            # make the write below land outside the temporary directory.
+            safe_name = Path(file.filename).name
+            temp_dir = Path(tempfile.mkdtemp())
+            temp_file_path = temp_dir / safe_name
+
+            # Save uploaded file to temporary location
+            logger.info(f"Processing uploaded markdown file: {file.filename}")
+            with open(temp_file_path, 'wb') as temp_file:
+                shutil.copyfileobj(file.file, temp_file)
+
+            # Delegate to extract_and_chunk (uses MarkdownExtractor)
+            chunks = service.extract_and_chunk(temp_file_path, strategy)
+
+        # Text Logic: Process text directly
+        else:
+            logger.info("Processing markdown text input")
+
+            # Validate content is not empty
+            if not text or not text.strip():
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail="Markdown content cannot be empty",
+                )
+
+            # Process text through service
+            chunks = service.process_text_to_chunks(
+                text=text,
+                chunking_strategy=strategy,
+                title=title,
+            )
+
+        # Convert to response
+        chunk_responses = [_to_chunk_response(c) for c in chunks]
+
+        logger.info(f"Successfully processed markdown: {len(chunks)} chunks created")
+
+        return ChunkListResponse(
+            chunks=chunk_responses,
+            total_chunks=len(chunk_responses),
+        )
+
+    except HTTPException:
+        raise
+    except DomainException as e:
+        raise _map_domain_exception(e) from e
+    except Exception as e:
+        logger.error(f"Unexpected error processing markdown: {str(e)}", exc_info=True)
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Internal server error: {str(e)}",
+        ) from e
+    finally:
+        # Remove the temporary directory (and any upload inside it) if one was created
+        if temp_dir is not None:
+            try:
+                shutil.rmtree(temp_dir)
+                logger.debug(f"Cleaned up temporary directory: {temp_dir}")
+            except Exception as e:
+                logger.warning(f"Failed to delete temporary directory: {str(e)}")
+
+
@router.post(
    "/extract",
    response_model=DocumentResponse,
@ -234,7 +378,7 @@ async def extract_document(

@router.post(
    "/process-file",
-    response_model=ExtractAndChunkResponse,
+    response_model=ChunkListResponse,
    status_code=status.HTTP_200_OK,
    summary="Process uploaded file (extraction to chunking)",
    description="Upload a file, extract text, parse markdown, and return chunks",
@ -245,7 +389,7 @@ async def process_file(
    chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
    overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
    respect_boundaries: bool = Form(True, description="Respect text boundaries"),
-) -> ExtractAndChunkResponse:
+) -> ChunkListResponse:
    """
    Complete file processing pipeline: Upload → Extract → Parse → Chunk.

@ -301,7 +445,7 @@ async def process_file(

        logger.info(f"Successfully processed {file.filename}: {len(chunks)} chunks created")

-        return ExtractAndChunkResponse(
+        return ChunkListResponse(
            chunks=chunk_responses,
            total_chunks=len(chunk_responses),
        )
@ -342,7 +486,7 @@ async def health_check() -> HealthCheckResponse:
    return HealthCheckResponse(
        status="healthy",
        version="1.0.0",
-        supported_file_types=["pdf", "docx", "txt", "zip"],
+        supported_file_types=["pdf", "docx", "txt", "md", "markdown", "zip"],
        available_strategies=["fixed_size", "paragraph"],
    )

--- a/src/adapters/incoming/api_schemas.py
+++ b/src/adapters/incoming/api_schemas.py
@ -109,12 +109,12 @@ class ProcessDocumentResponse(BaseModel):
    message: str = Field(default="Document processed successfully")


-class ExtractAndChunkResponse(BaseModel):
+class ChunkListResponse(BaseModel):
    """Response model for extract and chunk operation."""

    chunks: List[ChunkResponse]
    total_chunks: int
-    message: str = Field(default="Document extracted and chunked successfully")
+    message: str = Field(default="Document chunked successfully")


 class DocumentListResponse(BaseModel):
--- a/src/bootstrap.py
+++ b/src/bootstrap.py
@ -15,6 +15,7 @@ from .adapters.outgoing.chunkers.fixed_size_chunker import FixedSizeChunker
 from .adapters.outgoing.chunkers.paragraph_chunker import ParagraphChunker
 from .adapters.outgoing.extractors.docx_extractor import DocxExtractor
 from .adapters.outgoing.extractors.factory import ExtractorFactory
+from .adapters.outgoing.extractors.markdown_extractor import MarkdownExtractor
 from .adapters.outgoing.extractors.pdf_extractor import PDFExtractor
 from .adapters.outgoing.extractors.txt_extractor import TxtExtractor
 from .adapters.outgoing.extractors.zip_extractor import ZipExtractor
@ -100,6 +101,7 @@ class ApplicationContainer:
        factory.register_extractor(PDFExtractor())
        factory.register_extractor(DocxExtractor())
        factory.register_extractor(TxtExtractor())
+        factory.register_extractor(MarkdownExtractor())
        factory.register_extractor(ZipExtractor())

        logger.info(
--- a/src/core/domain/models.py
+++ b/src/core/domain/models.py
@ -17,6 +17,7 @@ class SourceType(str, Enum):
    """Enumeration of supported source types."""
    FILE = "file"
    WEB = "web"
+    TEXT = "text"


 class ChunkingMethod(str, Enum):
--- a/src/core/ports/incoming/text_processor.py
+++ b/src/core/ports/incoming/text_processor.py
@ -20,29 +20,6 @@ class ITextProcessor(ABC):
    the entry point into the core domain logic.
    """

-    @abstractmethod
-    def process_document(
-        self,
-        file_path: Path,
-        chunking_strategy: ChunkingStrategy,
-    ) -> Document:
-        """
-        Process a document by extracting text and storing it.
-
-        Args:
-            file_path: Path to the document file
-            chunking_strategy: Strategy configuration for chunking
-
-        Returns:
-            Processed Document entity
-
-        Raises:
-            ExtractionError: If text extraction fails
-            ProcessingError: If document processing fails
-            UnsupportedFileTypeError: If file type is not supported
-        """
-        pass
-
    @abstractmethod
    def extract_and_chunk(
        self,
--- a/src/core/services/document_processor_service.py
+++ b/src/core/services/document_processor_service.py
@ -53,71 +53,6 @@ class DocumentProcessorService(ITextProcessor):
        self._repository = repository
        logger.info("DocumentProcessorService initialized")

-    def process_document(
-        self,
-        file_path: Path,
-        chunking_strategy: ChunkingStrategy,
-    ) -> Document:
-        """
-        Process a document using the stateless pipeline.
-
-        Pipeline Order:
-        1. Extract Document with raw_markdown and metadata (via Adapter)
-        2. Parse Markdown into DocumentSection objects
-        3. Update Document with sections
-        4. Validate and persist Document
-        5. Mark as processed
-
-        Args:
-            file_path: Path to the document file
-            chunking_strategy: Strategy configuration (for metadata)
-
-        Returns:
-            Fully processed Document entity
-
-        Raises:
-            ExtractionError: If text extraction fails
-            ProcessingError: If document processing fails
-            UnsupportedFileTypeError: If file type is not supported
-        """
-        try:
-            logger.info(f"Processing document: {file_path}")
-
-            # Step 1: Extract Document with raw_markdown and metadata
-            document = self._extract_document(file_path)
-
-            # Step 2: Parse Markdown into structured sections
-            sections = parse_markdown(document.raw_markdown)
-            logger.debug(f"Parsed {len(sections)} sections from document")
-
-            # Step 3: Update Document with sections
-            document = document.model_copy(update={"sections": sections})
-
-            # Step 4: Validate document content
-            document.validate_content()
-
-            # Step 5: Persist to repository
-            saved_document = self._repository.save(document)
-
-            # Step 6: Mark as processed
-            saved_document.mark_as_processed()
-            self._repository.save(saved_document)
-
-            logger.info(
-                f"Document processed successfully: {saved_document.id} "
-                f"({len(sections)} sections)"
-            )
-            return saved_document
-
-        except ExtractionError:
-            raise
-        except Exception as e:
-            logger.error(f"Failed to process document: {str(e)}")
-            raise ProcessingError(
-                message="Document processing failed",
-                details=str(e),
-            )
-
    def extract_and_chunk(
        self,
        file_path: Path,
@ -260,7 +195,7 @@ class DocumentProcessorService(ITextProcessor):

            metadata = DocumentMetadata(
                source_id="text_input",
-                source_type=SourceType.WEB,  # Using WEB type for text input
+                source_type=SourceType.TEXT,
                display_name=f"{title}.md",
                size_bytes=len(text.encode('utf-8')),
            )