Compare commits

...

4 Commits

Author      SHA1        Message                                                                     Date
m.dabbagh   2c4a59f84b  add extract endpoint                                                        2026-01-19 16:05:55 +03:30
m.dabbagh   0084ae6bc0  fix                                                                         2026-01-19 15:42:46 +03:30
m.dabbagh   e783d92eca  make chunking method enum and remove some redundant code in core and api   2026-01-19 15:19:11 +03:30
m.dabbagh   e2e1c86dd4  fix sorting and merging in zip extractor                                    2026-01-19 14:00:17 +03:30
6 changed files with 149 additions and 213 deletions

View File

@@ -20,7 +20,7 @@ from ...core.domain.exceptions import (
ProcessingError,
UnsupportedFileTypeError,
)
from ...core.domain.models import ChunkingStrategy
from ...core.domain.models import ChunkingMethod, ChunkingStrategy
from ...core.ports.incoming.text_processor import ITextProcessor
from .api_schemas import (
ChunkResponse,
@@ -65,24 +65,6 @@ def _get_service() -> ITextProcessor:
return get_processor_service()
def _to_domain_strategy(request_strategy) -> ChunkingStrategy:
"""
Convert API request strategy to domain model.
Args:
request_strategy: API request strategy schema
Returns:
ChunkingStrategy: Domain strategy model
"""
return ChunkingStrategy(
strategy_name=request_strategy.strategy_name,
chunk_size=request_strategy.chunk_size,
overlap_size=request_strategy.overlap_size,
respect_boundaries=request_strategy.respect_boundaries,
)
def _to_document_response(document) -> DocumentResponse:
"""
Convert domain document to API response.
@@ -178,6 +160,78 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
)
@router.post(
"/extract",
response_model=DocumentResponse,
status_code=status.HTTP_200_OK,
summary="Extract document from uploaded file",
description="Upload a file and extract text content with metadata",
)
async def extract_document(
file: UploadFile = File(..., description="Document file to extract (pdf, docx, txt, zip)"),
) -> DocumentResponse:
"""
Extract text content from uploaded file.
This endpoint handles file extraction only:
1. Accepts file upload (PDF, DOCX, TXT, ZIP)
2. Extracts raw text content using appropriate extractor
3. Returns Document entity with metadata (no parsing)
Args:
file: Uploaded file
Returns:
Response with extracted document
Raises:
HTTPException: If extraction fails
"""
temp_file_path = None
try:
# Pull service from bootstrap
service: ITextProcessor = _get_service()
# Create temporary directory and file with original filename
temp_dir = tempfile.mkdtemp()
original_filename = file.filename if file.filename else "uploaded_file.tmp"
temp_file_path = Path(temp_dir) / original_filename
# Copy uploaded file to temporary location
logger.info(f"Extracting uploaded file: {file.filename}")
with open(temp_file_path, 'wb') as temp_file:
shutil.copyfileobj(file.file, temp_file)
# Execute extraction only (no parsing)
document = service.extract_document(temp_file_path)
# Convert to response
document_response = _to_document_response(document)
logger.info(f"Successfully extracted {file.filename}: {len(document.raw_markdown)} characters")
return document_response
except DomainException as e:
raise _map_domain_exception(e)
except Exception as e:
logger.error(f"Unexpected error extracting file: {str(e)}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Internal server error: {str(e)}",
)
finally:
# Clean up temporary file and directory
if temp_file_path and temp_file_path.exists():
try:
temp_dir = temp_file_path.parent
shutil.rmtree(temp_dir)
logger.debug(f"Cleaned up temporary directory: {temp_dir}")
except Exception as e:
logger.warning(f"Failed to delete temporary directory: {str(e)}")
@router.post(
"/process-file",
response_model=ExtractAndChunkResponse,
@@ -187,7 +241,7 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
)
async def process_file(
file: UploadFile = File(..., description="Document file to process (pdf, docx, txt, zip)"),
strategy_name: str = Form(..., description="Chunking strategy name", examples=["fixed_size", "paragraph"]),
strategy_name: ChunkingMethod = Form(..., description="Chunking method"),
chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
respect_boundaries: bool = Form(True, description="Respect text boundaries"),
@@ -221,14 +275,14 @@ async def process_file(
# Pull service from bootstrap
service: ITextProcessor = _get_service()
# Create temporary file with appropriate suffix
suffix = Path(file.filename).suffix if file.filename else ".tmp"
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
temp_file_path = Path(temp_file.name)
# Create temporary directory and file with original filename
temp_dir = tempfile.mkdtemp()
original_filename = file.filename if file.filename else "uploaded_file.tmp"
temp_file_path = Path(temp_dir) / original_filename
# Copy uploaded file to temporary location
logger.info(f"Processing uploaded file: {file.filename}")
with temp_file:
with open(temp_file_path, 'wb') as temp_file:
shutil.copyfileobj(file.file, temp_file)
# Create chunking strategy
@@ -261,102 +315,14 @@ async def process_file(
detail=f"Internal server error: {str(e)}",
)
finally:
# Clean up temporary file
# Clean up temporary file and directory
if temp_file_path and temp_file_path.exists():
try:
temp_file_path.unlink()
logger.debug(f"Cleaned up temporary file: {temp_file_path}")
temp_dir = temp_file_path.parent
shutil.rmtree(temp_dir)
logger.debug(f"Cleaned up temporary directory: {temp_dir}")
except Exception as e:
logger.warning(f"Failed to delete temporary file {temp_file_path}: {str(e)}")
@router.post(
"/process-text",
response_model=ExtractAndChunkResponse,
status_code=status.HTTP_200_OK,
summary="Process markdown text (parse and chunk)",
description="Accept markdown text, parse structure, and return chunks",
)
async def process_text(
text: str = Form(..., description="Markdown text to process"),
strategy_name: str = Form(..., description="Chunking strategy name", examples=["fixed_size", "paragraph"]),
chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
respect_boundaries: bool = Form(True, description="Respect text boundaries"),
title: str = Form("text_input", description="Optional title for the text document"),
) -> ExtractAndChunkResponse:
"""
Process raw markdown text: Parse → Chunk.
This endpoint handles the text processing workflow:
1. Accepts markdown text as string
2. Parses markdown structure into sections
3. Persists document to repository
4. Chunks content according to strategy
5. Returns chunks with metadata
Args:
text: Markdown text content
strategy_name: Name of chunking strategy
chunk_size: Target chunk size
overlap_size: Overlap between chunks
respect_boundaries: Whether to respect boundaries
title: Optional title for the document
Returns:
Response with chunks
Raises:
HTTPException: If parsing or chunking fails
"""
try:
# Basic validation at API boundary
if not text or not text.strip():
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Text content cannot be empty",
)
# Get service from bootstrap
service: ITextProcessor = _get_service()
# Create chunking strategy
strategy = ChunkingStrategy(
strategy_name=strategy_name,
chunk_size=chunk_size,
overlap_size=overlap_size,
respect_boundaries=respect_boundaries,
)
# Execute complete workflow through service
logger.info(f"Processing text input via service: {len(text)} characters")
chunks = service.process_text_to_chunks(
text=text,
chunking_strategy=strategy,
title=title,
)
# Convert to response
chunk_responses = [_to_chunk_response(c) for c in chunks]
logger.info(f"Successfully processed text: {len(chunks)} chunks created")
return ExtractAndChunkResponse(
chunks=chunk_responses,
total_chunks=len(chunk_responses),
)
except HTTPException:
raise
except DomainException as e:
raise _map_domain_exception(e)
except Exception as e:
logger.error(f"Unexpected error processing text: {str(e)}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Internal server error: {str(e)}",
)
logger.warning(f"Failed to delete temporary directory: {str(e)}")
@router.get(
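
For reference, a minimal client-side sketch of calling the new extract endpoint. The host, port, and "/api/v1" prefix are assumptions for illustration; only the multipart `file` field and the `/extract` route come from the diff above, and the response is printed as raw JSON because the `DocumentResponse` fields are not shown here.

```python
# Minimal sketch of calling the new /extract endpoint with `requests`.
# BASE_URL (host, port, "/api/v1" prefix) is an assumed deployment address,
# not something defined in this change.
import requests

BASE_URL = "http://localhost:8000/api/v1"  # assumption

def extract(path: str) -> dict:
    """Upload a local file and return the extracted document as JSON."""
    with open(path, "rb") as fh:
        response = requests.post(f"{BASE_URL}/extract", files={"file": fh})
    response.raise_for_status()
    return response.json()

if __name__ == "__main__":
    document = extract("report.pdf")
    print(document)
```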

View File

@@ -9,14 +9,15 @@ from uuid import UUID
from pydantic import BaseModel, Field
from ...core.domain.models import ChunkingMethod
class ChunkingStrategyRequest(BaseModel):
"""Request model for chunking strategy configuration."""
strategy_name: str = Field(
strategy_name: ChunkingMethod = Field(
...,
description="Name of chunking strategy (e.g., 'fixed_size', 'paragraph')",
examples=["fixed_size", "paragraph"],
description="Chunking method (FIXED_SIZE or PARAGRAPH)",
)
chunk_size: int = Field(
...,
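
Because `strategy_name` is now typed as `ChunkingMethod`, unknown strategy names are rejected at the request boundary instead of deeper in the service. A standalone sketch of that behaviour follows; `StrategyRequest` is a trimmed-down stand-in declared locally so the example runs on its own, not the project's actual `ChunkingStrategyRequest` class.

```python
# Standalone sketch: an enum-typed strategy_name rejects unknown values at validation time.
# ChunkingMethod mirrors the enum added in this change; StrategyRequest is a local stand-in.
from enum import Enum
from pydantic import BaseModel, Field, ValidationError

class ChunkingMethod(str, Enum):
    FIXED_SIZE = "fixed_size"
    PARAGRAPH = "paragraph"

class StrategyRequest(BaseModel):
    strategy_name: ChunkingMethod = Field(..., description="Chunking method")
    chunk_size: int = Field(..., ge=1, le=10000)

req = StrategyRequest(strategy_name="paragraph", chunk_size=500)
print(req.strategy_name is ChunkingMethod.PARAGRAPH)  # True: the raw string is coerced to the enum

try:
    StrategyRequest(strategy_name="semantic", chunk_size=500)  # not a ChunkingMethod value
except ValidationError:
    print("rejected: not one of", [m.value for m in ChunkingMethod])
```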

View File

@@ -5,6 +5,7 @@ This adapter implements the IExtractor port for ZIP files containing
Markdown documents. It merges all .md files into a single document.
"""
import logging
import re
import zipfile
from pathlib import Path
from typing import List
@@ -184,7 +185,7 @@ class ZipExtractor(IExtractor):
)
# Join all parts with proper spacing
return "".join(merged_parts).strip()
return "\n".join(merged_parts).strip()
except EmptyContentError:
raise
@@ -234,11 +235,29 @@ class ZipExtractor(IExtractor):
if filename.lower().endswith('.md'):
md_files.append(filename)
# Sort alphabetically for deterministic order
md_files.sort()
# Sort using natural/numeric order (page_1, page_2, ..., page_10)
md_files.sort(key=self._natural_sort_key)
return md_files
def _natural_sort_key(self, filename: str):
"""
Generate a natural sort key for proper numeric ordering.
Converts numeric parts to integers for correct sorting:
- 'page_1.md' < 'page_2.md' < 'page_10.md'
Args:
filename: Filename to generate sort key for
Returns:
List of alternating strings and integers for natural sorting
"""
def convert(text):
return int(text) if text.isdigit() else text.lower()
return [convert(c) for c in re.split(r'(\d+)', filename)]
def _extract_file_content(
self,
zip_file: zipfile.ZipFile,

View File

@@ -19,6 +19,12 @@ class SourceType(str, Enum):
WEB = "web"
class ChunkingMethod(str, Enum):
"""Enumeration of supported chunking methods."""
FIXED_SIZE = "fixed_size"
PARAGRAPH = "paragraph"
class SourceFile(BaseModel):
"""
Represents the raw input file before processing.
@@ -429,12 +435,12 @@ class ChunkingStrategy(BaseModel):
Configuration for a chunking strategy.
Attributes:
strategy_name: Name of the chunking strategy
strategy_name: Chunking method (fixed_size or paragraph)
chunk_size: Target size for chunks (in characters)
overlap_size: Number of characters to overlap between chunks
respect_boundaries: Whether to respect sentence/paragraph boundaries
"""
strategy_name: str = Field(..., min_length=1, description="Strategy name")
strategy_name: ChunkingMethod = Field(..., description="Chunking method")
chunk_size: int = Field(..., ge=1, le=10000, description="Target chunk size")
overlap_size: int = Field(default=0, ge=0, description="Overlap between chunks")
respect_boundaries: bool = Field(
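
Since `ChunkingMethod` subclasses `str`, callers that already pass plain strings keep working while unknown names now fail fast. The sketch below re-declares the enum locally and shows the behaviour this relies on.

```python
# Standalone sketch of the str-backed ChunkingMethod enum added above.
from enum import Enum

class ChunkingMethod(str, Enum):
    FIXED_SIZE = "fixed_size"
    PARAGRAPH = "paragraph"

# str subclassing means members compare equal to their raw values...
assert ChunkingMethod.FIXED_SIZE == "fixed_size"
# ...and raw values can be looked up back to the corresponding member.
assert ChunkingMethod("paragraph") is ChunkingMethod.PARAGRAPH

# Unknown names raise immediately instead of silently reaching the chunkers.
try:
    ChunkingMethod("semantic")
except ValueError as exc:
    print(exc)  # 'semantic' is not a valid ChunkingMethod

print([m.value for m in ChunkingMethod])  # ['fixed_size', 'paragraph']
```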

View File

@@ -66,50 +66,24 @@ class ITextProcessor(ABC):
pass
@abstractmethod
def get_document(self, document_id: UUID) -> Document:
def extract_document(self, file_path: Path) -> Document:
"""
Retrieve a document by its ID.
Extract text content from document without parsing or chunking.
This method only performs extraction:
1. Extracts raw text content from file
2. Creates Document entity with metadata
3. Returns Document with raw_markdown (no sections)
Args:
document_id: Unique identifier of the document
file_path: Path to the document file
Returns:
Document entity
Document entity with raw markdown
Raises:
DocumentNotFoundError: If document doesn't exist
RepositoryError: If retrieval fails
"""
pass
@abstractmethod
def list_documents(self, limit: int = 100, offset: int = 0) -> List[Document]:
"""
List documents with pagination.
Args:
limit: Maximum number of documents to return
offset: Number of documents to skip
Returns:
List of Document entities
"""
pass
@abstractmethod
def delete_document(self, document_id: UUID) -> bool:
"""
Delete a document by its ID.
Args:
document_id: Unique identifier of the document
Returns:
True if deletion was successful
Raises:
DocumentNotFoundError: If document doesn't exist
RepositoryError: If deletion fails
ExtractionError: If text extraction fails
UnsupportedFileTypeError: If file type is not supported
"""
pass
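
To illustrate the narrowed port, here is a toy version of the extract-only contract. The `DocumentStub` dataclass and the built-in exceptions are simplifications standing in for the project's `Document` model and the domain exceptions named in the docstring; only the method shape mirrors the port.

```python
# Toy sketch of the extract-only contract behind ITextProcessor.extract_document.
# DocumentStub, ValueError, and RuntimeError are stand-ins for the project's
# Document entity, UnsupportedFileTypeError, and ExtractionError.
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DocumentStub:
    title: str
    raw_markdown: str  # extraction output only; no parsed sections yet

def extract_document(file_path: Path) -> DocumentStub:
    """Extract raw text from a file without parsing or chunking it."""
    if file_path.suffix.lower() != ".txt":
        # UnsupportedFileTypeError in the real port
        raise ValueError(f"Unsupported file type: {file_path.suffix}")
    try:
        text = file_path.read_text(encoding="utf-8")
    except OSError as exc:
        # ExtractionError in the real port
        raise RuntimeError(f"Extraction failed: {exc}")
    return DocumentStub(title=file_path.stem, raw_markdown=text)
```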

View File

@@ -165,67 +165,37 @@ class DocumentProcessorService(ITextProcessor):
logger.error(f"Failed to extract and chunk: {str(e)}")
raise
def get_document(self, document_id: UUID) -> Document:
def extract_document(self, file_path: Path) -> Document:
"""
Retrieve a document by its ID.
Extract text content from document without parsing or chunking.
This method only performs extraction:
1. Extracts raw text content from file
2. Creates Document entity with metadata
3. Returns Document with raw_markdown (no sections)
Args:
document_id: Unique identifier of the document
file_path: Path to the document file
Returns:
Document entity
Document entity with raw markdown
Raises:
DocumentNotFoundError: If document doesn't exist
RepositoryError: If retrieval fails
ExtractionError: If text extraction fails
UnsupportedFileTypeError: If file type is not supported
"""
logger.debug(f"Retrieving document: {document_id}")
document = self._repository.find_by_id(document_id)
if document is None:
raise DocumentNotFoundError(str(document_id))
return document
def list_documents(self, limit: int = 100, offset: int = 0) -> List[Document]:
"""
List documents with pagination.
Args:
limit: Maximum number of documents to return
offset: Number of documents to skip
Returns:
List of Document entities
"""
logger.debug(f"Listing documents: limit={limit}, offset={offset}")
return self._repository.find_all(limit=limit, offset=offset)
def delete_document(self, document_id: UUID) -> bool:
"""
Delete a document by its ID.
Args:
document_id: Unique identifier of the document
Returns:
True if deletion was successful
Raises:
DocumentNotFoundError: If document doesn't exist
RepositoryError: If deletion fails
"""
logger.info(f"Deleting document: {document_id}")
if not self._repository.exists(document_id):
raise DocumentNotFoundError(str(document_id))
return self._repository.delete(document_id)
try:
logger.info(f"Extracting document: {file_path}")
document = self._extract_document(file_path)
logger.info(f"Successfully extracted {len(document.raw_markdown)} characters")
return document
except Exception as e:
logger.error(f"Failed to extract document: {str(e)}")
raise
def _extract_document(self, file_path: Path) -> Document:
"""
Extract Document using appropriate extractor.
Internal helper: Extract Document using appropriate extractor.
Extractors create Document entities with raw_markdown and metadata.
Sections will be parsed later in the pipeline.