add extract endpoint

2026-01-19 16:05:55 +03:30 · 2026-01-19 16:05:55 +03:30 · 2c4a59f84b
commit 2c4a59f84b
parent 0084ae6bc0
3 changed files with 123 additions and 1 deletions
--- a/src/adapters/incoming/api_routes.py
+++ b/src/adapters/incoming/api_routes.py
@ -160,6 +160,78 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
        )
def _safe_upload_filename(filename) -> str:
    """
    Reduce a client-supplied upload name to a bare file name.

    The name of an uploaded file is chosen by the client, so it may contain
    directory components ("../../x"), be an absolute path, or be missing
    altogether. Only the final path component is kept so that the result can
    be joined onto a directory without escaping it.

    Args:
        filename: File name as sent by the client (str, empty, or None)

    Returns:
        Final path component of the name, or "uploaded_file.tmp" when nothing
        usable remains
    """
    fallback = "uploaded_file.tmp"
    if not filename:
        return fallback
    # Treat both "/" and "\" as separators: a Windows-style name is a single
    # path component on POSIX and would otherwise pass through unchanged.
    base_name = filename.replace("\\", "/").rsplit("/", 1)[-1]
    if base_name in ("", ".", ".."):
        return fallback
    return base_name


@router.post(
    "/extract",
    response_model=DocumentResponse,
    status_code=status.HTTP_200_OK,
    summary="Extract document from uploaded file",
    description="Upload a file and extract text content with metadata",
)
async def extract_document(
    file: UploadFile = File(..., description="Document file to extract (pdf, docx, txt, zip)"),
) -> DocumentResponse:
    """
    Extract text content from uploaded file.

    This endpoint handles file extraction only:
    1. Accepts file upload (PDF, DOCX, TXT, ZIP)
    2. Copies it into a private temporary directory under a sanitized name
    3. Extracts raw text content through the text processor service
    4. Returns the Document entity with metadata (no parsing)

    The temporary directory is removed on every exit path once it has been
    created, whether or not the upload was written successfully.

    Args:
        file: Uploaded file

    Returns:
        Response with extracted document

    Raises:
        HTTPException: If extraction fails (domain errors are mapped through
            _map_domain_exception, anything else becomes a 500)
    """
    # Track the directory itself (not the file inside it) so cleanup does not
    # depend on the upload having been written.
    temp_dir = None
    try:
        # Pull service from bootstrap
        service: ITextProcessor = _get_service()

        # Create temporary directory; keep only the basename of the client
        # supplied name so the write cannot land outside that directory.
        temp_dir = Path(tempfile.mkdtemp())
        temp_file_path = temp_dir / _safe_upload_filename(file.filename)

        # Copy uploaded file to temporary location
        logger.info(f"Extracting uploaded file: {file.filename}")
        with open(temp_file_path, 'wb') as temp_file:
            shutil.copyfileobj(file.file, temp_file)

        # Execute extraction only (no parsing)
        document = service.extract_document(temp_file_path)

        # Convert to response
        document_response = _to_document_response(document)
        logger.info(f"Successfully extracted {file.filename}: {len(document.raw_markdown)} characters")
        return document_response
    except DomainException as e:
        raise _map_domain_exception(e) from e
    except Exception as e:
        # exc_info keeps the traceback in the log; the message is unchanged.
        logger.error(f"Unexpected error extracting file: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
    finally:
        # Clean up temporary directory (best effort: a cleanup failure is
        # logged and must not replace the response or the original error).
        if temp_dir is not None:
            try:
                shutil.rmtree(temp_dir)
                logger.debug(f"Cleaned up temporary directory: {temp_dir}")
            except Exception as e:
                logger.warning(f"Failed to delete temporary directory: {str(e)}")
@router.post(
    "/process-file",
    response_model=ExtractAndChunkResponse,
--- a/src/core/ports/incoming/text_processor.py
+++ b/src/core/ports/incoming/text_processor.py
@ -65,6 +65,28 @@ class ITextProcessor(ABC):
        """
        pass
    @abstractmethod
    def extract_document(self, file_path: Path) -> Document:
        """
        Run the extraction stage only and hand back the resulting Document.

        No parsing or chunking is part of this contract. An implementation:
        1. Reads the raw text content of the file
        2. Wraps it in a Document entity together with its metadata
        3. Returns that Document carrying raw_markdown (no sections)

        Args:
            file_path: Location of the document file to extract

        Returns:
            Document entity holding the raw markdown

        Raises:
            ExtractionError: If text extraction fails
            UnsupportedFileTypeError: If file type is not supported
        """
    @abstractmethod
    def process_text_to_chunks(
        self,
--- a/src/core/services/document_processor_service.py
+++ b/src/core/services/document_processor_service.py
@ -165,9 +165,37 @@ class DocumentProcessorService(ITextProcessor):
            logger.error(f"Failed to extract and chunk: {str(e)}")
            raise
    def extract_document(self, file_path: Path) -> Document:
        """
        Run extraction alone on a file, with no parsing or chunking.

        Delegates to the internal _extract_document helper and logs the
        start, the number of characters extracted, and any failure. Errors
        are logged and then re-raised unchanged.

        Args:
            file_path: Location of the document file to extract

        Returns:
            Document entity holding the raw markdown (no sections)

        Raises:
            ExtractionError: If text extraction fails
            UnsupportedFileTypeError: If file type is not supported
        """
        try:
            logger.info(f"Extracting document: {file_path}")
            extracted = self._extract_document(file_path)
            char_count = len(extracted.raw_markdown)
            logger.info(f"Successfully extracted {char_count} characters")
        except Exception as exc:
            # Log for diagnostics, then let the caller see the original error.
            logger.error(f"Failed to extract document: {exc!s}")
            raise
        return extracted
    def _extract_document(self, file_path: Path) -> Document:
        """
-        Extract Document using appropriate extractor.
+        Internal helper: Extract Document using appropriate extractor.
        Extractors create Document entities with raw_markdown and metadata.
        Sections will be parsed later in the pipeline.