add extract endpoint

2026-01-19 16:05:55 +03:30 · 2026-01-19 16:05:55 +03:30 · 2c4a59f84b
commit 2c4a59f84b
parent 0084ae6bc0
3 changed files with 123 additions and 1 deletions
--- a/src/adapters/incoming/api_routes.py
+++ b/src/adapters/incoming/api_routes.py
@ -160,6 +160,78 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
        )


+@router.post(
+    "/extract",
+    response_model=DocumentResponse,
+    status_code=status.HTTP_200_OK,
+    summary="Extract document from uploaded file",
+    description="Upload a file and extract text content with metadata",
+)
+async def extract_document(
+    file: UploadFile = File(..., description="Document file to extract (pdf, docx, txt, zip)"),
+) -> DocumentResponse:
+    """
+    Extract text content from an uploaded file, without parsing.
+
+    The upload is copied into a fresh temporary directory under the final
+    component of its filename, passed to the service's extract_document(),
+    and converted with _to_document_response(); the directory is then removed.
+
+    Args:
+        file: Uploaded file (pdf, docx, txt, zip)
+
+    Returns:
+        Response with extracted document
+
+    Raises:
+        HTTPException: Mapped from DomainException, or 500 on any other error
+    """
+    temp_dir = None
+
+    try:
+        # Pull service from bootstrap
+        service: ITextProcessor = _get_service()
+
+        # The filename is client-controlled: keep only its final component so
+        # an absolute path or "../" cannot escape the temporary directory.
+        safe_name = Path(file.filename or "").name
+        if safe_name in ("", ".", ".."):
+            safe_name = "uploaded_file.tmp"
+        temp_dir = Path(tempfile.mkdtemp())
+        temp_file_path = temp_dir / safe_name
+
+        # Copy uploaded file to temporary location
+        logger.info(f"Extracting uploaded file: {file.filename}")
+        with open(temp_file_path, 'wb') as temp_file:
+            shutil.copyfileobj(file.file, temp_file)
+
+        # Execute extraction only (no parsing)
+        document = service.extract_document(temp_file_path)
+
+        # Convert to response
+        document_response = _to_document_response(document)
+        logger.info(f"Successfully extracted {file.filename}: {len(document.raw_markdown)} characters")
+        return document_response
+
+    except DomainException as e:
+        raise _map_domain_exception(e) from e
+    except Exception as e:
+        logger.error(f"Unexpected error extracting file: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Internal server error: {str(e)}",
+        ) from e
+    finally:
+        # Remove the directory created above (never a path derived from the
+        # filename), even if the upload was never written into it.
+        if temp_dir is not None:
+            try:
+                shutil.rmtree(temp_dir)
+                logger.debug(f"Cleaned up temporary directory: {temp_dir}")
+            except Exception as e:
+                logger.warning(f"Failed to delete temporary directory: {str(e)}")
+
+
@router.post(
    "/process-file",
    response_model=ExtractAndChunkResponse,
--- a/src/core/ports/incoming/text_processor.py
+++ b/src/core/ports/incoming/text_processor.py
@ -65,6 +65,28 @@ class ITextProcessor(ABC):
        """
        pass

+    @abstractmethod
+    def extract_document(self, file_path: Path) -> Document:
+        """
+        Extract text content from a document without parsing or chunking.
+
+        Implementations perform extraction only and are expected to:
+        1. Extract raw text content from the file
+        2. Create a Document entity with metadata
+        3. Return the Document with raw_markdown set (sections not parsed)
+
+        Args:
+            file_path: Path to the document file
+
+        Returns:
+            Document entity with raw markdown
+
+        Raises:
+            ExtractionError: If text extraction fails
+            UnsupportedFileTypeError: If file type is not supported
+        """
+        pass
+
    @abstractmethod
    def process_text_to_chunks(
        self,
--- a/src/core/services/document_processor_service.py
+++ b/src/core/services/document_processor_service.py
@ -165,9 +165,37 @@ class DocumentProcessorService(ITextProcessor):
            logger.error(f"Failed to extract and chunk: {str(e)}")
            raise

+    def extract_document(self, file_path: Path) -> Document:
+        """
+        Run the extraction step alone and return the resulting Document.
+
+        Public wrapper around _extract_document() that adds logging:
+        1. Logs the path, then delegates to _extract_document()
+        2. Logs the length of raw_markdown and returns the Document
+
+        Args:
+            file_path: Path to the document file
+
+        Returns:
+            Document entity produced by _extract_document()
+
+        Raises:
+            ExtractionError, UnsupportedFileTypeError: Per the port contract;
+                any exception is logged and re-raised unchanged
+        """
+        try:
+            logger.info(f"Extracting document: {file_path}")
+            extracted = self._extract_document(file_path)
+            char_count = len(extracted.raw_markdown)
+            logger.info(f"Successfully extracted {char_count} characters")
+            return extracted
+        except Exception as error:
+            logger.error(f"Failed to extract document: {str(error)}")
+            raise
+
    def _extract_document(self, file_path: Path) -> Document:
        """
-        Extract Document using appropriate extractor.
+        Internal helper: Extract Document using appropriate extractor.

        Extractors create Document entities with raw_markdown and metadata.
        Sections will be parsed later in the pipeline.