diff --git a/src/adapters/incoming/api_routes.py b/src/adapters/incoming/api_routes.py
index 65c6497..94d5f91 100644
--- a/src/adapters/incoming/api_routes.py
+++ b/src/adapters/incoming/api_routes.py
@@ -160,6 +160,87 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
     )
 
 
+@router.post(
+    "/extract",
+    response_model=DocumentResponse,
+    status_code=status.HTTP_200_OK,
+    summary="Extract document from uploaded file",
+    description="Upload a file and extract text content with metadata",
+)
+async def extract_document(
+    file: UploadFile = File(..., description="Document file to extract (pdf, docx, txt, zip)"),
+) -> DocumentResponse:
+    """
+    Extract text content from uploaded file.
+
+    This endpoint handles file extraction only:
+    1. Accepts file upload (PDF, DOCX, TXT, ZIP)
+    2. Extracts raw text content using appropriate extractor
+    3. Returns Document entity with metadata (no parsing)
+
+    The client-supplied filename is reduced to its final path component
+    before it is joined to the temporary directory, so it can never place
+    the upload (or the cleanup) outside that directory.
+
+    Args:
+        file: Uploaded file
+
+    Returns:
+        Response with extracted document
+
+    Raises:
+        HTTPException: If extraction fails
+    """
+    temp_dir = None
+
+    try:
+        # Pull service from bootstrap
+        service: ITextProcessor = _get_service()
+
+        # The filename is untrusted input: keep only its last component so
+        # "../x" or "/etc/x" cannot escape the temporary directory.
+        original_filename = Path(file.filename or "").name
+        if original_filename in ("", ".", ".."):
+            original_filename = "uploaded_file.tmp"
+
+        # Create temporary directory and file with original filename
+        temp_dir = tempfile.mkdtemp()
+        temp_file_path = Path(temp_dir) / original_filename
+
+        # Copy uploaded file to temporary location
+        logger.info(f"Extracting uploaded file: {file.filename}")
+        with open(temp_file_path, 'wb') as temp_file:
+            shutil.copyfileobj(file.file, temp_file)
+
+        # Execute extraction only (no parsing)
+        document = service.extract_document(temp_file_path)
+
+        # Convert to response
+        document_response = _to_document_response(document)
+
+        logger.info(f"Successfully extracted {file.filename}: {len(document.raw_markdown)} characters")
+
+        return document_response
+
+    except DomainException as e:
+        raise _map_domain_exception(e) from e
+    except Exception as e:
+        logger.exception(f"Unexpected error extracting file: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Internal server error: {str(e)}",
+        ) from e
+    finally:
+        # Clean up the temporary directory itself (not the file's parent),
+        # so it is removed even if the upload was never written to disk.
+        if temp_dir is not None:
+            try:
+                shutil.rmtree(temp_dir)
+                logger.debug(f"Cleaned up temporary directory: {temp_dir}")
+            except Exception as e:
+                logger.warning(f"Failed to delete temporary directory: {str(e)}")
+
+
 @router.post(
     "/process-file",
     response_model=ExtractAndChunkResponse,
diff --git a/src/core/ports/incoming/text_processor.py b/src/core/ports/incoming/text_processor.py
index d08ebe6..eeb1b9e 100644
--- a/src/core/ports/incoming/text_processor.py
+++ b/src/core/ports/incoming/text_processor.py
@@ -65,6 +65,28 @@ class ITextProcessor(ABC):
         """
         pass
 
+    @abstractmethod
+    def extract_document(self, file_path: Path) -> Document:
+        """
+        Extract text content from document without parsing or chunking.
+
+        This method only performs extraction:
+        1. Extracts raw text content from file
+        2. Creates Document entity with metadata
+        3. Returns Document with raw_markdown (no sections)
+
+        Args:
+            file_path: Path to the document file
+
+        Returns:
+            Document entity with raw markdown
+
+        Raises:
+            ExtractionError: If text extraction fails
+            UnsupportedFileTypeError: If file type is not supported
+        """
+        pass
+
     @abstractmethod
     def process_text_to_chunks(
         self,
diff --git a/src/core/services/document_processor_service.py b/src/core/services/document_processor_service.py
index a20b24f..51892ad 100644
--- a/src/core/services/document_processor_service.py
+++ b/src/core/services/document_processor_service.py
@@ -165,9 +165,37 @@ class DocumentProcessorService(ITextProcessor):
             logger.error(f"Failed to extract and chunk: {str(e)}")
             raise
 
+    def extract_document(self, file_path: Path) -> Document:
+        """
+        Extract text content from document without parsing or chunking.
+
+        This method only performs extraction:
+        1. Extracts raw text content from file
+        2. Creates Document entity with metadata
+        3. Returns Document with raw_markdown (no sections)
+
+        Args:
+            file_path: Path to the document file
+
+        Returns:
+            Document entity with raw markdown
+
+        Raises:
+            ExtractionError: If text extraction fails
+            UnsupportedFileTypeError: If file type is not supported
+        """
+        try:
+            logger.info(f"Extracting document: {file_path}")
+            document = self._extract_document(file_path)
+            logger.info(f"Successfully extracted {len(document.raw_markdown)} characters")
+            return document
+        except Exception as e:
+            logger.error(f"Failed to extract document: {str(e)}")
+            raise
+
     def _extract_document(self, file_path: Path) -> Document:
         """
-        Extract Document using appropriate extractor.
+        Internal helper: Extract Document using appropriate extractor.
 
         Extractors create Document entities with raw_markdown and metadata.
         Sections will be parsed later in the pipeline.