From 90c10c79fa2ba5e48f4c8255164c585bdff5affb Mon Sep 17 00:00:00 2001
From: "m.dabbagh"
Date: Sun, 18 Jan 2026 19:38:53 +0330
Subject: [PATCH] add text api

---
 src/adapters/incoming/api_routes.py         | 307 ++++++++----------
 src/core/ports/incoming/text_processor.py   |  31 ++
 .../services/document_processor_service.py  |  77 +++++
 3 files changed, 238 insertions(+), 177 deletions(-)

diff --git a/src/adapters/incoming/api_routes.py b/src/adapters/incoming/api_routes.py
index f61333c..57639f5 100644
--- a/src/adapters/incoming/api_routes.py
+++ b/src/adapters/incoming/api_routes.py
@@ -5,10 +5,12 @@
 This is the incoming adapter that translates HTTP requests
 into domain operations. Routes pull the service directly from bootstrap.
 """
 import logging
+import shutil
+import tempfile
 from pathlib import Path
 from uuid import UUID
 
-from fastapi import APIRouter, FastAPI, HTTPException, status
+from fastapi import APIRouter, FastAPI, File, Form, HTTPException, UploadFile, status
 
 from ...core.domain.exceptions import (
     ChunkingError,
@@ -93,16 +95,20 @@ def _to_document_response(document) -> DocumentResponse:
     """
     from .api_schemas import DocumentMetadataResponse
 
+    # Extract file type from display_name or source_id
+    display_name = document.metadata.display_name
+    file_type = Path(display_name).suffix.lstrip('.') if '.' in display_name else 'unknown'
+
     return DocumentResponse(
         id=str(document.id),
         content=document.content,
         metadata=DocumentMetadataResponse(
-            file_name=document.metadata.file_name,
-            file_type=document.metadata.file_type,
-            file_size_bytes=document.metadata.file_size_bytes,
+            file_name=document.metadata.display_name,
+            file_type=file_type,
+            file_size_bytes=document.metadata.size_bytes,
             created_at=document.metadata.created_at.isoformat(),
             author=document.metadata.author,
-            page_count=document.metadata.page_count,
+            page_count=None,  # Not available in the new metadata model
         ),
         is_processed=document.is_processed,
         content_preview=document.get_content_preview(200),
@@ -173,66 +179,35 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
 
 
 @router.post(
-    "/process",
-    response_model=ProcessDocumentResponse,
-    status_code=status.HTTP_201_CREATED,
-    summary="Process a document",
-    description="Extract text from document and store it",
-)
-async def process_document(request: ProcessDocumentRequest) -> ProcessDocumentResponse:
-    """
-    Process a document endpoint.
-
-    Args:
-        request: Processing request with file path and strategy
-
-    Returns:
-        Processing response with document details
-
-    Raises:
-        HTTPException: If processing fails
-    """
-    try:
-        # Pull service from bootstrap
-        service: ITextProcessor = _get_service()
-
-        # Convert request to domain models
-        file_path = Path(request.file_path)
-        strategy = _to_domain_strategy(request.chunking_strategy)
-
-        # Execute use case
-        document = service.process_document(file_path, strategy)
-
-        # Convert to response
-        return ProcessDocumentResponse(
-            document=_to_document_response(document)
-        )
-
-    except DomainException as e:
-        raise _map_domain_exception(e)
-    except Exception as e:
-        logger.error(f"Unexpected error processing document: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Internal server error: {str(e)}",
-        )
-
-
-@router.post(
-    "/extract-and-chunk",
+    "/process-file",
     response_model=ExtractAndChunkResponse,
     status_code=status.HTTP_200_OK,
-    summary="Extract and chunk document",
-    description="Extract text and split into chunks",
+    summary="Process uploaded file (extraction to chunking)",
+    description="Upload a file, extract text, parse markdown, and return chunks",
 )
-async def extract_and_chunk(
-    request: ExtractAndChunkRequest,
+async def process_file(
+    file: UploadFile = File(..., description="Document file to process (pdf, docx, txt, zip)"),
+    strategy_name: str = Form(..., description="Chunking strategy name", examples=["fixed_size", "paragraph"]),
+    chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
+    overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
+    respect_boundaries: bool = Form(True, description="Respect text boundaries"),
 ) -> ExtractAndChunkResponse:
     """
-    Extract and chunk document endpoint.
+    Complete file processing pipeline: Upload → Extract → Parse → Chunk.
+
+    This endpoint handles the full document processing workflow:
+    1. Accepts file upload (PDF, DOCX, TXT, ZIP)
+    2. Extracts text content using the appropriate extractor
+    3. Parses markdown structure into sections
+    4. Chunks content according to strategy
+    5. Returns chunks with metadata
 
     Args:
-        request: Extract and chunk request
+        file: Uploaded file
+        strategy_name: Name of chunking strategy
+        chunk_size: Target chunk size
+        overlap_size: Overlap between chunks
+        respect_boundaries: Whether to respect boundaries
 
     Returns:
         Response with chunks
@@ -240,20 +215,38 @@ async def extract_and_chunk(
     Raises:
         HTTPException: If extraction or chunking fails
     """
+    temp_file_path = None
+
     try:
         # Pull service from bootstrap
         service: ITextProcessor = _get_service()
 
-        # Convert request to domain models
-        file_path = Path(request.file_path)
-        strategy = _to_domain_strategy(request.chunking_strategy)
+        # Create temporary file with appropriate suffix
+        suffix = Path(file.filename).suffix if file.filename else ".tmp"
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
+        temp_file_path = Path(temp_file.name)
 
-        # Execute use case
-        chunks = service.extract_and_chunk(file_path, strategy)
+        # Copy uploaded file to temporary location
+        logger.info(f"Processing uploaded file: {file.filename}")
+        with temp_file:
+            shutil.copyfileobj(file.file, temp_file)
+
+        # Create chunking strategy
+        strategy = ChunkingStrategy(
+            strategy_name=strategy_name,
+            chunk_size=chunk_size,
+            overlap_size=overlap_size,
+            respect_boundaries=respect_boundaries,
+        )
+
+        # Execute complete pipeline: extract → parse → chunk
+        chunks = service.extract_and_chunk(temp_file_path, strategy)
 
         # Convert to response
         chunk_responses = [_to_chunk_response(c) for c in chunks]
 
+        logger.info(f"Successfully processed {file.filename}: {len(chunks)} chunks created")
+
         return ExtractAndChunkResponse(
             chunks=chunk_responses,
             total_chunks=len(chunk_responses),
@@ -262,149 +255,109 @@ async def extract_and_chunk(
     except DomainException as e:
         raise _map_domain_exception(e)
     except Exception as e:
-        logger.error(f"Unexpected error extracting and chunking: {str(e)}")
+        logger.error(f"Unexpected error processing file: {str(e)}")
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail=f"Internal server error: {str(e)}",
         )
+    finally:
+        # Clean up temporary file
+        if temp_file_path and temp_file_path.exists():
+            try:
+                temp_file_path.unlink()
+                logger.debug(f"Cleaned up temporary file: {temp_file_path}")
+            except Exception as e:
+                logger.warning(f"Failed to delete temporary file {temp_file_path}: {str(e)}")
 
 
-@router.get(
-    "/documents/{document_id}",
-    response_model=DocumentResponse,
+@router.post(
+    "/process-text",
+    response_model=ExtractAndChunkResponse,
     status_code=status.HTTP_200_OK,
-    summary="Get document by ID",
-    description="Retrieve a processed document",
+    summary="Process markdown text (parse and chunk)",
+    description="Accept markdown text, parse structure, and return chunks",
 )
-async def get_document(document_id: str) -> DocumentResponse:
+async def process_text(
+    text: str = Form(..., description="Markdown text to process"),
+    strategy_name: str = Form(..., description="Chunking strategy name", examples=["fixed_size", "paragraph"]),
+    chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
+    overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
+    respect_boundaries: bool = Form(True, description="Respect text boundaries"),
+    title: str = Form("text_input", description="Optional title for the text document"),
+) -> ExtractAndChunkResponse:
     """
-    Get document by ID endpoint.
+    Process raw markdown text: Parse → Chunk.
+
+    This endpoint handles the text processing workflow:
+    1. Accepts markdown text as a string
+    2. Parses markdown structure into sections
+    3. Persists document to repository
+    4. Chunks content according to strategy
+    5. Returns chunks with metadata
 
     Args:
-        document_id: UUID of the document
+        text: Markdown text content
+        strategy_name: Name of chunking strategy
+        chunk_size: Target chunk size
+        overlap_size: Overlap between chunks
+        respect_boundaries: Whether to respect boundaries
+        title: Optional title for the document
 
     Returns:
-        Document response
+        Response with chunks
 
     Raises:
-        HTTPException: If document not found
+        HTTPException: If parsing or chunking fails
     """
     try:
-        # Pull service from bootstrap
+        # Basic validation at API boundary
+        if not text or not text.strip():
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Text content cannot be empty",
+            )
+
+        # Get service from bootstrap
         service: ITextProcessor = _get_service()
 
-        doc_uuid = UUID(document_id)
-        document = service.get_document(doc_uuid)
-        return _to_document_response(document)
+        # Create chunking strategy
+        strategy = ChunkingStrategy(
+            strategy_name=strategy_name,
+            chunk_size=chunk_size,
+            overlap_size=overlap_size,
+            respect_boundaries=respect_boundaries,
+        )
 
-    except ValueError:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Invalid document ID format: {document_id}",
+        # Execute complete workflow through service
+        logger.info(f"Processing text input via service: {len(text)} characters")
+        chunks = service.process_text_to_chunks(
+            text=text,
+            chunking_strategy=strategy,
+            title=title,
         )
-    except DocumentNotFoundError as e:
-        raise HTTPException(
-            status_code=status.HTTP_404_NOT_FOUND,
-            detail=str(e),
+
+        # Convert to response
+        chunk_responses = [_to_chunk_response(c) for c in chunks]
+
+        logger.info(f"Successfully processed text: {len(chunks)} chunks created")
+
+        return ExtractAndChunkResponse(
+            chunks=chunk_responses,
+            total_chunks=len(chunk_responses),
         )
+
+    except HTTPException:
+        raise
+    except DomainException as e:
+        raise _map_domain_exception(e)
     except Exception as e:
-        logger.error(f"Unexpected error retrieving document: {str(e)}")
+        logger.error(f"Unexpected error processing text: {str(e)}")
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail=f"Internal server error: {str(e)}",
         )
 
 
-@router.get(
-    "/documents",
-    response_model=DocumentListResponse,
-    status_code=status.HTTP_200_OK,
-    summary="List all documents",
-    description="Retrieve all documents with pagination",
-)
-async def list_documents(limit: int = 100, offset: int = 0) -> DocumentListResponse:
-    """
-    List documents endpoint.
-
-    Args:
-        limit: Maximum number of documents to return
-        offset: Number of documents to skip
-
-    Returns:
-        List of documents with pagination info
-    """
-    try:
-        # Pull service from bootstrap
-        service: ITextProcessor = _get_service()
-
-        documents = service.list_documents(limit, offset)
-        doc_responses = [_to_document_response(d) for d in documents]
-
-        return DocumentListResponse(
-            documents=doc_responses,
-            total=len(doc_responses),
-            limit=limit,
-            offset=offset,
-        )
-
-    except Exception as e:
-        logger.error(f"Unexpected error listing documents: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Internal server error: {str(e)}",
-        )
-
-
-@router.delete(
-    "/documents/{document_id}",
-    response_model=DeleteDocumentResponse,
-    status_code=status.HTTP_200_OK,
-    summary="Delete document",
-    description="Delete a document by ID",
-)
-async def delete_document(document_id: str) -> DeleteDocumentResponse:
-    """
-    Delete document endpoint.
-
-    Args:
-        document_id: UUID of the document
-
-    Returns:
-        Deletion response
-
-    Raises:
-        HTTPException: If document not found or deletion fails
-    """
-    try:
-        # Pull service from bootstrap
-        service: ITextProcessor = _get_service()
-
-        doc_uuid = UUID(document_id)
-        success = service.delete_document(doc_uuid)
-
-        return DeleteDocumentResponse(
-            success=success,
-            message=f"Document {document_id} deleted successfully",
-            document_id=document_id,
-        )
-
-    except ValueError:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Invalid document ID format: {document_id}",
-        )
-    except DocumentNotFoundError as e:
-        raise HTTPException(
-            status_code=status.HTTP_404_NOT_FOUND,
-            detail=str(e),
-        )
-    except Exception as e:
-        logger.error(f"Unexpected error deleting document: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Internal server error: {str(e)}",
-        )
-
 
 @router.get(
     "/health",
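For reference, a minimal Python client sketch against the two endpoints added above. It is illustrative only: the base URL (including any router prefix), the sample file name, and the parameter values are assumptions rather than part of this patch, and it relies on the third-party requests package.

    # Hypothetical client sketch -- base URL, router prefix, and file name
    # are assumptions, not part of this patch. Requires: pip install requests
    import requests

    BASE = "http://localhost:8000"  # assumed host/port and route prefix

    # POST /process-file: upload -> extract -> parse -> chunk
    with open("report.pdf", "rb") as f:  # hypothetical sample file
        resp = requests.post(
            f"{BASE}/process-file",
            files={"file": ("report.pdf", f, "application/pdf")},
            data={
                "strategy_name": "fixed_size",
                "chunk_size": 1000,
                "overlap_size": 100,
                "respect_boundaries": "true",
            },
        )
    resp.raise_for_status()
    print(resp.json()["total_chunks"])

    # POST /process-text: parse -> persist -> chunk
    resp = requests.post(
        f"{BASE}/process-text",
        data={
            "text": "# Title\n\nSome markdown body.",
            "strategy_name": "paragraph",
            "chunk_size": 500,
            "title": "example_doc",
        },
    )
    resp.raise_for_status()
    print(resp.json()["total_chunks"])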
diff --git a/src/core/ports/incoming/text_processor.py b/src/core/ports/incoming/text_processor.py
index ff2b427..b47e523 100644
--- a/src/core/ports/incoming/text_processor.py
+++ b/src/core/ports/incoming/text_processor.py
@@ -112,3 +112,34 @@ class ITextProcessor(ABC):
             RepositoryError: If deletion fails
         """
         pass
+
+    @abstractmethod
+    def process_text_to_chunks(
+        self,
+        text: str,
+        chunking_strategy: ChunkingStrategy,
+        title: str = "text_input",
+    ) -> List[Chunk]:
+        """
+        Process raw markdown text into chunks.
+
+        This method handles the complete text processing workflow:
+        1. Parse markdown into structured sections
+        2. Create Document entity with metadata
+        3. Persist document to repository
+        4. Chunk the document according to strategy
+
+        Args:
+            text: Markdown text content to process
+            chunking_strategy: Strategy configuration for chunking
+            title: Optional title/identifier for the text input
+
+        Returns:
+            List of Chunk entities
+
+        Raises:
+            ValidationError: If text is empty or invalid
+            ChunkingError: If chunking fails
+            ProcessingError: If document processing fails
+        """
+        pass
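The port method added above can also be exercised directly, without the HTTP layer. A minimal sketch, assuming a bootstrap accessor and import paths that this patch does not show:

    # Hypothetical direct use of ITextProcessor.process_text_to_chunks.
    # get_service() and both import paths are assumed, not shown in this patch.
    from src.bootstrap import get_service
    from src.core.domain.models import ChunkingStrategy

    strategy = ChunkingStrategy(
        strategy_name="fixed_size",
        chunk_size=500,
        overlap_size=50,
        respect_boundaries=True,
    )

    service = get_service()  # returns an ITextProcessor implementation
    chunks = service.process_text_to_chunks(
        text="# Notes\n\nFirst paragraph.\n\nSecond paragraph.",
        chunking_strategy=strategy,
        title="notes",
    )
    print(f"created {len(chunks)} chunks")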
diff --git a/src/core/services/document_processor_service.py b/src/core/services/document_processor_service.py
index 1c60860..e249def 100644
--- a/src/core/services/document_processor_service.py
+++ b/src/core/services/document_processor_service.py
@@ -242,6 +242,83 @@ class DocumentProcessorService(ITextProcessor):
         extractor = self._extractor_factory.create_extractor(file_path)
         return extractor.extract(file_path)
 
+    def process_text_to_chunks(
+        self,
+        text: str,
+        chunking_strategy: ChunkingStrategy,
+        title: str = "text_input",
+    ) -> List[Chunk]:
+        """
+        Process raw markdown text into chunks.
+
+        This method handles the complete text processing workflow:
+        1. Parse markdown into structured sections
+        2. Create Document entity with metadata
+        3. Persist document to repository
+        4. Chunk the document according to strategy
+
+        Args:
+            text: Markdown text content to process
+            chunking_strategy: Strategy configuration for chunking
+            title: Optional title/identifier for the text input
+
+        Returns:
+            List of Chunk entities
+
+        Raises:
+            ValidationError: If text is empty or invalid
+            ChunkingError: If chunking fails
+            ProcessingError: If document processing fails
+        """
+        try:
+            logger.info(f"Processing text input: {len(text)} characters")
+
+            # Validate text content
+            if not text or not text.strip():
+                from ..domain.exceptions import ValidationError
+                raise ValidationError(
+                    message="Text content cannot be empty",
+                    field_name="text",
+                )
+
+            # Step 1: Parse markdown into sections
+            sections = parse_markdown(text)
+            logger.debug(f"Parsed {len(sections)} sections from text")
+
+            # Step 2: Create metadata for text input
+            from ..domain.models import DocumentMetadata, SourceType
+
+            metadata = DocumentMetadata(
+                source_id="text_input",
+                source_type=SourceType.WEB,  # Using WEB type for text input
+                display_name=f"{title}.md",
+                size_bytes=len(text.encode('utf-8')),
+            )
+
+            # Step 3: Create Document entity
+            document = Document(
+                raw_markdown=text,
+                sections=sections,
+                metadata=metadata,
+            )
+
+            # Validate document content
+            document.validate_content()
+
+            # Step 4: Persist document to repository
+            saved_document = self._repository.save(document)
+            logger.info(f"Text document saved with ID: {saved_document.id}")
+
+            # Step 5: Chunk the document
+            chunks = self._chunk_document(saved_document, chunking_strategy)
+
+            logger.info(f"Successfully processed text: {len(chunks)} chunks created")
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Failed to process text: {str(e)}")
+            raise
+
     def _chunk_document(
         self,
         document: Document,
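A note on the service implementation above: blank input is rejected with ValidationError before any parsing or persistence takes place. A test sketch of that contract follows (the pytest fixture wiring and import paths are assumptions, not part of this patch):

    # Hypothetical test sketch -- the 'service' fixture and import paths
    # are assumed, not part of this patch.
    import pytest

    from src.core.domain.exceptions import ValidationError
    from src.core.domain.models import ChunkingStrategy

    def test_blank_text_is_rejected(service):
        strategy = ChunkingStrategy(
            strategy_name="fixed_size",
            chunk_size=500,
            overlap_size=0,
            respect_boundaries=True,
        )
        with pytest.raises(ValidationError):
            service.process_text_to_chunks(text="   ", chunking_strategy=strategy)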