add text api

m.dabbagh 2026-01-18 19:38:53 +03:30
parent 13b887260f
commit 90c10c79fa
3 changed files with 238 additions and 177 deletions

View File

@@ -5,10 +5,12 @@ This is the incoming adapter that translates HTTP requests into
 domain operations. Routes pull the service directly from bootstrap.
 """
 import logging
+import shutil
+import tempfile
 from pathlib import Path
 from uuid import UUID
 
-from fastapi import APIRouter, FastAPI, HTTPException, status
+from fastapi import APIRouter, FastAPI, File, Form, HTTPException, UploadFile, status
 
 from ...core.domain.exceptions import (
     ChunkingError,
@@ -93,16 +95,20 @@ def _to_document_response(document) -> DocumentResponse:
     """
     from .api_schemas import DocumentMetadataResponse
 
+    # Extract file type from display_name or source_id
+    display_name = document.metadata.display_name
+    file_type = Path(display_name).suffix.lstrip('.') if '.' in display_name else 'unknown'
+
     return DocumentResponse(
         id=str(document.id),
         content=document.content,
         metadata=DocumentMetadataResponse(
-            file_name=document.metadata.file_name,
-            file_type=document.metadata.file_type,
-            file_size_bytes=document.metadata.file_size_bytes,
+            file_name=document.metadata.display_name,
+            file_type=file_type,
+            file_size_bytes=document.metadata.size_bytes,
             created_at=document.metadata.created_at.isoformat(),
             author=document.metadata.author,
-            page_count=document.metadata.page_count,
+            page_count=None,  # Not available in new metadata model
         ),
         is_processed=document.is_processed,
         content_preview=document.get_content_preview(200),
@@ -173,66 +179,35 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
 @router.post(
-    "/process",
-    response_model=ProcessDocumentResponse,
-    status_code=status.HTTP_201_CREATED,
-    summary="Process a document",
-    description="Extract text from document and store it",
-)
-async def process_document(request: ProcessDocumentRequest) -> ProcessDocumentResponse:
-    """
-    Process a document endpoint.
-
-    Args:
-        request: Processing request with file path and strategy
-
-    Returns:
-        Processing response with document details
-
-    Raises:
-        HTTPException: If processing fails
-    """
-    try:
-        # Pull service from bootstrap
-        service: ITextProcessor = _get_service()
-
-        # Convert request to domain models
-        file_path = Path(request.file_path)
-        strategy = _to_domain_strategy(request.chunking_strategy)
-
-        # Execute use case
-        document = service.process_document(file_path, strategy)
-
-        # Convert to response
-        return ProcessDocumentResponse(
-            document=_to_document_response(document)
-        )
-    except DomainException as e:
-        raise _map_domain_exception(e)
-    except Exception as e:
-        logger.error(f"Unexpected error processing document: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Internal server error: {str(e)}",
-        )
-
-@router.post(
-    "/extract-and-chunk",
+    "/process-file",
     response_model=ExtractAndChunkResponse,
     status_code=status.HTTP_200_OK,
-    summary="Extract and chunk document",
-    description="Extract text and split into chunks",
+    summary="Process uploaded file (extraction to chunking)",
+    description="Upload a file, extract text, parse markdown, and return chunks",
 )
-async def extract_and_chunk(
-    request: ExtractAndChunkRequest,
+async def process_file(
+    file: UploadFile = File(..., description="Document file to process (pdf, docx, txt, zip)"),
+    strategy_name: str = Form(..., description="Chunking strategy name", examples=["fixed_size", "paragraph"]),
+    chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
+    overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
+    respect_boundaries: bool = Form(True, description="Respect text boundaries"),
 ) -> ExtractAndChunkResponse:
     """
-    Extract and chunk document endpoint.
+    Complete file processing pipeline: Upload → Extract → Parse → Chunk.
+
+    This endpoint handles the full document processing workflow:
+    1. Accepts file upload (PDF, DOCX, TXT, ZIP)
+    2. Extracts text content using appropriate extractor
+    3. Parses markdown structure into sections
+    4. Chunks content according to strategy
+    5. Returns chunks with metadata
 
     Args:
-        request: Extract and chunk request
+        file: Uploaded file
+        strategy_name: Name of chunking strategy
+        chunk_size: Target chunk size
+        overlap_size: Overlap between chunks
+        respect_boundaries: Whether to respect boundaries
 
     Returns:
         Response with chunks
@@ -240,20 +215,38 @@ async def extract_and_chunk(
     Raises:
         HTTPException: If extraction or chunking fails
     """
+    temp_file_path = None
     try:
         # Pull service from bootstrap
         service: ITextProcessor = _get_service()
 
-        # Convert request to domain models
-        file_path = Path(request.file_path)
-        strategy = _to_domain_strategy(request.chunking_strategy)
-
-        # Execute use case
-        chunks = service.extract_and_chunk(file_path, strategy)
+        # Create temporary file with appropriate suffix
+        suffix = Path(file.filename).suffix if file.filename else ".tmp"
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
+        temp_file_path = Path(temp_file.name)
+
+        # Copy uploaded file to temporary location
+        logger.info(f"Processing uploaded file: {file.filename}")
+        with temp_file:
+            shutil.copyfileobj(file.file, temp_file)
+
+        # Create chunking strategy
+        strategy = ChunkingStrategy(
+            strategy_name=strategy_name,
+            chunk_size=chunk_size,
+            overlap_size=overlap_size,
+            respect_boundaries=respect_boundaries,
+        )
+
+        # Execute complete pipeline: extract → parse → chunk
+        chunks = service.extract_and_chunk(temp_file_path, strategy)
 
         # Convert to response
         chunk_responses = [_to_chunk_response(c) for c in chunks]
+        logger.info(f"Successfully processed {file.filename}: {len(chunks)} chunks created")
+
         return ExtractAndChunkResponse(
             chunks=chunk_responses,
             total_chunks=len(chunk_responses),
@@ -262,149 +255,109 @@ async def extract_and_chunk
     except DomainException as e:
         raise _map_domain_exception(e)
     except Exception as e:
-        logger.error(f"Unexpected error extracting and chunking: {str(e)}")
+        logger.error(f"Unexpected error processing file: {str(e)}")
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail=f"Internal server error: {str(e)}",
         )
+    finally:
+        # Clean up temporary file
+        if temp_file_path and temp_file_path.exists():
+            try:
+                temp_file_path.unlink()
+                logger.debug(f"Cleaned up temporary file: {temp_file_path}")
+            except Exception as e:
+                logger.warning(f"Failed to delete temporary file {temp_file_path}: {str(e)}")
 
-@router.get(
-    "/documents/{document_id}",
-    response_model=DocumentResponse,
+@router.post(
+    "/process-text",
+    response_model=ExtractAndChunkResponse,
     status_code=status.HTTP_200_OK,
-    summary="Get document by ID",
-    description="Retrieve a processed document",
+    summary="Process markdown text (parse and chunk)",
+    description="Accept markdown text, parse structure, and return chunks",
 )
-async def get_document(document_id: str) -> DocumentResponse:
+async def process_text(
+    text: str = Form(..., description="Markdown text to process"),
+    strategy_name: str = Form(..., description="Chunking strategy name", examples=["fixed_size", "paragraph"]),
+    chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
+    overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
+    respect_boundaries: bool = Form(True, description="Respect text boundaries"),
+    title: str = Form("text_input", description="Optional title for the text document"),
+) -> ExtractAndChunkResponse:
     """
-    Get document by ID endpoint.
+    Process raw markdown text: Parse → Chunk.
+
+    This endpoint handles text processing workflow:
+    1. Accepts markdown text as string
+    2. Parses markdown structure into sections
+    3. Persists document to repository
+    4. Chunks content according to strategy
+    5. Returns chunks with metadata
 
     Args:
-        document_id: UUID of the document
+        text: Markdown text content
+        strategy_name: Name of chunking strategy
+        chunk_size: Target chunk size
+        overlap_size: Overlap between chunks
+        respect_boundaries: Whether to respect boundaries
+        title: Optional title for the document
 
     Returns:
-        Document response
+        Response with chunks
 
     Raises:
-        HTTPException: If document not found
+        HTTPException: If parsing or chunking fails
     """
     try:
-        # Pull service from bootstrap
+        # Basic validation at API boundary
+        if not text or not text.strip():
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Text content cannot be empty",
+            )
+
+        # Get service from bootstrap
         service: ITextProcessor = _get_service()
-        doc_uuid = UUID(document_id)
-        document = service.get_document(doc_uuid)
-        return _to_document_response(document)
-    except ValueError:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Invalid document ID format: {document_id}",
-        )
-    except DocumentNotFoundError as e:
-        raise HTTPException(
-            status_code=status.HTTP_404_NOT_FOUND,
-            detail=str(e),
-        )
+
+        # Create chunking strategy
+        strategy = ChunkingStrategy(
+            strategy_name=strategy_name,
+            chunk_size=chunk_size,
+            overlap_size=overlap_size,
+            respect_boundaries=respect_boundaries,
+        )
+
+        # Execute complete workflow through service
+        logger.info(f"Processing text input via service: {len(text)} characters")
+        chunks = service.process_text_to_chunks(
+            text=text,
+            chunking_strategy=strategy,
+            title=title,
+        )
+
+        # Convert to response
+        chunk_responses = [_to_chunk_response(c) for c in chunks]
+
+        logger.info(f"Successfully processed text: {len(chunks)} chunks created")
+        return ExtractAndChunkResponse(
+            chunks=chunk_responses,
+            total_chunks=len(chunk_responses),
+        )
+    except HTTPException:
+        raise
+    except DomainException as e:
+        raise _map_domain_exception(e)
     except Exception as e:
-        logger.error(f"Unexpected error retrieving document: {str(e)}")
+        logger.error(f"Unexpected error processing text: {str(e)}")
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail=f"Internal server error: {str(e)}",
         )
 
-@router.get(
-    "/documents",
-    response_model=DocumentListResponse,
-    status_code=status.HTTP_200_OK,
-    summary="List all documents",
-    description="Retrieve all documents with pagination",
-)
-async def list_documents(limit: int = 100, offset: int = 0) -> DocumentListResponse:
-    """
-    List documents endpoint.
-
-    Args:
-        limit: Maximum number of documents to return
-        offset: Number of documents to skip
-
-    Returns:
-        List of documents with pagination info
-    """
-    try:
-        # Pull service from bootstrap
-        service: ITextProcessor = _get_service()
-        documents = service.list_documents(limit, offset)
-        doc_responses = [_to_document_response(d) for d in documents]
-        return DocumentListResponse(
-            documents=doc_responses,
-            total=len(doc_responses),
-            limit=limit,
-            offset=offset,
-        )
-    except Exception as e:
-        logger.error(f"Unexpected error listing documents: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Internal server error: {str(e)}",
-        )
-
-@router.delete(
-    "/documents/{document_id}",
-    response_model=DeleteDocumentResponse,
-    status_code=status.HTTP_200_OK,
-    summary="Delete document",
-    description="Delete a document by ID",
-)
-async def delete_document(document_id: str) -> DeleteDocumentResponse:
-    """
-    Delete document endpoint.
-
-    Args:
-        document_id: UUID of the document
-
-    Returns:
-        Deletion response
-
-    Raises:
-        HTTPException: If document not found or deletion fails
-    """
-    try:
-        # Pull service from bootstrap
-        service: ITextProcessor = _get_service()
-        doc_uuid = UUID(document_id)
-        success = service.delete_document(doc_uuid)
-        return DeleteDocumentResponse(
-            success=success,
-            message=f"Document {document_id} deleted successfully",
-            document_id=document_id,
-        )
-    except ValueError:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Invalid document ID format: {document_id}",
-        )
-    except DocumentNotFoundError as e:
-        raise HTTPException(
-            status_code=status.HTTP_404_NOT_FOUND,
-            detail=str(e),
-        )
-    except Exception as e:
-        logger.error(f"Unexpected error deleting document: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Internal server error: {str(e)}",
-        )
 
 @router.get(
     "/health",

View File

@@ -112,3 +112,34 @@ class ITextProcessor(ABC):
             RepositoryError: If deletion fails
         """
         pass
+
+    @abstractmethod
+    def process_text_to_chunks(
+        self,
+        text: str,
+        chunking_strategy: ChunkingStrategy,
+        title: str = "text_input",
+    ) -> List[Chunk]:
+        """
+        Process raw markdown text into chunks.
+
+        This method handles the complete text processing workflow:
+        1. Parse markdown into structured sections
+        2. Create Document entity with metadata
+        3. Persist document to repository
+        4. Chunk the document according to strategy
+
+        Args:
+            text: Markdown text content to process
+            chunking_strategy: Strategy configuration for chunking
+            title: Optional title/identifier for the text input
+
+        Returns:
+            List of Chunk entities
+
+        Raises:
+            ValidationError: If text is empty or invalid
+            ChunkingError: If chunking fails
+            ProcessingError: If document processing fails
+        """
+        pass
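
Callers other than the HTTP adapter can use the new contract directly. A minimal sketch, assuming bootstrap exposes an accessor comparable to the routes' _get_service(); the import paths below are hypothetical, since this diff does not show the package layout.

# Sketch of calling the new interface method outside HTTP.
from app.bootstrap import get_service  # hypothetical import path
from app.core.domain.models import ChunkingStrategy  # hypothetical import path

service = get_service()  # returns an ITextProcessor implementation
strategy = ChunkingStrategy(
    strategy_name="fixed_size",
    chunk_size=800,
    overlap_size=80,
    respect_boundaries=True,
)
chunks = service.process_text_to_chunks(
    text="# Heading\n\nBody text to be chunked.",
    chunking_strategy=strategy,
    title="example",
)
print(f"{len(chunks)} chunks")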

View File

@@ -242,6 +242,83 @@ class DocumentProcessorService(ITextProcessor):
         extractor = self._extractor_factory.create_extractor(file_path)
         return extractor.extract(file_path)
 
+    def process_text_to_chunks(
+        self,
+        text: str,
+        chunking_strategy: ChunkingStrategy,
+        title: str = "text_input",
+    ) -> List[Chunk]:
+        """
+        Process raw markdown text into chunks.
+
+        This method handles the complete text processing workflow:
+        1. Parse markdown into structured sections
+        2. Create Document entity with metadata
+        3. Persist document to repository
+        4. Chunk the document according to strategy
+
+        Args:
+            text: Markdown text content to process
+            chunking_strategy: Strategy configuration for chunking
+            title: Optional title/identifier for the text input
+
+        Returns:
+            List of Chunk entities
+
+        Raises:
+            ValidationError: If text is empty or invalid
+            ChunkingError: If chunking fails
+            ProcessingError: If document processing fails
+        """
+        try:
+            logger.info(f"Processing text input: {len(text)} characters")
+
+            # Validate text content
+            if not text or not text.strip():
+                from ..domain.exceptions import ValidationError
+                raise ValidationError(
+                    message="Text content cannot be empty",
+                    field_name="text",
+                )
+
+            # Step 1: Parse markdown into sections
+            sections = parse_markdown(text)
+            logger.debug(f"Parsed {len(sections)} sections from text")
+
+            # Step 2: Create metadata for text input
+            from ..domain.models import DocumentMetadata, SourceType
+            metadata = DocumentMetadata(
+                source_id="text_input",
+                source_type=SourceType.WEB,  # Using WEB type for text input
+                display_name=f"{title}.md",
+                size_bytes=len(text.encode('utf-8')),
+            )
+
+            # Step 3: Create Document entity
+            document = Document(
+                raw_markdown=text,
+                sections=sections,
+                metadata=metadata,
+            )
+
+            # Validate document content
+            document.validate_content()
+
+            # Step 4: Persist document to repository
+            saved_document = self._repository.save(document)
+            logger.info(f"Text document saved with ID: {saved_document.id}")
+
+            # Step 5: Chunk the document
+            chunks = self._chunk_document(saved_document, chunking_strategy)
+
+            logger.info(f"Successfully processed text: {len(chunks)} chunks created")
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Failed to process text: {str(e)}")
+            raise
+
     def _chunk_document(
         self,
         document: Document,
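
The empty-text guard is the easiest part of this method to pin down in a test. A minimal pytest sketch; how DocumentProcessorService is constructed (repository, extractor factory, chunkers) is project wiring this diff does not show, so the construction helper below is an assumption.

# Hypothetical test sketch: whitespace-only input must raise ValidationError
# before any parsing or persistence happens.
import pytest

from app.core.domain.exceptions import ValidationError  # hypothetical import path
from app.core.domain.models import ChunkingStrategy  # hypothetical import path

def test_process_text_to_chunks_rejects_blank_text():
    service = make_service()  # hypothetical helper returning a wired DocumentProcessorService
    strategy = ChunkingStrategy(
        strategy_name="fixed_size",
        chunk_size=100,
        overlap_size=0,
        respect_boundaries=True,
    )
    with pytest.raises(ValidationError):
        service.process_text_to_chunks(text="   \n", chunking_strategy=strategy)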