add text api
This commit is contained in:
parent
13b887260f
commit
90c10c79fa
@ -5,10 +5,12 @@ This is the incoming adapter that translates HTTP requests into
|
||||
domain operations. Routes pull the service directly from bootstrap.
|
||||
"""
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import APIRouter, FastAPI, HTTPException, status
|
||||
from fastapi import APIRouter, FastAPI, File, Form, HTTPException, UploadFile, status
|
||||
|
||||
from ...core.domain.exceptions import (
|
||||
ChunkingError,
|
||||
@ -93,16 +95,20 @@ def _to_document_response(document) -> DocumentResponse:
|
||||
"""
|
||||
from .api_schemas import DocumentMetadataResponse
|
||||
|
||||
# Extract file type from display_name or source_id
|
||||
display_name = document.metadata.display_name
|
||||
file_type = Path(display_name).suffix.lstrip('.') if '.' in display_name else 'unknown'
|
||||
|
||||
return DocumentResponse(
|
||||
id=str(document.id),
|
||||
content=document.content,
|
||||
metadata=DocumentMetadataResponse(
|
||||
file_name=document.metadata.file_name,
|
||||
file_type=document.metadata.file_type,
|
||||
file_size_bytes=document.metadata.file_size_bytes,
|
||||
file_name=document.metadata.display_name,
|
||||
file_type=file_type,
|
||||
file_size_bytes=document.metadata.size_bytes,
|
||||
created_at=document.metadata.created_at.isoformat(),
|
||||
author=document.metadata.author,
|
||||
page_count=document.metadata.page_count,
|
||||
page_count=None, # Not available in new metadata model
|
||||
),
|
||||
is_processed=document.is_processed,
|
||||
content_preview=document.get_content_preview(200),
|
||||
@ -173,66 +179,35 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
|
||||
|
||||
|
||||
@router.post(
    "/process",
    response_model=ProcessDocumentResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Process a document",
    description="Extract text from document and store it",
)
async def process_document(request: ProcessDocumentRequest) -> ProcessDocumentResponse:
    """
    Process a document endpoint.

    Resolves the service via ``_get_service()``, converts the request's
    ``file_path`` and ``chunking_strategy`` into domain values, calls
    ``service.process_document`` and wraps the result in the response model.

    Args:
        request: Processing request with file path and strategy

    Returns:
        Processing response with document details

    Raises:
        HTTPException: The mapped error for a ``DomainException``, or a
            500 response for any other failure
    """
    try:
        # Pull service from bootstrap
        service: ITextProcessor = _get_service()

        # Convert request to domain models
        file_path = Path(request.file_path)
        strategy = _to_domain_strategy(request.chunking_strategy)

        # Execute use case
        document = service.process_document(file_path, strategy)

        # Convert to response
        return ProcessDocumentResponse(
            document=_to_document_response(document)
        )

    except DomainException as e:
        # Chain the cause so the originating domain error stays in the traceback.
        raise _map_domain_exception(e) from e
    except Exception as e:
        # logger.exception keeps the traceback that logger.error discarded;
        # the emitted message text is unchanged.
        logger.exception("Unexpected error processing document: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
|
||||
|
||||
|
||||
@router.post(
|
||||
"/extract-and-chunk",
|
||||
"/process-file",
|
||||
response_model=ExtractAndChunkResponse,
|
||||
status_code=status.HTTP_200_OK,
|
||||
summary="Extract and chunk document",
|
||||
description="Extract text and split into chunks",
|
||||
summary="Process uploaded file (extraction to chunking)",
|
||||
description="Upload a file, extract text, parse markdown, and return chunks",
|
||||
)
|
||||
async def extract_and_chunk(
|
||||
request: ExtractAndChunkRequest,
|
||||
async def process_file(
|
||||
file: UploadFile = File(..., description="Document file to process (pdf, docx, txt, zip)"),
|
||||
strategy_name: str = Form(..., description="Chunking strategy name", examples=["fixed_size", "paragraph"]),
|
||||
chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
|
||||
overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
|
||||
respect_boundaries: bool = Form(True, description="Respect text boundaries"),
|
||||
) -> ExtractAndChunkResponse:
|
||||
"""
|
||||
Extract and chunk document endpoint.
|
||||
Complete file processing pipeline: Upload → Extract → Parse → Chunk.
|
||||
|
||||
This endpoint handles the full document processing workflow:
|
||||
1. Accepts file upload (PDF, DOCX, TXT, ZIP)
|
||||
2. Extracts text content using appropriate extractor
|
||||
3. Parses markdown structure into sections
|
||||
4. Chunks content according to strategy
|
||||
5. Returns chunks with metadata
|
||||
|
||||
Args:
|
||||
request: Extract and chunk request
|
||||
file: Uploaded file
|
||||
strategy_name: Name of chunking strategy
|
||||
chunk_size: Target chunk size
|
||||
overlap_size: Overlap between chunks
|
||||
respect_boundaries: Whether to respect boundaries
|
||||
|
||||
Returns:
|
||||
Response with chunks
|
||||
@ -240,20 +215,38 @@ async def extract_and_chunk(
|
||||
Raises:
|
||||
HTTPException: If extraction or chunking fails
|
||||
"""
|
||||
temp_file_path = None
|
||||
|
||||
try:
|
||||
# Pull service from bootstrap
|
||||
service: ITextProcessor = _get_service()
|
||||
|
||||
# Convert request to domain models
|
||||
file_path = Path(request.file_path)
|
||||
strategy = _to_domain_strategy(request.chunking_strategy)
|
||||
# Create temporary file with appropriate suffix
|
||||
suffix = Path(file.filename).suffix if file.filename else ".tmp"
|
||||
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
|
||||
temp_file_path = Path(temp_file.name)
|
||||
|
||||
# Execute use case
|
||||
chunks = service.extract_and_chunk(file_path, strategy)
|
||||
# Copy uploaded file to temporary location
|
||||
logger.info(f"Processing uploaded file: {file.filename}")
|
||||
with temp_file:
|
||||
shutil.copyfileobj(file.file, temp_file)
|
||||
|
||||
# Create chunking strategy
|
||||
strategy = ChunkingStrategy(
|
||||
strategy_name=strategy_name,
|
||||
chunk_size=chunk_size,
|
||||
overlap_size=overlap_size,
|
||||
respect_boundaries=respect_boundaries,
|
||||
)
|
||||
|
||||
# Execute complete pipeline: extract → parse → chunk
|
||||
chunks = service.extract_and_chunk(temp_file_path, strategy)
|
||||
|
||||
# Convert to response
|
||||
chunk_responses = [_to_chunk_response(c) for c in chunks]
|
||||
|
||||
logger.info(f"Successfully processed {file.filename}: {len(chunks)} chunks created")
|
||||
|
||||
return ExtractAndChunkResponse(
|
||||
chunks=chunk_responses,
|
||||
total_chunks=len(chunk_responses),
|
||||
@ -262,149 +255,109 @@ async def extract_and_chunk(
|
||||
except DomainException as e:
|
||||
raise _map_domain_exception(e)
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error extracting and chunking: {str(e)}")
|
||||
logger.error(f"Unexpected error processing file: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Internal server error: {str(e)}",
|
||||
)
|
||||
finally:
|
||||
# Clean up temporary file
|
||||
if temp_file_path and temp_file_path.exists():
|
||||
try:
|
||||
temp_file_path.unlink()
|
||||
logger.debug(f"Cleaned up temporary file: {temp_file_path}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to delete temporary file {temp_file_path}: {str(e)}")
|
||||
|
||||
|
||||
@router.get(
|
||||
"/documents/{document_id}",
|
||||
response_model=DocumentResponse,
|
||||
@router.post(
|
||||
"/process-text",
|
||||
response_model=ExtractAndChunkResponse,
|
||||
status_code=status.HTTP_200_OK,
|
||||
summary="Get document by ID",
|
||||
description="Retrieve a processed document",
|
||||
summary="Process markdown text (parse and chunk)",
|
||||
description="Accept markdown text, parse structure, and return chunks",
|
||||
)
|
||||
async def get_document(document_id: str) -> DocumentResponse:
|
||||
async def process_text(
|
||||
text: str = Form(..., description="Markdown text to process"),
|
||||
strategy_name: str = Form(..., description="Chunking strategy name", examples=["fixed_size", "paragraph"]),
|
||||
chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
|
||||
overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
|
||||
respect_boundaries: bool = Form(True, description="Respect text boundaries"),
|
||||
title: str = Form("text_input", description="Optional title for the text document"),
|
||||
) -> ExtractAndChunkResponse:
|
||||
"""
|
||||
Get document by ID endpoint.
|
||||
Process raw markdown text: Parse → Chunk.
|
||||
|
||||
This endpoint handles text processing workflow:
|
||||
1. Accepts markdown text as string
|
||||
2. Parses markdown structure into sections
|
||||
3. Persists document to repository
|
||||
4. Chunks content according to strategy
|
||||
5. Returns chunks with metadata
|
||||
|
||||
Args:
|
||||
document_id: UUID of the document
|
||||
text: Markdown text content
|
||||
strategy_name: Name of chunking strategy
|
||||
chunk_size: Target chunk size
|
||||
overlap_size: Overlap between chunks
|
||||
respect_boundaries: Whether to respect boundaries
|
||||
title: Optional title for the document
|
||||
|
||||
Returns:
|
||||
Document response
|
||||
Response with chunks
|
||||
|
||||
Raises:
|
||||
HTTPException: If document not found
|
||||
HTTPException: If parsing or chunking fails
|
||||
"""
|
||||
try:
|
||||
# Pull service from bootstrap
|
||||
service: ITextProcessor = _get_service()
|
||||
|
||||
doc_uuid = UUID(document_id)
|
||||
document = service.get_document(doc_uuid)
|
||||
return _to_document_response(document)
|
||||
|
||||
except ValueError:
|
||||
# Basic validation at API boundary
|
||||
if not text or not text.strip():
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=f"Invalid document ID format: {document_id}",
|
||||
)
|
||||
except DocumentNotFoundError as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_404_NOT_FOUND,
|
||||
detail=str(e),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error retrieving document: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Internal server error: {str(e)}",
|
||||
detail="Text content cannot be empty",
|
||||
)
|
||||
|
||||
|
||||
@router.get(
|
||||
"/documents",
|
||||
response_model=DocumentListResponse,
|
||||
status_code=status.HTTP_200_OK,
|
||||
summary="List all documents",
|
||||
description="Retrieve all documents with pagination",
|
||||
)
|
||||
async def list_documents(limit: int = 100, offset: int = 0) -> DocumentListResponse:
|
||||
"""
|
||||
List documents endpoint.
|
||||
|
||||
Args:
|
||||
limit: Maximum number of documents to return
|
||||
offset: Number of documents to skip
|
||||
|
||||
Returns:
|
||||
List of documents with pagination info
|
||||
"""
|
||||
try:
|
||||
# Pull service from bootstrap
|
||||
# Get service from bootstrap
|
||||
service: ITextProcessor = _get_service()
|
||||
|
||||
documents = service.list_documents(limit, offset)
|
||||
doc_responses = [_to_document_response(d) for d in documents]
|
||||
|
||||
return DocumentListResponse(
|
||||
documents=doc_responses,
|
||||
total=len(doc_responses),
|
||||
limit=limit,
|
||||
offset=offset,
|
||||
# Create chunking strategy
|
||||
strategy = ChunkingStrategy(
|
||||
strategy_name=strategy_name,
|
||||
chunk_size=chunk_size,
|
||||
overlap_size=overlap_size,
|
||||
respect_boundaries=respect_boundaries,
|
||||
)
|
||||
|
||||
# Execute complete workflow through service
|
||||
logger.info(f"Processing text input via service: {len(text)} characters")
|
||||
chunks = service.process_text_to_chunks(
|
||||
text=text,
|
||||
chunking_strategy=strategy,
|
||||
title=title,
|
||||
)
|
||||
|
||||
# Convert to response
|
||||
chunk_responses = [_to_chunk_response(c) for c in chunks]
|
||||
|
||||
logger.info(f"Successfully processed text: {len(chunks)} chunks created")
|
||||
|
||||
return ExtractAndChunkResponse(
|
||||
chunks=chunk_responses,
|
||||
total_chunks=len(chunk_responses),
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except DomainException as e:
|
||||
raise _map_domain_exception(e)
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error listing documents: {str(e)}")
|
||||
logger.error(f"Unexpected error processing text: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=f"Internal server error: {str(e)}",
|
||||
)
|
||||
|
||||
|
||||
@router.delete(
    "/documents/{document_id}",
    response_model=DeleteDocumentResponse,
    status_code=status.HTTP_200_OK,
    summary="Delete document",
    description="Delete a document by ID",
)
async def delete_document(document_id: str) -> DeleteDocumentResponse:
    """
    Delete document endpoint.

    Args:
        document_id: UUID of the document, given as a string

    Returns:
        Deletion response carrying the success flag returned by the service

    Raises:
        HTTPException: 400 if ``document_id`` is not a valid UUID, 404 if the
            document is not found, the mapped error for any other
            ``DomainException``, or 500 for any other failure
    """
    # Parse the ID in its own try block so that only a malformed ID yields
    # 400. A ValueError raised inside the service must not be reported to the
    # client as "Invalid document ID format".
    try:
        doc_uuid = UUID(document_id)
    except ValueError as e:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Invalid document ID format: {document_id}",
        ) from e

    try:
        # Pull service from bootstrap
        service: ITextProcessor = _get_service()

        success = service.delete_document(doc_uuid)

        # Keep the message consistent with the success flag instead of
        # always claiming the deletion succeeded.
        if success:
            message = f"Document {document_id} deleted successfully"
        else:
            message = f"Document {document_id} was not deleted"

        return DeleteDocumentResponse(
            success=success,
            message=message,
            document_id=document_id,
        )

    except DocumentNotFoundError as e:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=str(e),
        ) from e
    except DomainException as e:
        # Same mapping the other routes in this file apply to domain errors.
        raise _map_domain_exception(e) from e
    except Exception as e:
        logger.exception("Unexpected error deleting document: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
|
||||
|
||||
|
||||
@router.get(
|
||||
"/health",
|
||||
|
||||
@ -112,3 +112,34 @@ class ITextProcessor(ABC):
|
||||
RepositoryError: If deletion fails
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
def process_text_to_chunks(
    self,
    text: str,
    chunking_strategy: ChunkingStrategy,
    title: str = "text_input",
) -> List[Chunk]:
    """
    Turn raw markdown text into a list of chunks.

    Implementations are expected to cover the whole text workflow:
    1. Parse the markdown into structured sections
    2. Build a Document entity with metadata
    3. Persist the document to the repository
    4. Chunk the document according to the strategy

    Args:
        text: Markdown text content to process
        chunking_strategy: Strategy configuration for chunking
        title: Title/identifier for the text input (defaults to "text_input")

    Returns:
        List of Chunk entities

    Raises:
        ValidationError: If text is empty or invalid
        ChunkingError: If chunking fails
        ProcessingError: If document processing fails
    """
    ...
|
||||
|
||||
@ -242,6 +242,83 @@ class DocumentProcessorService(ITextProcessor):
|
||||
extractor = self._extractor_factory.create_extractor(file_path)
|
||||
return extractor.extract(file_path)
|
||||
|
||||
def process_text_to_chunks(
    self,
    text: str,
    chunking_strategy: ChunkingStrategy,
    title: str = "text_input",
) -> List[Chunk]:
    """
    Process raw markdown text into chunks.

    This method handles the complete text processing workflow:
    1. Validate that the text is not empty
    2. Parse markdown into structured sections
    3. Create Document entity with metadata
    4. Persist document to repository
    5. Chunk the saved document according to strategy

    Args:
        text: Markdown text content to process
        chunking_strategy: Strategy configuration for chunking
        title: Title for the text input; used to build the document's
            display name as ``"<title>.md"``

    Returns:
        List of Chunk entities

    Raises:
        ValidationError: If text is None, empty or whitespace-only
        ChunkingError: If chunking fails
        ProcessingError: If document processing fails
    """
    try:
        # Validate before anything else touches the text: calling len() on
        # None would raise TypeError ahead of the intended ValidationError.
        if not text or not text.strip():
            from ..domain.exceptions import ValidationError

            raise ValidationError(
                message="Text content cannot be empty",
                field_name="text",
            )

        logger.info(f"Processing text input: {len(text)} characters")

        # Step 2: Parse markdown into sections
        sections = parse_markdown(text)
        logger.debug(f"Parsed {len(sections)} sections from text")

        # Step 3: Create metadata and the Document entity
        from ..domain.models import DocumentMetadata, SourceType

        metadata = DocumentMetadata(
            source_id="text_input",
            source_type=SourceType.WEB,  # Using WEB type for text input
            display_name=f"{title}.md",
            # Size is measured in encoded bytes, not characters.
            size_bytes=len(text.encode('utf-8')),
        )

        document = Document(
            raw_markdown=text,
            sections=sections,
            metadata=metadata,
        )

        # Validate document content
        document.validate_content()

        # Step 4: Persist document to repository
        saved_document = self._repository.save(document)
        logger.info(f"Text document saved with ID: {saved_document.id}")

        # Step 5: Chunk the document returned by the repository
        chunks = self._chunk_document(saved_document, chunking_strategy)

        logger.info(f"Successfully processed text: {len(chunks)} chunks created")
        return chunks

    except Exception as e:
        # Log and re-raise unchanged so callers still see the original type.
        logger.error(f"Failed to process text: {str(e)}")
        raise
|
||||
|
||||
def _chunk_document(
|
||||
self,
|
||||
document: Document,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user