add extract endpoint
This commit is contained in:
parent
0084ae6bc0
commit
2c4a59f84b
@ -160,6 +160,78 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
|
||||
)
|
||||
|
||||
|
||||
@router.post(
    "/extract",
    response_model=DocumentResponse,
    status_code=status.HTTP_200_OK,
    summary="Extract document from uploaded file",
    description="Upload a file and extract text content with metadata",
)
async def extract_document(
    file: UploadFile = File(..., description="Document file to extract (pdf, docx, txt, zip)"),
) -> DocumentResponse:
    """
    Extract text content from an uploaded file.

    This endpoint handles file extraction only:
    1. Accepts a file upload (PDF, DOCX, TXT, ZIP)
    2. Copies it into a private temporary directory under a sanitised name
    3. Runs the text processor's extraction step on that copy (no parsing)
    4. Returns the converted document response

    The temporary directory is always removed before the handler returns,
    whether extraction succeeded or failed.

    Args:
        file: Uploaded file. Its ``filename`` is client-supplied and is
            treated as untrusted input.

    Returns:
        Response with extracted document

    Raises:
        HTTPException: Mapped from a DomainException, or a 500 for any
            other unexpected failure.
    """
    # Tracked on its own (not derived from the file path) so cleanup runs
    # even when the upload was never written to disk.
    temp_dir = None

    try:
        # Pull service from bootstrap
        service: ITextProcessor = _get_service()

        # Keep only the final path component of the client-supplied name.
        # Without this, a name such as "../x" or "/etc/x" would be written
        # outside the temporary directory, and cleanup would then remove the
        # wrong parent directory. Backslashes are normalised so Windows-style
        # paths are reduced to their basename on POSIX hosts as well.
        safe_name = Path((file.filename or "").replace("\\", "/")).name
        if safe_name in ("", ".", ".."):
            safe_name = "uploaded_file.tmp"

        # The original filename (and thus its extension) is preserved inside
        # a fresh directory so the service sees the real file suffix.
        temp_dir = Path(tempfile.mkdtemp())
        temp_file_path = temp_dir / safe_name

        # Copy uploaded file to temporary location
        # NOTE(review): the copy and the extraction below are synchronous
        # calls made from an async handler - confirm this is acceptable.
        logger.info(f"Extracting uploaded file: {file.filename}")
        with open(temp_file_path, 'wb') as temp_file:
            shutil.copyfileobj(file.file, temp_file)

        # Execute extraction only (no parsing)
        document = service.extract_document(temp_file_path)

        # Convert to response
        document_response = _to_document_response(document)

        logger.info(f"Successfully extracted {file.filename}: {len(document.raw_markdown)} characters")

        return document_response

    except DomainException as e:
        raise _map_domain_exception(e) from e
    except Exception as e:
        logger.error(f"Unexpected error extracting file: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
    finally:
        # Clean up temporary file and directory. Only the directory created
        # above is ever removed; failure to remove it is logged, not raised,
        # so it cannot mask the real outcome of the request.
        if temp_dir is not None:
            try:
                shutil.rmtree(temp_dir)
                logger.debug(f"Cleaned up temporary directory: {temp_dir}")
            except Exception as e:
                logger.warning(f"Failed to delete temporary directory: {str(e)}")
|
||||
|
||||
|
||||
@router.post(
|
||||
"/process-file",
|
||||
response_model=ExtractAndChunkResponse,
|
||||
|
||||
@ -65,6 +65,28 @@ class ITextProcessor(ABC):
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
def extract_document(self, file_path: Path) -> Document:
    """
    Extraction-only entry point: no parsing and no chunking.

    Implementations are expected to:
    - pull the raw text content out of the file,
    - build a Document entity carrying its metadata,
    - hand back that Document with raw_markdown set (no sections).

    Args:
        file_path: Location of the document file to extract.

    Returns:
        The Document entity holding the raw markdown.

    Raises:
        ExtractionError: When text extraction fails.
        UnsupportedFileTypeError: When the file type is not supported.
    """
    ...
|
||||
|
||||
@abstractmethod
|
||||
def process_text_to_chunks(
|
||||
self,
|
||||
|
||||
@ -165,9 +165,37 @@ class DocumentProcessorService(ITextProcessor):
|
||||
logger.error(f"Failed to extract and chunk: {str(e)}")
|
||||
raise
|
||||
|
||||
def extract_document(self, file_path: Path) -> Document:
    """
    Run extraction only and return the resulting Document.

    Delegates to ``self._extract_document`` and logs the start, the number
    of extracted characters on success, and the error on failure. No
    parsing or chunking is performed here.

    Args:
        file_path: Location of the document file to extract.

    Returns:
        The Document produced by ``self._extract_document``.

    Raises:
        ExtractionError: When text extraction fails.
        UnsupportedFileTypeError: When the file type is not supported.
        Any other exception from the extractor is logged and re-raised
        unchanged.
    """
    try:
        logger.info(f"Extracting document: {file_path}")
        extracted = self._extract_document(file_path)
        # Measured inside the try so a malformed result is logged as a
        # failure too, exactly like an extractor error.
        char_count = len(extracted.raw_markdown)
        logger.info(f"Successfully extracted {char_count} characters")
    except Exception as e:
        logger.error(f"Failed to extract document: {str(e)}")
        raise
    return extracted
|
||||
|
||||
def _extract_document(self, file_path: Path) -> Document:
|
||||
"""
|
||||
Extract Document using appropriate extractor.
|
||||
Internal helper: Extract Document using appropriate extractor.
|
||||
|
||||
Extractors create Document entities with raw_markdown and metadata.
|
||||
Sections will be parsed later in the pipeline.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user