add extract endpoint
This commit is contained in:
parent
0084ae6bc0
commit
2c4a59f84b
@ -160,6 +160,78 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/extract",
    response_model=DocumentResponse,
    status_code=status.HTTP_200_OK,
    summary="Extract document from uploaded file",
    description="Upload a file and extract text content with metadata",
)
async def extract_document(
    file: UploadFile = File(..., description="Document file to extract (pdf, docx, txt, zip)"),
) -> DocumentResponse:
    """
    Extract text content from uploaded file.

    This endpoint handles file extraction only:
    1. Accepts file upload (PDF, DOCX, TXT, ZIP)
    2. Copies the upload into a private temporary directory
    3. Extracts raw text content via the text processor service
    4. Returns Document entity with metadata (no parsing)

    The temporary directory is always removed before returning, whether
    extraction succeeded or failed.

    Args:
        file: Uploaded file

    Returns:
        Response with extracted document

    Raises:
        HTTPException: If extraction fails
    """
    # Track the directory (not the file) so cleanup still happens when the
    # upload could not be written at all.
    temp_dir = None

    try:
        # Pull service from bootstrap
        service: ITextProcessor = _get_service()

        # The filename is client-controlled: keep only its final path
        # component so values such as "../../etc/passwd" or "/abs/path"
        # cannot escape the temporary directory.
        safe_filename = Path(file.filename).name if file.filename else ""
        if safe_filename in ("", ".", ".."):
            safe_filename = "uploaded_file.tmp"

        # Create temporary directory and file with the (sanitized) original filename
        temp_dir = Path(tempfile.mkdtemp())
        temp_file_path = temp_dir / safe_filename

        # Copy uploaded file to temporary location
        logger.info(f"Extracting uploaded file: {file.filename}")
        with open(temp_file_path, 'wb') as temp_file:
            shutil.copyfileobj(file.file, temp_file)

        # Execute extraction only (no parsing)
        document = service.extract_document(temp_file_path)

        # Convert to response
        document_response = _to_document_response(document)

        logger.info(f"Successfully extracted {file.filename}: {len(document.raw_markdown)} characters")

        return document_response

    except DomainException as e:
        # Chain the cause so the original domain error stays in the traceback
        raise _map_domain_exception(e) from e
    except Exception as e:
        # logger.exception logs at ERROR level and records the traceback
        logger.exception(f"Unexpected error extracting file: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
    finally:
        # Clean up temporary file and directory (best effort: a cleanup
        # failure is logged but must not mask the request outcome)
        if temp_dir is not None:
            try:
                shutil.rmtree(temp_dir)
                logger.debug(f"Cleaned up temporary directory: {temp_dir}")
            except OSError as e:
                logger.warning(f"Failed to delete temporary directory: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
@router.post(
|
@router.post(
|
||||||
"/process-file",
|
"/process-file",
|
||||||
response_model=ExtractAndChunkResponse,
|
response_model=ExtractAndChunkResponse,
|
||||||
|
|||||||
@ -65,6 +65,28 @@ class ITextProcessor(ABC):
|
|||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
def extract_document(self, file_path: Path) -> Document:
    """
    Extraction-only entry point: no parsing and no chunking.

    Implementations are expected to:
    1. Read the raw text content out of the file
    2. Build a Document entity carrying that content and its metadata
    3. Hand back the Document with raw_markdown populated (no sections)

    Args:
        file_path: Location of the document file to extract

    Returns:
        Document entity holding the raw markdown

    Raises:
        ExtractionError: If text extraction fails
        UnsupportedFileTypeError: If file type is not supported
    """
    ...
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def process_text_to_chunks(
|
def process_text_to_chunks(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -165,9 +165,37 @@ class DocumentProcessorService(ITextProcessor):
|
|||||||
logger.error(f"Failed to extract and chunk: {str(e)}")
|
logger.error(f"Failed to extract and chunk: {str(e)}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
def extract_document(self, file_path: Path) -> Document:
    """
    Run extraction only (no parsing, no chunking) for a single file.

    Delegates to the internal ``_extract_document`` helper, logging the
    start of the operation and the number of extracted characters. Any
    exception is logged and then re-raised unchanged.

    Args:
        file_path: Path to the document file

    Returns:
        Document entity with raw markdown (no sections)

    Raises:
        ExtractionError: If text extraction fails
        UnsupportedFileTypeError: If file type is not supported
    """
    try:
        logger.info(f"Extracting document: {file_path}")
        extracted = self._extract_document(file_path)
        logger.info(f"Successfully extracted {len(extracted.raw_markdown)} characters")
        return extracted
    except Exception as exc:
        # Log for visibility, then propagate the original exception as-is
        logger.error(f"Failed to extract document: {str(exc)}")
        raise
|
||||||
|
|
||||||
def _extract_document(self, file_path: Path) -> Document:
|
def _extract_document(self, file_path: Path) -> Document:
|
||||||
"""
|
"""
|
||||||
Extract Document using appropriate extractor.
|
Internal helper: Extract Document using appropriate extractor.
|
||||||
|
|
||||||
Extractors create Document entities with raw_markdown and metadata.
|
Extractors create Document entities with raw_markdown and metadata.
|
||||||
Sections will be parsed later in the pipeline.
|
Sections will be parsed later in the pipeline.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user