add extract endpoint

This commit is contained in:
m.dabbagh 2026-01-19 16:05:55 +03:30
parent 0084ae6bc0
commit 2c4a59f84b
3 changed files with 123 additions and 1 deletions

View File

@ -160,6 +160,78 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
) )
def _safe_upload_filename(filename: str) -> str:
    """Reduce a client-supplied filename to a single safe path component."""
    # Treat backslashes as separators too, so Windows-style paths sent by a
    # client are stripped down to their final component as well.
    candidate = Path(filename.replace("\\", "/")).name
    # "", "." and ".." are not usable file names ("..", in particular, would
    # resolve to the parent of the temporary directory).
    if candidate in ("", ".", ".."):
        return "uploaded_file.tmp"
    return candidate


@router.post(
    "/extract",
    response_model=DocumentResponse,
    status_code=status.HTTP_200_OK,
    summary="Extract document from uploaded file",
    description="Upload a file and extract text content with metadata",
)
async def extract_document(
    file: UploadFile = File(..., description="Document file to extract (pdf, docx, txt, zip)"),
) -> DocumentResponse:
    """
    Extract text content from uploaded file.

    This endpoint handles file extraction only:
    1. Accepts file upload (PDF, DOCX, TXT, ZIP)
    2. Extracts raw text content using appropriate extractor
    3. Returns Document entity with metadata (no parsing)

    The upload is copied into a freshly created temporary directory under
    the base name of the client-supplied filename, so the original file
    name (including its extension) is what the service sees. Any directory
    components in the client-supplied name are discarded, which keeps the
    write - and the later recursive cleanup - confined to the temporary
    directory.

    Args:
        file: Uploaded file

    Returns:
        Response with extracted document

    Raises:
        HTTPException: If extraction fails
    """
    # Track the directory itself (not the file inside it) so cleanup still
    # happens when the failure occurs before the file has been written.
    temp_dir = None
    try:
        # Pull service from bootstrap
        service: ITextProcessor = _get_service()

        # Create temporary directory and file with the sanitized filename
        temp_dir = Path(tempfile.mkdtemp())
        temp_file_path = temp_dir / _safe_upload_filename(file.filename or "")

        # Copy uploaded file to temporary location
        logger.info(f"Extracting uploaded file: {file.filename}")
        with open(temp_file_path, "wb") as temp_file:
            shutil.copyfileobj(file.file, temp_file)

        # Execute extraction only (no parsing)
        document = service.extract_document(temp_file_path)

        # Convert to response
        document_response = _to_document_response(document)
        logger.info(f"Successfully extracted {file.filename}: {len(document.raw_markdown)} characters")
        return document_response
    except DomainException as e:
        raise _map_domain_exception(e) from e
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception(f"Unexpected error extracting file: {str(e)}")
        # NOTE(review): the response detail echoes the internal exception
        # text to the client; kept for backward compatibility.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        ) from e
    finally:
        # Clean up temporary directory (best effort: a cleanup failure is
        # logged but never replaces the response or the original error)
        if temp_dir is not None:
            try:
                shutil.rmtree(temp_dir)
                logger.debug(f"Cleaned up temporary directory: {temp_dir}")
            except Exception as e:
                logger.warning(f"Failed to delete temporary directory: {str(e)}")
@router.post( @router.post(
"/process-file", "/process-file",
response_model=ExtractAndChunkResponse, response_model=ExtractAndChunkResponse,

View File

@ -65,6 +65,28 @@ class ITextProcessor(ABC):
""" """
pass pass
@abstractmethod
def extract_document(self, file_path: Path) -> Document:
    """
    Extract a document's raw content, skipping parsing and chunking.

    Implementations stop after the extraction step:
    - read the raw text content out of the file
    - build a Document entity carrying its metadata
    - return that Document with raw_markdown set (no sections)

    Args:
        file_path: Location of the document file to extract

    Returns:
        Document entity holding the raw markdown

    Raises:
        ExtractionError: If text extraction fails
        UnsupportedFileTypeError: If file type is not supported
    """
    pass
@abstractmethod @abstractmethod
def process_text_to_chunks( def process_text_to_chunks(
self, self,

View File

@ -165,9 +165,37 @@ class DocumentProcessorService(ITextProcessor):
logger.error(f"Failed to extract and chunk: {str(e)}") logger.error(f"Failed to extract and chunk: {str(e)}")
raise raise
def extract_document(self, file_path: Path) -> Document:
    """
    Run the extraction step alone for a document file.

    Delegates to the internal ``_extract_document`` helper; no parsing or
    chunking is performed here. Progress is logged at info level, and any
    failure is logged at error level before being re-raised unchanged.

    Args:
        file_path: Path to the document file

    Returns:
        Document entity with raw markdown

    Raises:
        ExtractionError: If text extraction fails
        UnsupportedFileTypeError: If file type is not supported
    """
    try:
        logger.info(f"Extracting document: {file_path}")
        extracted = self._extract_document(file_path)
        # Measured inside the try so a failure here is logged like any other.
        char_count = len(extracted.raw_markdown)
        logger.info(f"Successfully extracted {char_count} characters")
        return extracted
    except Exception as error:
        logger.error(f"Failed to extract document: {str(error)}")
        raise
def _extract_document(self, file_path: Path) -> Document: def _extract_document(self, file_path: Path) -> Document:
""" """
Extract Document using appropriate extractor. Internal helper: Extract Document using appropriate extractor.
Extractors create Document entities with raw_markdown and metadata. Extractors create Document entities with raw_markdown and metadata.
Sections will be parsed later in the pipeline. Sections will be parsed later in the pipeline.