Compare commits

4 commits: 6072bb188c...2c4a59f84b

| SHA1 |
|---|
| 2c4a59f84b |
| 0084ae6bc0 |
| e783d92eca |
| e2e1c86dd4 |
REST endpoint router (the FastAPI adapter that imports `.api_schemas`):

```diff
@@ -20,7 +20,7 @@ from ...core.domain.exceptions import (
     ProcessingError,
     UnsupportedFileTypeError,
 )
-from ...core.domain.models import ChunkingStrategy
+from ...core.domain.models import ChunkingMethod, ChunkingStrategy
 from ...core.ports.incoming.text_processor import ITextProcessor
 from .api_schemas import (
     ChunkResponse,
```
```diff
@@ -65,24 +65,6 @@ def _get_service() -> ITextProcessor:
     return get_processor_service()
 
 
-def _to_domain_strategy(request_strategy) -> ChunkingStrategy:
-    """
-    Convert API request strategy to domain model.
-
-    Args:
-        request_strategy: API request strategy schema
-
-    Returns:
-        ChunkingStrategy: Domain strategy model
-    """
-    return ChunkingStrategy(
-        strategy_name=request_strategy.strategy_name,
-        chunk_size=request_strategy.chunk_size,
-        overlap_size=request_strategy.overlap_size,
-        respect_boundaries=request_strategy.respect_boundaries,
-    )
-
-
 def _to_document_response(document) -> DocumentResponse:
     """
     Convert domain document to API response.
```
```diff
@@ -178,6 +160,78 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
     )
 
 
+@router.post(
+    "/extract",
+    response_model=DocumentResponse,
+    status_code=status.HTTP_200_OK,
+    summary="Extract document from uploaded file",
+    description="Upload a file and extract text content with metadata",
+)
+async def extract_document(
+    file: UploadFile = File(..., description="Document file to extract (pdf, docx, txt, zip)"),
+) -> DocumentResponse:
+    """
+    Extract text content from uploaded file.
+
+    This endpoint handles file extraction only:
+    1. Accepts file upload (PDF, DOCX, TXT, ZIP)
+    2. Extracts raw text content using appropriate extractor
+    3. Returns Document entity with metadata (no parsing)
+
+    Args:
+        file: Uploaded file
+
+    Returns:
+        Response with extracted document
+
+    Raises:
+        HTTPException: If extraction fails
+    """
+    temp_file_path = None
+
+    try:
+        # Pull service from bootstrap
+        service: ITextProcessor = _get_service()
+
+        # Create temporary directory and file with original filename
+        temp_dir = tempfile.mkdtemp()
+        original_filename = file.filename if file.filename else "uploaded_file.tmp"
+        temp_file_path = Path(temp_dir) / original_filename
+
+        # Copy uploaded file to temporary location
+        logger.info(f"Extracting uploaded file: {file.filename}")
+        with open(temp_file_path, 'wb') as temp_file:
+            shutil.copyfileobj(file.file, temp_file)
+
+        # Execute extraction only (no parsing)
+        document = service.extract_document(temp_file_path)
+
+        # Convert to response
+        document_response = _to_document_response(document)
+
+        logger.info(f"Successfully extracted {file.filename}: {len(document.raw_markdown)} characters")
+
+        return document_response
+
+    except DomainException as e:
+        raise _map_domain_exception(e)
+    except Exception as e:
+        logger.error(f"Unexpected error extracting file: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Internal server error: {str(e)}",
+        )
+    finally:
+        # Clean up temporary file and directory
+        if temp_file_path and temp_file_path.exists():
+            try:
+                temp_dir = temp_file_path.parent
+                shutil.rmtree(temp_dir)
+                logger.debug(f"Cleaned up temporary directory: {temp_dir}")
+            except Exception as e:
+                logger.warning(f"Failed to delete temporary directory: {str(e)}")
+
+
 @router.post(
     "/process-file",
     response_model=ExtractAndChunkResponse,
```
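The new `/extract` endpoint returns the extracted `Document` without chunking. A minimal client sketch, assuming a local server and that the router is mounted at the application root (the base URL and route prefix are assumptions, not part of this diff):

```python
# Hypothetical client for the new /extract endpoint; adjust the URL to
# wherever the router is actually mounted in your deployment.
import requests

with open("report.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/extract",
        files={"file": ("report.pdf", f, "application/pdf")},
    )

resp.raise_for_status()
document = resp.json()  # DocumentResponse: extracted text plus metadata
print(document.keys())
```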
```diff
@@ -187,7 +241,7 @@ def _map_domain_exception(exception: DomainException) -> HTTPException:
 )
 async def process_file(
     file: UploadFile = File(..., description="Document file to process (pdf, docx, txt, zip)"),
-    strategy_name: str = Form(..., description="Chunking strategy name", examples=["fixed_size", "paragraph"]),
+    strategy_name: ChunkingMethod = Form(..., description="Chunking method"),
     chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
     overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
     respect_boundaries: bool = Form(True, description="Respect text boundaries"),
```
```diff
@@ -221,14 +275,14 @@ async def process_file(
         # Pull service from bootstrap
         service: ITextProcessor = _get_service()
 
-        # Create temporary file with appropriate suffix
-        suffix = Path(file.filename).suffix if file.filename else ".tmp"
-        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
-        temp_file_path = Path(temp_file.name)
+        # Create temporary directory and file with original filename
+        temp_dir = tempfile.mkdtemp()
+        original_filename = file.filename if file.filename else "uploaded_file.tmp"
+        temp_file_path = Path(temp_dir) / original_filename
 
         # Copy uploaded file to temporary location
         logger.info(f"Processing uploaded file: {file.filename}")
-        with temp_file:
+        with open(temp_file_path, 'wb') as temp_file:
             shutil.copyfileobj(file.file, temp_file)
 
         # Create chunking strategy
```
```diff
@@ -261,102 +315,14 @@ async def process_file(
             detail=f"Internal server error: {str(e)}",
         )
     finally:
-        # Clean up temporary file
+        # Clean up temporary file and directory
         if temp_file_path and temp_file_path.exists():
             try:
-                temp_file_path.unlink()
-                logger.debug(f"Cleaned up temporary file: {temp_file_path}")
+                temp_dir = temp_file_path.parent
+                shutil.rmtree(temp_dir)
+                logger.debug(f"Cleaned up temporary directory: {temp_dir}")
             except Exception as e:
-                logger.warning(f"Failed to delete temporary file {temp_file_path}: {str(e)}")
+                logger.warning(f"Failed to delete temporary directory: {str(e)}")
 
 
-@router.post(
-    "/process-text",
-    response_model=ExtractAndChunkResponse,
-    status_code=status.HTTP_200_OK,
-    summary="Process markdown text (parse and chunk)",
-    description="Accept markdown text, parse structure, and return chunks",
-)
-async def process_text(
-    text: str = Form(..., description="Markdown text to process"),
-    strategy_name: str = Form(..., description="Chunking strategy name", examples=["fixed_size", "paragraph"]),
-    chunk_size: int = Form(..., description="Target chunk size in characters", ge=1, le=10000),
-    overlap_size: int = Form(0, description="Overlap between chunks", ge=0),
-    respect_boundaries: bool = Form(True, description="Respect text boundaries"),
-    title: str = Form("text_input", description="Optional title for the text document"),
-) -> ExtractAndChunkResponse:
-    """
-    Process raw markdown text: Parse → Chunk.
-
-    This endpoint handles text processing workflow:
-    1. Accepts markdown text as string
-    2. Parses markdown structure into sections
-    3. Persists document to repository
-    4. Chunks content according to strategy
-    5. Returns chunks with metadata
-
-    Args:
-        text: Markdown text content
-        strategy_name: Name of chunking strategy
-        chunk_size: Target chunk size
-        overlap_size: Overlap between chunks
-        respect_boundaries: Whether to respect boundaries
-        title: Optional title for the document
-
-    Returns:
-        Response with chunks
-
-    Raises:
-        HTTPException: If parsing or chunking fails
-    """
-    try:
-        # Basic validation at API boundary
-        if not text or not text.strip():
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST,
-                detail="Text content cannot be empty",
-            )
-
-        # Get service from bootstrap
-        service: ITextProcessor = _get_service()
-
-        # Create chunking strategy
-        strategy = ChunkingStrategy(
-            strategy_name=strategy_name,
-            chunk_size=chunk_size,
-            overlap_size=overlap_size,
-            respect_boundaries=respect_boundaries,
-        )
-
-        # Execute complete workflow through service
-        logger.info(f"Processing text input via service: {len(text)} characters")
-        chunks = service.process_text_to_chunks(
-            text=text,
-            chunking_strategy=strategy,
-            title=title,
-        )
-
-        # Convert to response
-        chunk_responses = [_to_chunk_response(c) for c in chunks]
-
-        logger.info(f"Successfully processed text: {len(chunks)} chunks created")
-
-        return ExtractAndChunkResponse(
-            chunks=chunk_responses,
-            total_chunks=len(chunk_responses),
-        )
-
-    except HTTPException:
-        raise
-    except DomainException as e:
-        raise _map_domain_exception(e)
-    except Exception as e:
-        logger.error(f"Unexpected error processing text: {str(e)}")
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"Internal server error: {str(e)}",
-        )
-
-
-
 @router.get(
```
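Both upload endpoints now use the same staging pattern: a private temp directory lets the file keep its original name, so extractors can dispatch on the real suffix, and a single `rmtree` removes file and directory together. A sketch of that pattern in isolation (the helper name is illustrative, not from the diff):

```python
# Stage an upload under its original filename inside a throwaway directory.
import shutil
import tempfile
from pathlib import Path
from typing import BinaryIO, Optional

def stage_upload(stream: BinaryIO, filename: Optional[str]) -> Path:
    temp_dir = tempfile.mkdtemp()
    target = Path(temp_dir) / (filename or "uploaded_file.tmp")
    with open(target, "wb") as out:
        shutil.copyfileobj(stream, out)
    return target  # caller runs shutil.rmtree(target.parent) in `finally`
```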
Request schemas (`api_schemas.py`):

```diff
@@ -9,14 +9,15 @@ from uuid import UUID
 
 from pydantic import BaseModel, Field
 
+from ...core.domain.models import ChunkingMethod
+
 
 class ChunkingStrategyRequest(BaseModel):
     """Request model for chunking strategy configuration."""
 
-    strategy_name: str = Field(
+    strategy_name: ChunkingMethod = Field(
         ...,
-        description="Name of chunking strategy (e.g., 'fixed_size', 'paragraph')",
-        examples=["fixed_size", "paragraph"],
+        description="Chunking method (FIXED_SIZE or PARAGRAPH)",
     )
     chunk_size: int = Field(
         ...,
```
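Typing the field as `ChunkingMethod` moves validation of the strategy name to the schema boundary. A self-contained sketch with a stand-in model (the real `ChunkingStrategyRequest` has more fields than shown here, so a hypothetical probe model is used instead):

```python
# Stand-in model demonstrating enum validation; StrategyProbe is
# hypothetical, ChunkingMethod mirrors the enum added in models.py.
from enum import Enum
from pydantic import BaseModel, ValidationError

class ChunkingMethod(str, Enum):
    FIXED_SIZE = "fixed_size"
    PARAGRAPH = "paragraph"

class StrategyProbe(BaseModel):
    strategy_name: ChunkingMethod

assert StrategyProbe(strategy_name="paragraph").strategy_name is ChunkingMethod.PARAGRAPH

try:
    StrategyProbe(strategy_name="sentence")  # not a valid ChunkingMethod
except ValidationError as exc:
    print(exc)  # rejected at the schema boundary, surfaced as a 422 by FastAPI
```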
ZIP extractor (`ZipExtractor`):

```diff
@@ -5,6 +5,7 @@ This adapter implements the IExtractor port for ZIP files containing
 Markdown documents. It merges all .md files into a single document.
 """
 import logging
+import re
 import zipfile
 from pathlib import Path
 from typing import List
@@ -184,7 +185,7 @@ class ZipExtractor(IExtractor):
             )
 
             # Join all parts with proper spacing
-            return "".join(merged_parts).strip()
+            return "\n".join(merged_parts).strip()
 
         except EmptyContentError:
             raise
```
```diff
@@ -234,11 +235,29 @@ class ZipExtractor(IExtractor):
             if filename.lower().endswith('.md'):
                 md_files.append(filename)
 
-        # Sort alphabetically for deterministic order
-        md_files.sort()
+        # Sort using natural/numeric order (page_1, page_2, ..., page_10)
+        md_files.sort(key=self._natural_sort_key)
 
         return md_files
 
+    def _natural_sort_key(self, filename: str):
+        """
+        Generate a natural sort key for proper numeric ordering.
+
+        Converts numeric parts to integers for correct sorting:
+        - 'page_1.md' < 'page_2.md' < 'page_10.md'
+
+        Args:
+            filename: Filename to generate sort key for
+
+        Returns:
+            List of alternating strings and integers for natural sorting
+        """
+        def convert(text):
+            return int(text) if text.isdigit() else text.lower()
+
+        return [convert(c) for c in re.split(r'(\d+)', filename)]
+
     def _extract_file_content(
         self,
         zip_file: zipfile.ZipFile,
```
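A standalone illustration of why the key function matters: plain string sorting compares digits character by character, so `page_10` sorts before `page_2`; splitting on digit runs and comparing them as integers fixes the order.

```python
# Lexicographic vs. natural sort, using the same key logic as the diff above.
import re

def natural_sort_key(filename: str):
    return [int(p) if p.isdigit() else p.lower() for p in re.split(r"(\d+)", filename)]

files = ["page_10.md", "page_2.md", "page_1.md"]
print(sorted(files))                        # ['page_1.md', 'page_10.md', 'page_2.md']
print(sorted(files, key=natural_sort_key))  # ['page_1.md', 'page_2.md', 'page_10.md']
```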
Domain models (`core/domain/models.py`):

```diff
@@ -19,6 +19,12 @@ class SourceType(str, Enum):
     WEB = "web"
 
 
+class ChunkingMethod(str, Enum):
+    """Enumeration of supported chunking methods."""
+    FIXED_SIZE = "fixed_size"
+    PARAGRAPH = "paragraph"
+
+
 class SourceFile(BaseModel):
     """
     Represents the raw input file before processing.
```
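The `str` mixin is what lets form values and JSON bodies carry plain strings while the domain works with enum members. A quick sketch of the behavior this relies on (the enum is restated inline so the snippet runs standalone):

```python
# str-mixin enum behavior: value lookup, string equality, JSON round-trip.
import json
from enum import Enum

class ChunkingMethod(str, Enum):  # as added in the diff above
    FIXED_SIZE = "fixed_size"
    PARAGRAPH = "paragraph"

m = ChunkingMethod("fixed_size")          # lookup by value
assert m is ChunkingMethod.FIXED_SIZE
assert m == "fixed_size"                  # compares equal to its string value
assert json.dumps(m) == '"fixed_size"'    # serializes as a plain string
```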
```diff
@@ -429,12 +435,12 @@ class ChunkingStrategy(BaseModel):
     Configuration for a chunking strategy.
 
     Attributes:
-        strategy_name: Name of the chunking strategy
+        strategy_name: Chunking method (fixed_size or paragraph)
         chunk_size: Target size for chunks (in characters)
         overlap_size: Number of characters to overlap between chunks
         respect_boundaries: Whether to respect sentence/paragraph boundaries
     """
-    strategy_name: str = Field(..., min_length=1, description="Strategy name")
+    strategy_name: ChunkingMethod = Field(..., description="Chunking method")
     chunk_size: int = Field(..., ge=1, le=10000, description="Target chunk size")
     overlap_size: int = Field(default=0, ge=0, description="Overlap between chunks")
     respect_boundaries: bool = Field(
```
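With the field retyped, Pydantic coerces an incoming `"fixed_size"` string into the enum member and rejects anything outside the enum, so an unknown strategy name can no longer reach the chunking code. A minimal sketch (the import path is an assumption; all fields are passed explicitly since their defaults are not fully visible in this diff):

```python
# ChunkingStrategy coercion sketch; adjust the import to the package's
# real top-level name.
from core.domain.models import ChunkingMethod, ChunkingStrategy

strategy = ChunkingStrategy(
    strategy_name="fixed_size",   # coerced to ChunkingMethod.FIXED_SIZE
    chunk_size=1000,              # ge=1, le=10000 still enforced
    overlap_size=100,
    respect_boundaries=True,
)
assert strategy.strategy_name is ChunkingMethod.FIXED_SIZE

ChunkingStrategy(strategy_name="semantic", chunk_size=1000,
                 overlap_size=0, respect_boundaries=True)  # raises ValidationError
```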
Incoming port (`core/ports/incoming/text_processor.py`):

```diff
@@ -66,50 +66,24 @@ class ITextProcessor(ABC):
         pass
 
     @abstractmethod
-    def get_document(self, document_id: UUID) -> Document:
+    def extract_document(self, file_path: Path) -> Document:
         """
-        Retrieve a document by its ID.
+        Extract text content from document without parsing or chunking.
+
+        This method only performs extraction:
+        1. Extracts raw text content from file
+        2. Creates Document entity with metadata
+        3. Returns Document with raw_markdown (no sections)
 
         Args:
-            document_id: Unique identifier of the document
+            file_path: Path to the document file
 
         Returns:
-            Document entity
+            Document entity with raw markdown
 
         Raises:
-            DocumentNotFoundError: If document doesn't exist
-            RepositoryError: If retrieval fails
-        """
-        pass
-
-    @abstractmethod
-    def list_documents(self, limit: int = 100, offset: int = 0) -> List[Document]:
-        """
-        List documents with pagination.
-
-        Args:
-            limit: Maximum number of documents to return
-            offset: Number of documents to skip
-
-        Returns:
-            List of Document entities
-        """
-        pass
-
-    @abstractmethod
-    def delete_document(self, document_id: UUID) -> bool:
-        """
-        Delete a document by its ID.
-
-        Args:
-            document_id: Unique identifier of the document
-
-        Returns:
-            True if deletion was successful
-
-        Raises:
-            DocumentNotFoundError: If document doesn't exist
-            RepositoryError: If deletion fails
+            ExtractionError: If text extraction fails
+            UnsupportedFileTypeError: If file type is not supported
         """
         pass
 
```
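The port now exposes extraction directly instead of repository-style CRUD. A usage sketch of the new contract (method and attribute names come from the diff; the calling context and helper name are assumptions):

```python
# Contract sketch: callers hand any ITextProcessor a path and get back a
# Document whose raw_markdown is populated but not yet split into sections.
from pathlib import Path

def preview(service: "ITextProcessor", path: Path) -> str:
    document = service.extract_document(path)  # may raise ExtractionError or
                                               # UnsupportedFileTypeError
    return document.raw_markdown[:200]         # extraction only: no sections yet
```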
Service implementation (`DocumentProcessorService`):

```diff
@@ -165,67 +165,37 @@ class DocumentProcessorService(ITextProcessor):
             logger.error(f"Failed to extract and chunk: {str(e)}")
             raise
 
-    def get_document(self, document_id: UUID) -> Document:
+    def extract_document(self, file_path: Path) -> Document:
         """
-        Retrieve a document by its ID.
+        Extract text content from document without parsing or chunking.
+
+        This method only performs extraction:
+        1. Extracts raw text content from file
+        2. Creates Document entity with metadata
+        3. Returns Document with raw_markdown (no sections)
 
         Args:
-            document_id: Unique identifier of the document
+            file_path: Path to the document file
 
         Returns:
-            Document entity
+            Document entity with raw markdown
 
         Raises:
-            DocumentNotFoundError: If document doesn't exist
-            RepositoryError: If retrieval fails
+            ExtractionError: If text extraction fails
+            UnsupportedFileTypeError: If file type is not supported
         """
-        logger.debug(f"Retrieving document: {document_id}")
-
-        document = self._repository.find_by_id(document_id)
-
-        if document is None:
-            raise DocumentNotFoundError(str(document_id))
-
-        return document
+        try:
+            logger.info(f"Extracting document: {file_path}")
+            document = self._extract_document(file_path)
+            logger.info(f"Successfully extracted {len(document.raw_markdown)} characters")
+            return document
+        except Exception as e:
+            logger.error(f"Failed to extract document: {str(e)}")
+            raise
 
-    def list_documents(self, limit: int = 100, offset: int = 0) -> List[Document]:
-        """
-        List documents with pagination.
-
-        Args:
-            limit: Maximum number of documents to return
-            offset: Number of documents to skip
-
-        Returns:
-            List of Document entities
-        """
-        logger.debug(f"Listing documents: limit={limit}, offset={offset}")
-        return self._repository.find_all(limit=limit, offset=offset)
-
-    def delete_document(self, document_id: UUID) -> bool:
-        """
-        Delete a document by its ID.
-
-        Args:
-            document_id: Unique identifier of the document
-
-        Returns:
-            True if deletion was successful
-
-        Raises:
-            DocumentNotFoundError: If document doesn't exist
-            RepositoryError: If deletion fails
-        """
-        logger.info(f"Deleting document: {document_id}")
-
-        if not self._repository.exists(document_id):
-            raise DocumentNotFoundError(str(document_id))
-
-        return self._repository.delete(document_id)
 
     def _extract_document(self, file_path: Path) -> Document:
         """
-        Extract Document using appropriate extractor.
+        Internal helper: Extract Document using appropriate extractor.
 
         Extractors create Document entities with raw_markdown and metadata.
         Sections will be parsed later in the pipeline.
```