Compare commits

..

No commits in common. "b53f8c47d314d277b224d1fde8e229f58370a873" and "2753b913fb90e99660d5035c67c0cbcff8cfbe0b" have entirely different histories.

11 changed files with 104 additions and 51 deletions

View File

@@ -52,8 +52,8 @@ app = FastAPI(
title="Text Processor API", title="Text Processor API",
description="Text extraction and chunking system using Hexagonal Architecture", description="Text extraction and chunking system using Hexagonal Architecture",
version="1.0.0", version="1.0.0",
# docs_url=None, docs_url=None,
# redoc_url=None, redoc_url=None,
) )
router = APIRouter( router = APIRouter(
@@ -188,11 +188,15 @@ def to_document_response(document: Document) -> DocumentResponse:
"""Convert domain document to API response.""" """Convert domain document to API response."""
from .api_schemas import DocumentMetadataResponse from .api_schemas import DocumentMetadataResponse
display_name = document.metadata.display_name
file_type = Path(display_name).suffix.lstrip('.') if '.' in display_name else 'unknown'
return DocumentResponse( return DocumentResponse(
id=str(document.id), id=str(document.id),
content=document.content, content=document.content,
title=document.title,
metadata=DocumentMetadataResponse( metadata=DocumentMetadataResponse(
file_name=document.metadata.display_name,
file_type=file_type,
file_size_bytes=document.metadata.size_bytes, file_size_bytes=document.metadata.size_bytes,
created_at=document.metadata.created_at.isoformat(), created_at=document.metadata.created_at.isoformat(),
author=document.metadata.author, author=document.metadata.author,
@@ -360,20 +364,20 @@ async def health_check() -> HealthCheckResponse:
# Protected Documentation Routes # Protected Documentation Routes
# ============================================================================= # =============================================================================
# @app.get("/docs", include_in_schema=False) @app.get("/docs", include_in_schema=False)
# def api_docs(_: HTTPBasicCredentials = Depends(check_docs_credentials)): def api_docs(_: HTTPBasicCredentials = Depends(check_docs_credentials)):
# return get_swagger_ui_html( return get_swagger_ui_html(
# openapi_url="/openapi.json", openapi_url="/openapi.json",
# title="Protected Text-Processor API Docs" title="Protected Text-Processor API Docs"
# ) )
#
#
# @app.get("/redoc", include_in_schema=False) @app.get("/redoc", include_in_schema=False)
# def api_docs(_: HTTPBasicCredentials = Depends(check_docs_credentials)): def api_docs(_: HTTPBasicCredentials = Depends(check_docs_credentials)):
# return get_redoc_html( return get_redoc_html(
# openapi_url="/openapi.json", openapi_url="/openapi.json",
# title="Protected Text-Processor API Docs" title="Protected Text-Processor API Docs"
# ) )
# ============================================================================= # =============================================================================
# Application Setup # Application Setup

View File

@@ -69,6 +69,8 @@ class ExtractAndChunkRequest(BaseModel):
class DocumentMetadataResponse(BaseModel): class DocumentMetadataResponse(BaseModel):
"""Response model for document metadata.""" """Response model for document metadata."""
file_name: str
file_type: str
file_size_bytes: int file_size_bytes: int
created_at: str created_at: str
author: Optional[str] = None author: Optional[str] = None
@@ -80,7 +82,6 @@ class DocumentResponse(BaseModel):
id: str id: str
content: str content: str
title: str
metadata: DocumentMetadataResponse metadata: DocumentMetadataResponse
is_processed: bool is_processed: bool
content_preview: str = Field( content_preview: str = Field(
@@ -103,6 +104,13 @@ class ChunkResponse(BaseModel):
length: int length: int
class ProcessDocumentResponse(BaseModel):
"""Response model for document processing."""
document: DocumentResponse
message: str = Field(default="Document processed successfully")
class ChunkListResponse(BaseModel): class ChunkListResponse(BaseModel):
"""Response model for extract and chunk operation.""" """Response model for extract and chunk operation."""
@@ -111,6 +119,31 @@ class ChunkListResponse(BaseModel):
message: str = Field(default="Document chunked successfully") message: str = Field(default="Document chunked successfully")
class DocumentListResponse(BaseModel):
"""Response model for document list."""
documents: List[DocumentResponse]
total: int
limit: int
offset: int
class ErrorResponse(BaseModel):
"""Response model for errors."""
error: str
details: Optional[str] = None
error_type: str
class DeleteDocumentResponse(BaseModel):
"""Response model for document deletion."""
success: bool
message: str
document_id: str
class HealthCheckResponse(BaseModel): class HealthCheckResponse(BaseModel):
"""Response model for health check.""" """Response model for health check."""

View File

@@ -300,7 +300,7 @@ class ParagraphChunker(IChunker):
global_sequence = 0 global_sequence = 0
# Get document title from metadata # Get document title from metadata
document_title = document.title document_title = document.metadata.display_name
for section_index, section in enumerate(document.sections): for section_index, section in enumerate(document.sections):
# Split this section's content into paragraph-based chunks # Split this section's content into paragraph-based chunks

View File

@@ -69,11 +69,7 @@ class DocxExtractor(IExtractor):
metadata = self._create_metadata(file_path) metadata = self._create_metadata(file_path)
# Build document with raw_markdown # Build document with raw_markdown
document = Document( document = Document(raw_markdown=markdown_text, metadata=metadata)
raw_markdown=markdown_text,
title=file_path.stem,
metadata=metadata
)
logger.info( logger.info(
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}" f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
@@ -153,5 +149,6 @@ class DocxExtractor(IExtractor):
return DocumentMetadata( return DocumentMetadata(
source_id=str(file_path.absolute()), source_id=str(file_path.absolute()),
source_type=SourceType.FILE, source_type=SourceType.FILE,
display_name=file_path.stem,
size_bytes=stat.st_size, size_bytes=stat.st_size,
) )

View File

@@ -69,11 +69,7 @@ class ExcelExtractor(IExtractor):
metadata = self._create_metadata(file_path) metadata = self._create_metadata(file_path)
# Build document with raw_markdown # Build document with raw_markdown
document = Document( document = Document(raw_markdown=markdown_text, metadata=metadata)
raw_markdown=markdown_text,
title=file_path.stem,
metadata=metadata
)
logger.info( logger.info(
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}" f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
@@ -153,5 +149,6 @@ class ExcelExtractor(IExtractor):
return DocumentMetadata( return DocumentMetadata(
source_id=str(file_path.absolute()), source_id=str(file_path.absolute()),
source_type=SourceType.FILE, source_type=SourceType.FILE,
display_name=file_path.stem,
size_bytes=stat.st_size, size_bytes=stat.st_size,
) )

View File

@@ -65,11 +65,7 @@ class MarkdownExtractor(IExtractor):
metadata = self._create_metadata(file_path) metadata = self._create_metadata(file_path)
# Build document with raw_markdown # Build document with raw_markdown
document = Document( document = Document(raw_markdown=markdown_text, metadata=metadata)
raw_markdown=markdown_text,
title=file_path.stem,
metadata=metadata
)
logger.info( logger.info(
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}" f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
@@ -185,5 +181,6 @@ class MarkdownExtractor(IExtractor):
return DocumentMetadata( return DocumentMetadata(
source_id=str(file_path.absolute()), source_id=str(file_path.absolute()),
source_type=SourceType.FILE, source_type=SourceType.FILE,
display_name=file_path.stem,
size_bytes=stat.st_size, size_bytes=stat.st_size,
) )

View File

@@ -69,11 +69,7 @@ class PDFExtractor(IExtractor):
metadata = self._create_metadata(file_path, result) metadata = self._create_metadata(file_path, result)
# Build document with raw_markdown # Build document with raw_markdown
document = Document( document = Document(raw_markdown=markdown_text, metadata=metadata)
raw_markdown=markdown_text,
title=file_path.stem,
metadata=metadata
)
logger.info( logger.info(
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}" f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
@@ -166,6 +162,7 @@ class PDFExtractor(IExtractor):
return DocumentMetadata( return DocumentMetadata(
source_id=str(file_path.absolute()), source_id=str(file_path.absolute()),
source_type=SourceType.FILE, source_type=SourceType.FILE,
display_name=file_path.stem,
size_bytes=stat.st_size, size_bytes=stat.st_size,
extra_metadata=extra_metadata, extra_metadata=extra_metadata,
) )

View File

@@ -66,11 +66,7 @@ class TxtExtractor(IExtractor):
metadata = self._create_metadata(file_path) metadata = self._create_metadata(file_path)
# Build document with raw_markdown # Build document with raw_markdown
document = Document( document = Document(raw_markdown=text, metadata=metadata)
raw_markdown=text,
title=file_path.stem,
metadata=metadata
)
logger.info( logger.info(
f"Successfully extracted {len(text)} characters from {file_path.name}" f"Successfully extracted {len(text)} characters from {file_path.name}"
@@ -204,5 +200,6 @@ class TxtExtractor(IExtractor):
return DocumentMetadata( return DocumentMetadata(
source_id=str(file_path.absolute()), source_id=str(file_path.absolute()),
source_type=SourceType.FILE, source_type=SourceType.FILE,
display_name=file_path.stem,
size_bytes=stat.st_size, size_bytes=stat.st_size,
) )

View File

@@ -69,11 +69,7 @@ class ZipExtractor(IExtractor):
metadata = self._create_metadata(file_path) metadata = self._create_metadata(file_path)
# Build document with raw_markdown # Build document with raw_markdown
document = Document( document = Document(raw_markdown=merged_text, metadata=metadata)
raw_markdown=merged_text,
title=file_path.stem,
metadata=metadata
)
logger.info( logger.info(
f"Successfully extracted {len(merged_text)} characters from {file_path.name}" f"Successfully extracted {len(merged_text)} characters from {file_path.name}"
@@ -316,5 +312,6 @@ class ZipExtractor(IExtractor):
return DocumentMetadata( return DocumentMetadata(
source_id=str(file_path.absolute()), source_id=str(file_path.absolute()),
source_type=SourceType.FILE, source_type=SourceType.FILE,
display_name=file_path.stem,
size_bytes=stat.st_size, size_bytes=stat.st_size,
) )

View File

@@ -161,6 +161,7 @@ class DocumentMetadata(BaseModel):
Attributes: Attributes:
source_id: Path or URL identifying the source source_id: Path or URL identifying the source
source_type: Type of source (FILE or WEB) source_type: Type of source (FILE or WEB)
display_name: Human-readable name (e.g., 'manual.pdf', 'about_us.html')
size_bytes: Size in bytes (file size or content length) size_bytes: Size in bytes (file size or content length)
created_at: Timestamp when metadata was created created_at: Timestamp when metadata was created
author: Optional author information author: Optional author information
@@ -168,6 +169,7 @@ class DocumentMetadata(BaseModel):
""" """
source_id: str = Field(..., min_length=1, description="Path or URL") source_id: str = Field(..., min_length=1, description="Path or URL")
source_type: SourceType = Field(..., description="Source type enum") source_type: SourceType = Field(..., description="Source type enum")
display_name: str = Field(..., min_length=1, description="Display name")
size_bytes: int = Field(..., ge=0, description="Size in bytes") size_bytes: int = Field(..., ge=0, description="Size in bytes")
created_at: datetime = Field(default_factory=datetime.utcnow) created_at: datetime = Field(default_factory=datetime.utcnow)
author: Optional[str] = Field(None, description="Author information") author: Optional[str] = Field(None, description="Author information")
@@ -176,6 +178,30 @@ class DocumentMetadata(BaseModel):
description="Additional metadata" description="Additional metadata"
) )
@field_validator('display_name')
@classmethod
def normalize_display_name(cls, value: str) -> str:
"""Normalize display name."""
return value.strip()
def get_summary(self) -> str:
"""
Generate a human-readable summary of metadata.
Returns:
Formatted string containing key metadata information
"""
summary_parts = [
f"Source: {self.display_name}",
f"Type: {self.source_type.value}",
f"Size: {self._format_size()}",
]
if self.author:
summary_parts.append(f"Author: {self.author}")
return " | ".join(summary_parts)
def _format_size(self) -> str: def _format_size(self) -> str:
"""Format size in human-readable format.""" """Format size in human-readable format."""
size = self.size_bytes size = self.size_bytes
@@ -212,7 +238,6 @@ class Document(BaseModel):
""" """
id: UUID = Field(default_factory=uuid4, description="Unique document ID") id: UUID = Field(default_factory=uuid4, description="Unique document ID")
raw_markdown: str = Field(..., description="Raw Markdown content") raw_markdown: str = Field(..., description="Raw Markdown content")
title: str = Field(..., description="Document title")
sections: List[DocumentSection] = Field( sections: List[DocumentSection] = Field(
default_factory=list, default_factory=list,
description="Structured document sections" description="Structured document sections"
@@ -271,6 +296,15 @@ class Document(BaseModel):
return True return True
def get_metadata_summary(self) -> str:
"""
Get a summary of the document's metadata.
Returns:
Human-readable metadata summary
"""
return self.metadata.get_summary()
def mark_as_processed(self) -> None: def mark_as_processed(self) -> None:
"""Mark the document as processed.""" """Mark the document as processed."""
self.is_processed = True self.is_processed = True

View File

@@ -221,13 +221,13 @@ class DocumentProcessorService(ITextProcessor):
metadata = DocumentMetadata( metadata = DocumentMetadata(
source_id="text_input", source_id="text_input",
source_type=SourceType.TEXT, source_type=SourceType.TEXT,
display_name=f"{title}.md",
size_bytes=len(text.encode('utf-8')), size_bytes=len(text.encode('utf-8')),
) )
# Step 3: Create Document entity # Step 3: Create Document entity
document = Document( document = Document(
raw_markdown=text, raw_markdown=text,
title=title,
sections=sections, sections=sections,
metadata=metadata, metadata=metadata,
) )