From b53f8c47d314d277b224d1fde8e229f58370a873 Mon Sep 17 00:00:00 2001
From: "m.dabbagh"
Date: Wed, 28 Jan 2026 22:13:55 +0330
Subject: [PATCH] add title to Document model and remove display_name from DocumentMetadata

---
 src/adapters/incoming/api_routes.py            |  6 +---
 src/adapters/incoming/api_schemas.py           | 35 +-----------------
 .../outgoing/chunkers/paragraph_chunker.py     |  2 +-
 .../outgoing/extractors/docx_extractor.py      |  7 ++--
 .../outgoing/extractors/excel_extractor.py     |  7 ++--
 .../outgoing/extractors/markdown_extractor.py  |  7 ++--
 .../outgoing/extractors/pdf_extractor.py       |  7 ++--
 .../outgoing/extractors/txt_extractor.py       |  7 ++--
 .../outgoing/extractors/zip_extractor.py       |  7 ++--
 src/core/domain/models.py                      | 36 +------------------
 .../services/document_processor_service.py     |  2 +-
 11 files changed, 35 insertions(+), 88 deletions(-)

diff --git a/src/adapters/incoming/api_routes.py b/src/adapters/incoming/api_routes.py
index 8367415..6b79439 100644
--- a/src/adapters/incoming/api_routes.py
+++ b/src/adapters/incoming/api_routes.py
@@ -188,15 +188,11 @@ def to_document_response(document: Document) -> DocumentResponse:
     """Convert domain document to API response."""
     from .api_schemas import DocumentMetadataResponse
 
-    display_name = document.metadata.display_name
-    file_type = Path(display_name).suffix.lstrip('.') if '.' in display_name else 'unknown'
-
     return DocumentResponse(
         id=str(document.id),
         content=document.content,
+        title=document.title,
         metadata=DocumentMetadataResponse(
-            file_name=document.metadata.display_name,
-            file_type=file_type,
             file_size_bytes=document.metadata.size_bytes,
             created_at=document.metadata.created_at.isoformat(),
             author=document.metadata.author,
diff --git a/src/adapters/incoming/api_schemas.py b/src/adapters/incoming/api_schemas.py
index 113b237..08909be 100644
--- a/src/adapters/incoming/api_schemas.py
+++ b/src/adapters/incoming/api_schemas.py
@@ -69,8 +69,6 @@ class ExtractAndChunkRequest(BaseModel):
 class DocumentMetadataResponse(BaseModel):
     """Response model for document metadata."""
 
-    file_name: str
-    file_type: str
     file_size_bytes: int
     created_at: str
     author: Optional[str] = None
@@ -82,6 +80,7 @@ class DocumentResponse(BaseModel):
 
     id: str
     content: str
+    title: str
     metadata: DocumentMetadataResponse
     is_processed: bool
     content_preview: str = Field(
@@ -104,13 +103,6 @@ class ChunkResponse(BaseModel):
     length: int
 
 
-class ProcessDocumentResponse(BaseModel):
-    """Response model for document processing."""
-
-    document: DocumentResponse
-    message: str = Field(default="Document processed successfully")
-
-
 class ChunkListResponse(BaseModel):
     """Response model for extract and chunk operation."""
 
@@ -119,31 +111,6 @@ class ChunkListResponse(BaseModel):
     message: str = Field(default="Document chunked successfully")
 
 
-class DocumentListResponse(BaseModel):
-    """Response model for document list."""
-
-    documents: List[DocumentResponse]
-    total: int
-    limit: int
-    offset: int
-
-
-class ErrorResponse(BaseModel):
-    """Response model for errors."""
-
-    error: str
-    details: Optional[str] = None
-    error_type: str
-
-
-class DeleteDocumentResponse(BaseModel):
-    """Response model for document deletion."""
-
-    success: bool
-    message: str
-    document_id: str
-
-
 class HealthCheckResponse(BaseModel):
     """Response model for health check."""
 
diff --git a/src/adapters/outgoing/chunkers/paragraph_chunker.py b/src/adapters/outgoing/chunkers/paragraph_chunker.py
index dffcbc3..8675b74 100644
--- a/src/adapters/outgoing/chunkers/paragraph_chunker.py
+++ b/src/adapters/outgoing/chunkers/paragraph_chunker.py
@@ -300,7 +300,7 @@ class ParagraphChunker(IChunker):
         global_sequence = 0
 
         # Get document title from metadata
-        document_title = document.metadata.display_name
+        document_title = document.title
 
         for section_index, section in enumerate(document.sections):
             # Split this section's content into paragraph-based chunks
diff --git a/src/adapters/outgoing/extractors/docx_extractor.py b/src/adapters/outgoing/extractors/docx_extractor.py
index ad7946d..dfe6472 100644
--- a/src/adapters/outgoing/extractors/docx_extractor.py
+++ b/src/adapters/outgoing/extractors/docx_extractor.py
@@ -69,7 +69,11 @@ class DocxExtractor(IExtractor):
         metadata = self._create_metadata(file_path)
 
         # Build document with raw_markdown
-        document = Document(raw_markdown=markdown_text, metadata=metadata)
+        document = Document(
+            raw_markdown=markdown_text,
+            title=file_path.stem,
+            metadata=metadata
+        )
 
         logger.info(
             f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
@@ -149,6 +153,5 @@ class DocxExtractor(IExtractor):
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.stem,
             size_bytes=stat.st_size,
         )
diff --git a/src/adapters/outgoing/extractors/excel_extractor.py b/src/adapters/outgoing/extractors/excel_extractor.py
index 908ccdd..a1c824b 100644
--- a/src/adapters/outgoing/extractors/excel_extractor.py
+++ b/src/adapters/outgoing/extractors/excel_extractor.py
@@ -69,7 +69,11 @@ class ExcelExtractor(IExtractor):
         metadata = self._create_metadata(file_path)
 
         # Build document with raw_markdown
-        document = Document(raw_markdown=markdown_text, metadata=metadata)
+        document = Document(
+            raw_markdown=markdown_text,
+            title=file_path.stem,
+            metadata=metadata
+        )
 
         logger.info(
             f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
@@ -149,6 +153,5 @@ class ExcelExtractor(IExtractor):
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.stem,
             size_bytes=stat.st_size,
         )
diff --git a/src/adapters/outgoing/extractors/markdown_extractor.py b/src/adapters/outgoing/extractors/markdown_extractor.py
index 52d3192..a8d52f7 100644
--- a/src/adapters/outgoing/extractors/markdown_extractor.py
+++ b/src/adapters/outgoing/extractors/markdown_extractor.py
@@ -65,7 +65,11 @@ class MarkdownExtractor(IExtractor):
         metadata = self._create_metadata(file_path)
 
         # Build document with raw_markdown
-        document = Document(raw_markdown=markdown_text, metadata=metadata)
+        document = Document(
+            raw_markdown=markdown_text,
+            title=file_path.stem,
+            metadata=metadata
+        )
 
         logger.info(
             f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
@@ -181,6 +185,5 @@ class MarkdownExtractor(IExtractor):
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.stem,
             size_bytes=stat.st_size,
         )
diff --git a/src/adapters/outgoing/extractors/pdf_extractor.py b/src/adapters/outgoing/extractors/pdf_extractor.py
index 17f9f3b..d660755 100644
--- a/src/adapters/outgoing/extractors/pdf_extractor.py
+++ b/src/adapters/outgoing/extractors/pdf_extractor.py
@@ -69,7 +69,11 @@ class PDFExtractor(IExtractor):
         metadata = self._create_metadata(file_path, result)
 
         # Build document with raw_markdown
-        document = Document(raw_markdown=markdown_text, metadata=metadata)
+        document = Document(
+            raw_markdown=markdown_text,
+            title=file_path.stem,
+            metadata=metadata
+        )
 
         logger.info(
             f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
@@ -162,7 +166,6 @@ class PDFExtractor(IExtractor):
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.stem,
             size_bytes=stat.st_size,
             extra_metadata=extra_metadata,
         )
diff --git a/src/adapters/outgoing/extractors/txt_extractor.py b/src/adapters/outgoing/extractors/txt_extractor.py
index 0a70d0e..0a5dd81 100644
--- a/src/adapters/outgoing/extractors/txt_extractor.py
+++ b/src/adapters/outgoing/extractors/txt_extractor.py
@@ -66,7 +66,11 @@ class TxtExtractor(IExtractor):
         metadata = self._create_metadata(file_path)
 
         # Build document with raw_markdown
-        document = Document(raw_markdown=text, metadata=metadata)
+        document = Document(
+            raw_markdown=text,
+            title=file_path.stem,
+            metadata=metadata
+        )
 
         logger.info(
             f"Successfully extracted {len(text)} characters from {file_path.name}"
@@ -200,6 +204,5 @@ class TxtExtractor(IExtractor):
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.stem,
             size_bytes=stat.st_size,
         )
diff --git a/src/adapters/outgoing/extractors/zip_extractor.py b/src/adapters/outgoing/extractors/zip_extractor.py
index 86ad29e..6795969 100644
--- a/src/adapters/outgoing/extractors/zip_extractor.py
+++ b/src/adapters/outgoing/extractors/zip_extractor.py
@@ -69,7 +69,11 @@ class ZipExtractor(IExtractor):
         metadata = self._create_metadata(file_path)
 
         # Build document with raw_markdown
-        document = Document(raw_markdown=merged_text, metadata=metadata)
+        document = Document(
+            raw_markdown=merged_text,
+            title=file_path.stem,
+            metadata=metadata
+        )
 
         logger.info(
             f"Successfully extracted {len(merged_text)} characters from {file_path.name}"
@@ -312,6 +316,5 @@ class ZipExtractor(IExtractor):
         return DocumentMetadata(
             source_id=str(file_path.absolute()),
             source_type=SourceType.FILE,
-            display_name=file_path.stem,
             size_bytes=stat.st_size,
         )
diff --git a/src/core/domain/models.py b/src/core/domain/models.py
index 4c37edf..4619d55 100644
--- a/src/core/domain/models.py
+++ b/src/core/domain/models.py
@@ -161,7 +161,6 @@ class DocumentMetadata(BaseModel):
     Attributes:
         source_id: Path or URL identifying the source
         source_type: Type of source (FILE or WEB)
-        display_name: Human-readable name (e.g., 'manual.pdf', 'about_us.html')
         size_bytes: Size in bytes (file size or content length)
         created_at: Timestamp when metadata was created
         author: Optional author information
@@ -169,7 +168,6 @@ class DocumentMetadata(BaseModel):
     """
     source_id: str = Field(..., min_length=1, description="Path or URL")
     source_type: SourceType = Field(..., description="Source type enum")
-    display_name: str = Field(..., min_length=1, description="Display name")
     size_bytes: int = Field(..., ge=0, description="Size in bytes")
     created_at: datetime = Field(default_factory=datetime.utcnow)
     author: Optional[str] = Field(None, description="Author information")
@@ -178,30 +176,6 @@ class DocumentMetadata(BaseModel):
         description="Additional metadata"
     )
 
-    @field_validator('display_name')
-    @classmethod
-    def normalize_display_name(cls, value: str) -> str:
-        """Normalize display name."""
-        return value.strip()
-
-    def get_summary(self) -> str:
-        """
-        Generate a human-readable summary of metadata.
-
-        Returns:
-            Formatted string containing key metadata information
-        """
-        summary_parts = [
-            f"Source: {self.display_name}",
-            f"Type: {self.source_type.value}",
-            f"Size: {self._format_size()}",
-        ]
-
-        if self.author:
-            summary_parts.append(f"Author: {self.author}")
-
-        return " | ".join(summary_parts)
-
     def _format_size(self) -> str:
         """Format size in human-readable format."""
         size = self.size_bytes
@@ -238,6 +212,7 @@ class Document(BaseModel):
     """
     id: UUID = Field(default_factory=uuid4, description="Unique document ID")
     raw_markdown: str = Field(..., description="Raw Markdown content")
+    title: str = Field(..., description="Document title")
     sections: List[DocumentSection] = Field(
         default_factory=list, description="Structured document sections"
     )
@@ -296,15 +271,6 @@ class Document(BaseModel):
 
         return True
 
-    def get_metadata_summary(self) -> str:
-        """
-        Get a summary of the document's metadata.
-
-        Returns:
-            Human-readable metadata summary
-        """
-        return self.metadata.get_summary()
-
     def mark_as_processed(self) -> None:
         """Mark the document as processed."""
         self.is_processed = True
diff --git a/src/core/services/document_processor_service.py b/src/core/services/document_processor_service.py
index 36c2610..1d77f65 100644
--- a/src/core/services/document_processor_service.py
+++ b/src/core/services/document_processor_service.py
@@ -221,13 +221,13 @@ class DocumentProcessorService(ITextProcessor):
         metadata = DocumentMetadata(
             source_id="text_input",
             source_type=SourceType.TEXT,
-            display_name=f"{title}.md",
             size_bytes=len(text.encode('utf-8')),
         )
 
         # Step 3: Create Document entity
         document = Document(
             raw_markdown=text,
+            title=title,
             sections=sections,
             metadata=metadata,
         )
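Reviewer note (not part of the patch): a minimal usage sketch of the new contract after this change. Document.title is now a required field and DocumentMetadata no longer accepts display_name. The import path, file name, and literal values below are assumptions for illustration only.

    from src.core.domain.models import Document, DocumentMetadata, SourceType

    # Metadata no longer carries a display_name; the human-readable name now lives on Document.title.
    metadata = DocumentMetadata(
        source_id="/data/manual.pdf",   # hypothetical source path
        source_type=SourceType.FILE,
        size_bytes=1024,
    )

    # Extractors pass file_path.stem as the title; callers constructing documents by hand must do the same.
    document = Document(
        raw_markdown="# Manual\n\nSome content.",
        title="manual",
        metadata=metadata,
    )

    assert document.title == "manual"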