Compare commits
2 Commits
2753b913fb
...
b53f8c47d3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b53f8c47d3 | ||
|
|
6259220629 |
@ -52,8 +52,8 @@ app = FastAPI(
|
|||||||
title="Text Processor API",
|
title="Text Processor API",
|
||||||
description="Text extraction and chunking system using Hexagonal Architecture",
|
description="Text extraction and chunking system using Hexagonal Architecture",
|
||||||
version="1.0.0",
|
version="1.0.0",
|
||||||
docs_url=None,
|
# docs_url=None,
|
||||||
redoc_url=None,
|
# redoc_url=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
router = APIRouter(
|
router = APIRouter(
|
||||||
@ -188,15 +188,11 @@ def to_document_response(document: Document) -> DocumentResponse:
|
|||||||
"""Convert domain document to API response."""
|
"""Convert domain document to API response."""
|
||||||
from .api_schemas import DocumentMetadataResponse
|
from .api_schemas import DocumentMetadataResponse
|
||||||
|
|
||||||
display_name = document.metadata.display_name
|
|
||||||
file_type = Path(display_name).suffix.lstrip('.') if '.' in display_name else 'unknown'
|
|
||||||
|
|
||||||
return DocumentResponse(
|
return DocumentResponse(
|
||||||
id=str(document.id),
|
id=str(document.id),
|
||||||
content=document.content,
|
content=document.content,
|
||||||
|
title=document.title,
|
||||||
metadata=DocumentMetadataResponse(
|
metadata=DocumentMetadataResponse(
|
||||||
file_name=document.metadata.display_name,
|
|
||||||
file_type=file_type,
|
|
||||||
file_size_bytes=document.metadata.size_bytes,
|
file_size_bytes=document.metadata.size_bytes,
|
||||||
created_at=document.metadata.created_at.isoformat(),
|
created_at=document.metadata.created_at.isoformat(),
|
||||||
author=document.metadata.author,
|
author=document.metadata.author,
|
||||||
@ -364,20 +360,20 @@ async def health_check() -> HealthCheckResponse:
|
|||||||
# Protected Documentation Routes
|
# Protected Documentation Routes
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
@app.get("/docs", include_in_schema=False)
|
# @app.get("/docs", include_in_schema=False)
|
||||||
def api_docs(_: HTTPBasicCredentials = Depends(check_docs_credentials)):
|
# def api_docs(_: HTTPBasicCredentials = Depends(check_docs_credentials)):
|
||||||
return get_swagger_ui_html(
|
# return get_swagger_ui_html(
|
||||||
openapi_url="/openapi.json",
|
# openapi_url="/openapi.json",
|
||||||
title="Protected Text-Processor API Docs"
|
# title="Protected Text-Processor API Docs"
|
||||||
)
|
# )
|
||||||
|
#
|
||||||
|
#
|
||||||
@app.get("/redoc", include_in_schema=False)
|
# @app.get("/redoc", include_in_schema=False)
|
||||||
def api_docs(_: HTTPBasicCredentials = Depends(check_docs_credentials)):
|
# def api_docs(_: HTTPBasicCredentials = Depends(check_docs_credentials)):
|
||||||
return get_redoc_html(
|
# return get_redoc_html(
|
||||||
openapi_url="/openapi.json",
|
# openapi_url="/openapi.json",
|
||||||
title="Protected Text-Processor API Docs"
|
# title="Protected Text-Processor API Docs"
|
||||||
)
|
# )
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Application Setup
|
# Application Setup
|
||||||
|
|||||||
@ -69,8 +69,6 @@ class ExtractAndChunkRequest(BaseModel):
|
|||||||
class DocumentMetadataResponse(BaseModel):
|
class DocumentMetadataResponse(BaseModel):
|
||||||
"""Response model for document metadata."""
|
"""Response model for document metadata."""
|
||||||
|
|
||||||
file_name: str
|
|
||||||
file_type: str
|
|
||||||
file_size_bytes: int
|
file_size_bytes: int
|
||||||
created_at: str
|
created_at: str
|
||||||
author: Optional[str] = None
|
author: Optional[str] = None
|
||||||
@ -82,6 +80,7 @@ class DocumentResponse(BaseModel):
|
|||||||
|
|
||||||
id: str
|
id: str
|
||||||
content: str
|
content: str
|
||||||
|
title: str
|
||||||
metadata: DocumentMetadataResponse
|
metadata: DocumentMetadataResponse
|
||||||
is_processed: bool
|
is_processed: bool
|
||||||
content_preview: str = Field(
|
content_preview: str = Field(
|
||||||
@ -104,13 +103,6 @@ class ChunkResponse(BaseModel):
|
|||||||
length: int
|
length: int
|
||||||
|
|
||||||
|
|
||||||
class ProcessDocumentResponse(BaseModel):
|
|
||||||
"""Response model for document processing."""
|
|
||||||
|
|
||||||
document: DocumentResponse
|
|
||||||
message: str = Field(default="Document processed successfully")
|
|
||||||
|
|
||||||
|
|
||||||
class ChunkListResponse(BaseModel):
|
class ChunkListResponse(BaseModel):
|
||||||
"""Response model for extract and chunk operation."""
|
"""Response model for extract and chunk operation."""
|
||||||
|
|
||||||
@ -119,31 +111,6 @@ class ChunkListResponse(BaseModel):
|
|||||||
message: str = Field(default="Document chunked successfully")
|
message: str = Field(default="Document chunked successfully")
|
||||||
|
|
||||||
|
|
||||||
class DocumentListResponse(BaseModel):
|
|
||||||
"""Response model for document list."""
|
|
||||||
|
|
||||||
documents: List[DocumentResponse]
|
|
||||||
total: int
|
|
||||||
limit: int
|
|
||||||
offset: int
|
|
||||||
|
|
||||||
|
|
||||||
class ErrorResponse(BaseModel):
|
|
||||||
"""Response model for errors."""
|
|
||||||
|
|
||||||
error: str
|
|
||||||
details: Optional[str] = None
|
|
||||||
error_type: str
|
|
||||||
|
|
||||||
|
|
||||||
class DeleteDocumentResponse(BaseModel):
|
|
||||||
"""Response model for document deletion."""
|
|
||||||
|
|
||||||
success: bool
|
|
||||||
message: str
|
|
||||||
document_id: str
|
|
||||||
|
|
||||||
|
|
||||||
class HealthCheckResponse(BaseModel):
|
class HealthCheckResponse(BaseModel):
|
||||||
"""Response model for health check."""
|
"""Response model for health check."""
|
||||||
|
|
||||||
|
|||||||
@ -300,7 +300,7 @@ class ParagraphChunker(IChunker):
|
|||||||
global_sequence = 0
|
global_sequence = 0
|
||||||
|
|
||||||
# Get document title from metadata
|
# Get document title from metadata
|
||||||
document_title = document.metadata.display_name
|
document_title = document.title
|
||||||
|
|
||||||
for section_index, section in enumerate(document.sections):
|
for section_index, section in enumerate(document.sections):
|
||||||
# Split this section's content into paragraph-based chunks
|
# Split this section's content into paragraph-based chunks
|
||||||
|
|||||||
@ -69,7 +69,11 @@ class DocxExtractor(IExtractor):
|
|||||||
metadata = self._create_metadata(file_path)
|
metadata = self._create_metadata(file_path)
|
||||||
|
|
||||||
# Build document with raw_markdown
|
# Build document with raw_markdown
|
||||||
document = Document(raw_markdown=markdown_text, metadata=metadata)
|
document = Document(
|
||||||
|
raw_markdown=markdown_text,
|
||||||
|
title=file_path.stem,
|
||||||
|
metadata=metadata
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
|
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
|
||||||
@ -149,6 +153,5 @@ class DocxExtractor(IExtractor):
|
|||||||
return DocumentMetadata(
|
return DocumentMetadata(
|
||||||
source_id=str(file_path.absolute()),
|
source_id=str(file_path.absolute()),
|
||||||
source_type=SourceType.FILE,
|
source_type=SourceType.FILE,
|
||||||
display_name=file_path.stem,
|
|
||||||
size_bytes=stat.st_size,
|
size_bytes=stat.st_size,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -69,7 +69,11 @@ class ExcelExtractor(IExtractor):
|
|||||||
metadata = self._create_metadata(file_path)
|
metadata = self._create_metadata(file_path)
|
||||||
|
|
||||||
# Build document with raw_markdown
|
# Build document with raw_markdown
|
||||||
document = Document(raw_markdown=markdown_text, metadata=metadata)
|
document = Document(
|
||||||
|
raw_markdown=markdown_text,
|
||||||
|
title=file_path.stem,
|
||||||
|
metadata=metadata
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
|
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
|
||||||
@ -149,6 +153,5 @@ class ExcelExtractor(IExtractor):
|
|||||||
return DocumentMetadata(
|
return DocumentMetadata(
|
||||||
source_id=str(file_path.absolute()),
|
source_id=str(file_path.absolute()),
|
||||||
source_type=SourceType.FILE,
|
source_type=SourceType.FILE,
|
||||||
display_name=file_path.stem,
|
|
||||||
size_bytes=stat.st_size,
|
size_bytes=stat.st_size,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -65,7 +65,11 @@ class MarkdownExtractor(IExtractor):
|
|||||||
metadata = self._create_metadata(file_path)
|
metadata = self._create_metadata(file_path)
|
||||||
|
|
||||||
# Build document with raw_markdown
|
# Build document with raw_markdown
|
||||||
document = Document(raw_markdown=markdown_text, metadata=metadata)
|
document = Document(
|
||||||
|
raw_markdown=markdown_text,
|
||||||
|
title=file_path.stem,
|
||||||
|
metadata=metadata
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
|
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
|
||||||
@ -181,6 +185,5 @@ class MarkdownExtractor(IExtractor):
|
|||||||
return DocumentMetadata(
|
return DocumentMetadata(
|
||||||
source_id=str(file_path.absolute()),
|
source_id=str(file_path.absolute()),
|
||||||
source_type=SourceType.FILE,
|
source_type=SourceType.FILE,
|
||||||
display_name=file_path.stem,
|
|
||||||
size_bytes=stat.st_size,
|
size_bytes=stat.st_size,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -69,7 +69,11 @@ class PDFExtractor(IExtractor):
|
|||||||
metadata = self._create_metadata(file_path, result)
|
metadata = self._create_metadata(file_path, result)
|
||||||
|
|
||||||
# Build document with raw_markdown
|
# Build document with raw_markdown
|
||||||
document = Document(raw_markdown=markdown_text, metadata=metadata)
|
document = Document(
|
||||||
|
raw_markdown=markdown_text,
|
||||||
|
title=file_path.stem,
|
||||||
|
metadata=metadata
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
|
f"Successfully extracted {len(markdown_text)} characters from {file_path.name}"
|
||||||
@ -162,7 +166,6 @@ class PDFExtractor(IExtractor):
|
|||||||
return DocumentMetadata(
|
return DocumentMetadata(
|
||||||
source_id=str(file_path.absolute()),
|
source_id=str(file_path.absolute()),
|
||||||
source_type=SourceType.FILE,
|
source_type=SourceType.FILE,
|
||||||
display_name=file_path.stem,
|
|
||||||
size_bytes=stat.st_size,
|
size_bytes=stat.st_size,
|
||||||
extra_metadata=extra_metadata,
|
extra_metadata=extra_metadata,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -66,7 +66,11 @@ class TxtExtractor(IExtractor):
|
|||||||
metadata = self._create_metadata(file_path)
|
metadata = self._create_metadata(file_path)
|
||||||
|
|
||||||
# Build document with raw_markdown
|
# Build document with raw_markdown
|
||||||
document = Document(raw_markdown=text, metadata=metadata)
|
document = Document(
|
||||||
|
raw_markdown=text,
|
||||||
|
title=file_path.stem,
|
||||||
|
metadata=metadata
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Successfully extracted {len(text)} characters from {file_path.name}"
|
f"Successfully extracted {len(text)} characters from {file_path.name}"
|
||||||
@ -200,6 +204,5 @@ class TxtExtractor(IExtractor):
|
|||||||
return DocumentMetadata(
|
return DocumentMetadata(
|
||||||
source_id=str(file_path.absolute()),
|
source_id=str(file_path.absolute()),
|
||||||
source_type=SourceType.FILE,
|
source_type=SourceType.FILE,
|
||||||
display_name=file_path.stem,
|
|
||||||
size_bytes=stat.st_size,
|
size_bytes=stat.st_size,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -69,7 +69,11 @@ class ZipExtractor(IExtractor):
|
|||||||
metadata = self._create_metadata(file_path)
|
metadata = self._create_metadata(file_path)
|
||||||
|
|
||||||
# Build document with raw_markdown
|
# Build document with raw_markdown
|
||||||
document = Document(raw_markdown=merged_text, metadata=metadata)
|
document = Document(
|
||||||
|
raw_markdown=merged_text,
|
||||||
|
title=file_path.stem,
|
||||||
|
metadata=metadata
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Successfully extracted {len(merged_text)} characters from {file_path.name}"
|
f"Successfully extracted {len(merged_text)} characters from {file_path.name}"
|
||||||
@ -312,6 +316,5 @@ class ZipExtractor(IExtractor):
|
|||||||
return DocumentMetadata(
|
return DocumentMetadata(
|
||||||
source_id=str(file_path.absolute()),
|
source_id=str(file_path.absolute()),
|
||||||
source_type=SourceType.FILE,
|
source_type=SourceType.FILE,
|
||||||
display_name=file_path.stem,
|
|
||||||
size_bytes=stat.st_size,
|
size_bytes=stat.st_size,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -161,7 +161,6 @@ class DocumentMetadata(BaseModel):
|
|||||||
Attributes:
|
Attributes:
|
||||||
source_id: Path or URL identifying the source
|
source_id: Path or URL identifying the source
|
||||||
source_type: Type of source (FILE or WEB)
|
source_type: Type of source (FILE or WEB)
|
||||||
display_name: Human-readable name (e.g., 'manual.pdf', 'about_us.html')
|
|
||||||
size_bytes: Size in bytes (file size or content length)
|
size_bytes: Size in bytes (file size or content length)
|
||||||
created_at: Timestamp when metadata was created
|
created_at: Timestamp when metadata was created
|
||||||
author: Optional author information
|
author: Optional author information
|
||||||
@ -169,7 +168,6 @@ class DocumentMetadata(BaseModel):
|
|||||||
"""
|
"""
|
||||||
source_id: str = Field(..., min_length=1, description="Path or URL")
|
source_id: str = Field(..., min_length=1, description="Path or URL")
|
||||||
source_type: SourceType = Field(..., description="Source type enum")
|
source_type: SourceType = Field(..., description="Source type enum")
|
||||||
display_name: str = Field(..., min_length=1, description="Display name")
|
|
||||||
size_bytes: int = Field(..., ge=0, description="Size in bytes")
|
size_bytes: int = Field(..., ge=0, description="Size in bytes")
|
||||||
created_at: datetime = Field(default_factory=datetime.utcnow)
|
created_at: datetime = Field(default_factory=datetime.utcnow)
|
||||||
author: Optional[str] = Field(None, description="Author information")
|
author: Optional[str] = Field(None, description="Author information")
|
||||||
@ -178,30 +176,6 @@ class DocumentMetadata(BaseModel):
|
|||||||
description="Additional metadata"
|
description="Additional metadata"
|
||||||
)
|
)
|
||||||
|
|
||||||
@field_validator('display_name')
|
|
||||||
@classmethod
|
|
||||||
def normalize_display_name(cls, value: str) -> str:
|
|
||||||
"""Normalize display name."""
|
|
||||||
return value.strip()
|
|
||||||
|
|
||||||
def get_summary(self) -> str:
|
|
||||||
"""
|
|
||||||
Generate a human-readable summary of metadata.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Formatted string containing key metadata information
|
|
||||||
"""
|
|
||||||
summary_parts = [
|
|
||||||
f"Source: {self.display_name}",
|
|
||||||
f"Type: {self.source_type.value}",
|
|
||||||
f"Size: {self._format_size()}",
|
|
||||||
]
|
|
||||||
|
|
||||||
if self.author:
|
|
||||||
summary_parts.append(f"Author: {self.author}")
|
|
||||||
|
|
||||||
return " | ".join(summary_parts)
|
|
||||||
|
|
||||||
def _format_size(self) -> str:
|
def _format_size(self) -> str:
|
||||||
"""Format size in human-readable format."""
|
"""Format size in human-readable format."""
|
||||||
size = self.size_bytes
|
size = self.size_bytes
|
||||||
@ -238,6 +212,7 @@ class Document(BaseModel):
|
|||||||
"""
|
"""
|
||||||
id: UUID = Field(default_factory=uuid4, description="Unique document ID")
|
id: UUID = Field(default_factory=uuid4, description="Unique document ID")
|
||||||
raw_markdown: str = Field(..., description="Raw Markdown content")
|
raw_markdown: str = Field(..., description="Raw Markdown content")
|
||||||
|
title: str = Field(..., description="Document title")
|
||||||
sections: List[DocumentSection] = Field(
|
sections: List[DocumentSection] = Field(
|
||||||
default_factory=list,
|
default_factory=list,
|
||||||
description="Structured document sections"
|
description="Structured document sections"
|
||||||
@ -296,15 +271,6 @@ class Document(BaseModel):
|
|||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def get_metadata_summary(self) -> str:
|
|
||||||
"""
|
|
||||||
Get a summary of the document's metadata.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Human-readable metadata summary
|
|
||||||
"""
|
|
||||||
return self.metadata.get_summary()
|
|
||||||
|
|
||||||
def mark_as_processed(self) -> None:
|
def mark_as_processed(self) -> None:
|
||||||
"""Mark the document as processed."""
|
"""Mark the document as processed."""
|
||||||
self.is_processed = True
|
self.is_processed = True
|
||||||
|
|||||||
@ -221,13 +221,13 @@ class DocumentProcessorService(ITextProcessor):
|
|||||||
metadata = DocumentMetadata(
|
metadata = DocumentMetadata(
|
||||||
source_id="text_input",
|
source_id="text_input",
|
||||||
source_type=SourceType.TEXT,
|
source_type=SourceType.TEXT,
|
||||||
display_name=f"{title}.md",
|
|
||||||
size_bytes=len(text.encode('utf-8')),
|
size_bytes=len(text.encode('utf-8')),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Step 3: Create Document entity
|
# Step 3: Create Document entity
|
||||||
document = Document(
|
document = Document(
|
||||||
raw_markdown=text,
|
raw_markdown=text,
|
||||||
|
title=title,
|
||||||
sections=sections,
|
sections=sections,
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user