-
Notifications
You must be signed in to change notification settings - Fork 0
fix(infra): fix some dependency hells and add some lazy loading to reduce celery worker RAM usage #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: eval-pr-5478-target-1758731865359
Are you sure you want to change the base?
fix(infra): fix some dependency hells and add some lazy loading to reduce celery worker RAM usage #1
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,4 @@ | ||
| from collections.abc import Sequence | ||
| from datetime import datetime | ||
| from typing import Any | ||
|
|
||
|
|
@@ -355,6 +356,97 @@ class SearchDoc(BaseModel): | |
| secondary_owners: list[str] | None = None | ||
| is_internet: bool = False | ||
|
|
||
| @classmethod | ||
| def chunks_or_sections_to_search_docs( | ||
| cls, | ||
| items: "Sequence[InferenceChunk | InferenceSection] | None", | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Quoted type annotation reduces clarity and type-tool effectiveness; remove the quotes for consistency with the rest of the module. Reasoning: Prompt for AI agents [internal] Confidence score: 8/10 [internal] Posted by: General AI Review Agent |
||
| ) -> list["SearchDoc"]: | ||
| """Convert a sequence of InferenceChunk or InferenceSection objects to SearchDoc objects.""" | ||
| if not items: | ||
| return [] | ||
|
|
||
| search_docs = [ | ||
| cls( | ||
| document_id=( | ||
| chunk := ( | ||
| item.center_chunk | ||
| if isinstance(item, InferenceSection) | ||
| else item | ||
| ) | ||
| ).document_id, | ||
| chunk_ind=chunk.chunk_id, | ||
| semantic_identifier=chunk.semantic_identifier or "Unknown", | ||
| link=chunk.source_links[0] if chunk.source_links else None, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Accessing source_links with [0] can raise KeyError; use .get(0) to safely retrieve the zero-index entry when present. Prompt for AI agents [internal] Confidence score: 9/10 [internal] Posted by: General AI Review Agent |
||
| blurb=chunk.blurb, | ||
| source_type=chunk.source_type, | ||
| boost=chunk.boost, | ||
| hidden=chunk.hidden, | ||
| metadata=chunk.metadata, | ||
| score=chunk.score, | ||
| match_highlights=chunk.match_highlights, | ||
| updated_at=chunk.updated_at, | ||
| primary_owners=chunk.primary_owners, | ||
| secondary_owners=chunk.secondary_owners, | ||
| is_internet=False, | ||
| ) | ||
| for item in items | ||
| ] | ||
|
|
||
| return search_docs | ||
|
|
||
| @classmethod | ||
| def from_inference_section( | ||
| cls, inference_section: "InferenceSection" | ||
| ) -> "SearchDoc": | ||
| """Convert an InferenceSection to a SearchDoc using the center chunk's data.""" | ||
| chunk = inference_section.center_chunk | ||
| return cls( | ||
| document_id=chunk.document_id, | ||
| chunk_ind=chunk.chunk_id, | ||
| semantic_identifier=chunk.semantic_identifier or "Unknown", | ||
| link=chunk.source_links[0] if chunk.source_links else None, | ||
| blurb=chunk.blurb, | ||
| source_type=chunk.source_type, | ||
| boost=chunk.boost, | ||
| hidden=chunk.hidden, | ||
| metadata=chunk.metadata, | ||
| score=chunk.score, | ||
| is_relevant=chunk.is_relevant, | ||
| relevance_explanation=chunk.relevance_explanation, | ||
| match_highlights=chunk.match_highlights, | ||
| updated_at=chunk.updated_at, | ||
| primary_owners=chunk.primary_owners, | ||
| secondary_owners=chunk.secondary_owners, | ||
| is_internet=False, | ||
| ) | ||
|
|
||
| @classmethod | ||
| def from_inference_chunk(cls, inference_chunk: "InferenceChunk") -> "SearchDoc": | ||
| """Convert an InferenceChunk to a SearchDoc.""" | ||
| return cls( | ||
| document_id=inference_chunk.document_id, | ||
| chunk_ind=inference_chunk.chunk_id, | ||
| semantic_identifier=inference_chunk.semantic_identifier or "Unknown", | ||
| link=( | ||
| inference_chunk.source_links[0] | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Accessing source_links with [0] can raise KeyError; replace with .get(0) to avoid exceptions when key 0 is missing. Prompt for AI agents [internal] Confidence score: 9/10 [internal] Posted by: General AI Review Agent |
||
| if inference_chunk.source_links | ||
| else None | ||
| ), | ||
| blurb=inference_chunk.blurb, | ||
| source_type=inference_chunk.source_type, | ||
| boost=inference_chunk.boost, | ||
| hidden=inference_chunk.hidden, | ||
| metadata=inference_chunk.metadata, | ||
| score=inference_chunk.score, | ||
| is_relevant=inference_chunk.is_relevant, | ||
| relevance_explanation=inference_chunk.relevance_explanation, | ||
| match_highlights=inference_chunk.match_highlights, | ||
| updated_at=inference_chunk.updated_at, | ||
| primary_owners=inference_chunk.primary_owners, | ||
| secondary_owners=inference_chunk.secondary_owners, | ||
| is_internet=False, | ||
| ) | ||
|
|
||
| def model_dump(self, *args: list, **kwargs: dict[str, Any]) -> dict[str, Any]: # type: ignore | ||
| initial_dict = super().model_dump(*args, **kwargs) # type: ignore | ||
| initial_dict["updated_at"] = ( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,7 +14,6 @@ | |
| from onyx.context.search.preprocessing.access_filters import ( | ||
| build_access_filters_for_user, | ||
| ) | ||
| from onyx.context.search.utils import chunks_or_sections_to_search_docs | ||
| from onyx.db.chat import get_chat_messages_by_session | ||
| from onyx.db.chat import get_chat_session_by_id | ||
| from onyx.db.chat import get_chat_sessions_by_user | ||
|
|
@@ -74,7 +73,7 @@ def admin_search( | |
| ) | ||
| matching_chunks = document_index.admin_retrieval(query=query, filters=final_filters) | ||
|
|
||
| documents = chunks_or_sections_to_search_docs(matching_chunks) | ||
| documents = SearchDoc.chunks_or_sections_to_search_docs(matching_chunks) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This PR's goal of reducing memory via lazy-loading is undermined by [comment truncated in extraction]. Prompt for AI agents [internal] Confidence score: 10/10 [internal] Posted by: System Design Agent |
||
|
|
||
| # Deduplicate documents by id | ||
| deduplicated_documents: list[SearchDoc] = [] | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The core logic within
`chunks_or_sections_to_search_docs` for converting `InferenceChunk` or `InferenceSection.center_chunk` to `SearchDoc` is a direct duplication of `SearchDoc.from_inference_chunk()`. This method should be refactored to reuse `from_inference_chunk` and `from_inference_section`. Prompt for AI agents
[internal] Confidence score: 9.5/10
[internal] Posted by: Duplicate Detection Agent