Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@

from onyx.agents.agent_search.dr.sub_agents.states import SubAgentMainState
from onyx.agents.agent_search.dr.sub_agents.states import SubAgentUpdate
from onyx.agents.agent_search.dr.utils import chunks_or_sections_to_search_docs
from onyx.agents.agent_search.shared_graph_utils.utils import (
get_langgraph_node_log_string,
)
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
from onyx.context.search.models import SavedSearchDoc
from onyx.context.search.models import SearchDoc
from onyx.server.query_and_chat.streaming_models import SectionEnd
from onyx.utils.logger import setup_logger

Expand Down Expand Up @@ -47,7 +47,7 @@ def is_reducer(
doc_list.append(x)

# Convert InferenceSections to SavedSearchDocs
search_docs = chunks_or_sections_to_search_docs(doc_list)
search_docs = SearchDoc.chunks_or_sections_to_search_docs(doc_list)
retrieved_saved_search_docs = [
SavedSearchDoc.from_search_doc(search_doc, db_doc_id=0)
for search_doc in search_docs
Expand Down
4 changes: 2 additions & 2 deletions backend/onyx/agents/agent_search/dr/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
)
from onyx.context.search.models import InferenceSection
from onyx.context.search.models import SavedSearchDoc
from onyx.context.search.utils import chunks_or_sections_to_search_docs
from onyx.context.search.models import SearchDoc
from onyx.tools.tool_implementations.web_search.web_search_tool import (
WebSearchTool,
)
Expand Down Expand Up @@ -266,7 +266,7 @@ def convert_inference_sections_to_search_docs(
is_internet: bool = False,
) -> list[SavedSearchDoc]:
# Convert InferenceSections to SavedSearchDocs
search_docs = chunks_or_sections_to_search_docs(inference_sections)
search_docs = SearchDoc.chunks_or_sections_to_search_docs(inference_sections)
for search_doc in search_docs:
search_doc.is_internet = is_internet

Expand Down
92 changes: 92 additions & 0 deletions backend/onyx/context/search/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections.abc import Sequence
from datetime import datetime
from typing import Any

Expand Down Expand Up @@ -355,6 +356,97 @@ class SearchDoc(BaseModel):
secondary_owners: list[str] | None = None
is_internet: bool = False

@classmethod
def chunks_or_sections_to_search_docs(
    cls,
    items: "Sequence[InferenceChunk | InferenceSection] | None",
) -> list["SearchDoc"]:
    """Convert InferenceChunk / InferenceSection objects to SearchDoc objects.

    Delegates to ``from_inference_chunk`` and ``from_inference_section`` so
    the chunk-to-SearchDoc field mapping lives in exactly one place
    (previously this method duplicated that mapping inline).

    NOTE(review): delegation also populates ``is_relevant`` and
    ``relevance_explanation`` from the chunk, which the old inline mapping
    left at their defaults — confirm downstream consumers tolerate this.

    Args:
        items: Chunks and/or sections to convert. ``None`` is treated the
            same as an empty sequence.

    Returns:
        One SearchDoc per input item, in input order; ``[]`` for empty input.
    """
    if not items:
        return []

    return [
        cls.from_inference_section(item)
        if isinstance(item, InferenceSection)
        else cls.from_inference_chunk(item)
        for item in items
    ]

@classmethod
def from_inference_section(
    cls, inference_section: "InferenceSection"
) -> "SearchDoc":
    """Convert an InferenceSection to a SearchDoc.

    A section is represented by its center chunk, and the original body here
    was field-for-field identical to ``from_inference_chunk`` applied to that
    chunk — so delegate to it to keep the mapping in one place.

    Args:
        inference_section: The section whose center chunk supplies the data.

    Returns:
        A SearchDoc built from ``inference_section.center_chunk``.
    """
    return cls.from_inference_chunk(inference_section.center_chunk)

@classmethod
def from_inference_chunk(cls, inference_chunk: "InferenceChunk") -> "SearchDoc":
    """Convert an InferenceChunk to a SearchDoc.

    Args:
        inference_chunk: The retrieved chunk to convert.

    Returns:
        A SearchDoc populated from the chunk's fields. ``is_internet`` is
        always False here; callers flag internet results themselves.
    """
    # source_links appears to be a mapping keyed by offset (indexing it can
    # raise KeyError per review), so a non-empty mapping may still lack key
    # 0 — use .get(0) instead of [0]. TODO confirm mapping semantics.
    source_links = inference_chunk.source_links
    return cls(
        document_id=inference_chunk.document_id,
        chunk_ind=inference_chunk.chunk_id,
        semantic_identifier=inference_chunk.semantic_identifier or "Unknown",
        link=source_links.get(0) if source_links else None,
        blurb=inference_chunk.blurb,
        source_type=inference_chunk.source_type,
        boost=inference_chunk.boost,
        hidden=inference_chunk.hidden,
        metadata=inference_chunk.metadata,
        score=inference_chunk.score,
        is_relevant=inference_chunk.is_relevant,
        relevance_explanation=inference_chunk.relevance_explanation,
        match_highlights=inference_chunk.match_highlights,
        updated_at=inference_chunk.updated_at,
        primary_owners=inference_chunk.primary_owners,
        secondary_owners=inference_chunk.secondary_owners,
        is_internet=False,
    )

def model_dump(self, *args: list, **kwargs: dict[str, Any]) -> dict[str, Any]: # type: ignore
initial_dict = super().model_dump(*args, **kwargs) # type: ignore
initial_dict["updated_at"] = (
Expand Down
34 changes: 0 additions & 34 deletions backend/onyx/context/search/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,40 +118,6 @@ def inference_section_from_chunks(
)


def chunks_or_sections_to_search_docs(
    items: Sequence[InferenceChunk | InferenceSection] | None,
) -> list[SearchDoc]:
    """Convert retrieved chunks and/or sections into SearchDoc objects.

    A section contributes only its center chunk. ``None`` and an empty
    sequence both yield an empty list.
    """
    if not items:
        return []

    search_docs: list[SearchDoc] = []
    for item in items:
        # A section is represented by its center chunk; a chunk is used as-is.
        if isinstance(item, InferenceSection):
            chunk = item.center_chunk
        else:
            chunk = item
        search_docs.append(
            SearchDoc(
                document_id=chunk.document_id,
                chunk_ind=chunk.chunk_id,
                semantic_identifier=chunk.semantic_identifier or "Unknown",
                link=chunk.source_links[0] if chunk.source_links else None,
                blurb=chunk.blurb,
                source_type=chunk.source_type,
                boost=chunk.boost,
                hidden=chunk.hidden,
                metadata=chunk.metadata,
                score=chunk.score,
                match_highlights=chunk.match_highlights,
                updated_at=chunk.updated_at,
                primary_owners=chunk.primary_owners,
                secondary_owners=chunk.secondary_owners,
                is_internet=False,
            )
        )
    return search_docs


def remove_stop_words_and_punctuation(keywords: list[str]) -> list[str]:
try:
# Re-tokenize using the NLTK tokenizer for better matching
Expand Down
3 changes: 1 addition & 2 deletions backend/onyx/db/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
from onyx.context.search.models import RetrievalDocs
from onyx.context.search.models import SavedSearchDoc
from onyx.context.search.models import SearchDoc as ServerSearchDoc
from onyx.context.search.utils import chunks_or_sections_to_search_docs
from onyx.db.models import AgentSearchMetrics
from onyx.db.models import AgentSubQuery
from onyx.db.models import AgentSubQuestion
Expand Down Expand Up @@ -1147,7 +1146,7 @@ def log_agent_sub_question_results(
db_session.add(sub_query_object)
db_session.commit()

search_docs = chunks_or_sections_to_search_docs(
search_docs = ServerSearchDoc.chunks_or_sections_to_search_docs(
sub_query.retrieved_documents
)
for doc in search_docs:
Expand Down
3 changes: 1 addition & 2 deletions backend/onyx/server/query_and_chat/query_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from onyx.context.search.preprocessing.access_filters import (
build_access_filters_for_user,
)
from onyx.context.search.utils import chunks_or_sections_to_search_docs
from onyx.db.chat import get_chat_messages_by_session
from onyx.db.chat import get_chat_session_by_id
from onyx.db.chat import get_chat_sessions_by_user
Expand Down Expand Up @@ -74,7 +73,7 @@ def admin_search(
)
matching_chunks = document_index.admin_retrieval(query=query, filters=final_filters)

documents = chunks_or_sections_to_search_docs(matching_chunks)
documents = SearchDoc.chunks_or_sections_to_search_docs(matching_chunks)
Copy link

@cubic-dev-ai cubic-dev-ai bot Sep 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR's goal of reducing memory via lazy-loading is undermined by onyx/llm/utils.py, which still eagerly imports heavy libraries like litellm and tiktoken at the top level. This file's import chain causes these libraries to be loaded on startup, negating the memory savings from other lazy-loading efforts.

Prompt for AI agents
Address the following comment on backend/onyx/server/query_and_chat/query_backend.py at line 76:

<comment>This PR&#39;s goal of reducing memory via lazy-loading is undermined by `onyx/llm/utils.py`, which still eagerly imports heavy libraries like `litellm` and `tiktoken` at the top level. This file&#39;s import chain causes these libraries to be loaded on startup, negating the memory savings from other lazy-loading efforts.</comment>

<file context>
@@ -74,7 +73,7 @@ def admin_search(
     matching_chunks = document_index.admin_retrieval(query=query, filters=final_filters)
 
-    documents = chunks_or_sections_to_search_docs(matching_chunks)
+    documents = SearchDoc.chunks_or_sections_to_search_docs(matching_chunks)
 
     # Deduplicate documents by id
</file context>

[internal] Confidence score: 10/10

[internal] Posted by: System Design Agent

Fix with Cubic


# Deduplicate documents by id
deduplicated_documents: list[SearchDoc] = []
Expand Down