diff --git a/backend/onyx/agents/agent_search/dr/sub_agents/basic_search/dr_basic_search_3_reduce.py b/backend/onyx/agents/agent_search/dr/sub_agents/basic_search/dr_basic_search_3_reduce.py index 7b7e04b4172..b87a7120e8c 100644 --- a/backend/onyx/agents/agent_search/dr/sub_agents/basic_search/dr_basic_search_3_reduce.py +++ b/backend/onyx/agents/agent_search/dr/sub_agents/basic_search/dr_basic_search_3_reduce.py @@ -5,12 +5,12 @@ from onyx.agents.agent_search.dr.sub_agents.states import SubAgentMainState from onyx.agents.agent_search.dr.sub_agents.states import SubAgentUpdate -from onyx.agents.agent_search.dr.utils import chunks_or_sections_to_search_docs from onyx.agents.agent_search.shared_graph_utils.utils import ( get_langgraph_node_log_string, ) from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event from onyx.context.search.models import SavedSearchDoc +from onyx.context.search.models import SearchDoc from onyx.server.query_and_chat.streaming_models import SectionEnd from onyx.utils.logger import setup_logger @@ -47,7 +47,7 @@ def is_reducer( doc_list.append(x) # Convert InferenceSections to SavedSearchDocs - search_docs = chunks_or_sections_to_search_docs(doc_list) + search_docs = SearchDoc.chunks_or_sections_to_search_docs(doc_list) retrieved_saved_search_docs = [ SavedSearchDoc.from_search_doc(search_doc, db_doc_id=0) for search_doc in search_docs diff --git a/backend/onyx/agents/agent_search/dr/utils.py b/backend/onyx/agents/agent_search/dr/utils.py index b0e86d9b52d..817b6e19dde 100644 --- a/backend/onyx/agents/agent_search/dr/utils.py +++ b/backend/onyx/agents/agent_search/dr/utils.py @@ -13,7 +13,7 @@ ) from onyx.context.search.models import InferenceSection from onyx.context.search.models import SavedSearchDoc -from onyx.context.search.utils import chunks_or_sections_to_search_docs +from onyx.context.search.models import SearchDoc from onyx.tools.tool_implementations.web_search.web_search_tool import ( WebSearchTool, ) @@ -266,7 +266,7 @@ def convert_inference_sections_to_search_docs( is_internet: bool = False, ) -> list[SavedSearchDoc]: # Convert InferenceSections to SavedSearchDocs - search_docs = chunks_or_sections_to_search_docs(inference_sections) + search_docs = SearchDoc.chunks_or_sections_to_search_docs(inference_sections) for search_doc in search_docs: search_doc.is_internet = is_internet diff --git a/backend/onyx/context/search/models.py b/backend/onyx/context/search/models.py index 14e7c5bcb40..1c31105df50 100644 --- a/backend/onyx/context/search/models.py +++ b/backend/onyx/context/search/models.py @@ -1,3 +1,4 @@ +from collections.abc import Sequence from datetime import datetime from typing import Any @@ -355,6 +356,97 @@ class SearchDoc(BaseModel): secondary_owners: list[str] | None = None is_internet: bool = False + @classmethod + def chunks_or_sections_to_search_docs( + cls, + items: "Sequence[InferenceChunk | InferenceSection] | None", + ) -> list["SearchDoc"]: + """Convert a sequence of InferenceChunk or InferenceSection objects to SearchDoc objects.""" + if not items: + return [] + + search_docs = [ + cls( + document_id=( + chunk := ( + item.center_chunk + if isinstance(item, InferenceSection) + else item + ) + ).document_id, + chunk_ind=chunk.chunk_id, + semantic_identifier=chunk.semantic_identifier or "Unknown", + link=chunk.source_links[0] if chunk.source_links else None, + blurb=chunk.blurb, + source_type=chunk.source_type, + boost=chunk.boost, + hidden=chunk.hidden, + metadata=chunk.metadata, + score=chunk.score, + match_highlights=chunk.match_highlights, + updated_at=chunk.updated_at, + primary_owners=chunk.primary_owners, + secondary_owners=chunk.secondary_owners, + is_internet=False, + ) + for item in items + ] + + return search_docs + + @classmethod + def from_inference_section( + cls, inference_section: "InferenceSection" + ) -> "SearchDoc": + """Convert an InferenceSection to a SearchDoc using the center chunk's data.""" + chunk = inference_section.center_chunk + return cls( + document_id=chunk.document_id, + chunk_ind=chunk.chunk_id, + semantic_identifier=chunk.semantic_identifier or "Unknown", + link=chunk.source_links[0] if chunk.source_links else None, + blurb=chunk.blurb, + source_type=chunk.source_type, + boost=chunk.boost, + hidden=chunk.hidden, + metadata=chunk.metadata, + score=chunk.score, + is_relevant=chunk.is_relevant, + relevance_explanation=chunk.relevance_explanation, + match_highlights=chunk.match_highlights, + updated_at=chunk.updated_at, + primary_owners=chunk.primary_owners, + secondary_owners=chunk.secondary_owners, + is_internet=False, + ) + + @classmethod + def from_inference_chunk(cls, inference_chunk: "InferenceChunk") -> "SearchDoc": + """Convert an InferenceChunk to a SearchDoc.""" + return cls( + document_id=inference_chunk.document_id, + chunk_ind=inference_chunk.chunk_id, + semantic_identifier=inference_chunk.semantic_identifier or "Unknown", + link=( + inference_chunk.source_links[0] + if inference_chunk.source_links + else None + ), + blurb=inference_chunk.blurb, + source_type=inference_chunk.source_type, + boost=inference_chunk.boost, + hidden=inference_chunk.hidden, + metadata=inference_chunk.metadata, + score=inference_chunk.score, + is_relevant=inference_chunk.is_relevant, + relevance_explanation=inference_chunk.relevance_explanation, + match_highlights=inference_chunk.match_highlights, + updated_at=inference_chunk.updated_at, + primary_owners=inference_chunk.primary_owners, + secondary_owners=inference_chunk.secondary_owners, + is_internet=False, + ) + def model_dump(self, *args: list, **kwargs: dict[str, Any]) -> dict[str, Any]: # type: ignore initial_dict = super().model_dump(*args, **kwargs) # type: ignore initial_dict["updated_at"] = ( diff --git a/backend/onyx/context/search/utils.py b/backend/onyx/context/search/utils.py index 607eb877fff..8edd75f7e51 100644 --- a/backend/onyx/context/search/utils.py +++ b/backend/onyx/context/search/utils.py @@ -118,40 +118,6 @@ def inference_section_from_chunks( ) -def chunks_or_sections_to_search_docs( - items: Sequence[InferenceChunk | InferenceSection] | None, -) -> list[SearchDoc]: - if not items: - return [] - - search_docs = [ - SearchDoc( - document_id=( - chunk := ( - item.center_chunk if isinstance(item, InferenceSection) else item - ) - ).document_id, - chunk_ind=chunk.chunk_id, - semantic_identifier=chunk.semantic_identifier or "Unknown", - link=chunk.source_links[0] if chunk.source_links else None, - blurb=chunk.blurb, - source_type=chunk.source_type, - boost=chunk.boost, - hidden=chunk.hidden, - metadata=chunk.metadata, - score=chunk.score, - match_highlights=chunk.match_highlights, - updated_at=chunk.updated_at, - primary_owners=chunk.primary_owners, - secondary_owners=chunk.secondary_owners, - is_internet=False, - ) - for item in items - ] - - return search_docs - - def remove_stop_words_and_punctuation(keywords: list[str]) -> list[str]: try: # Re-tokenize using the NLTK tokenizer for better matching diff --git a/backend/onyx/db/chat.py b/backend/onyx/db/chat.py index 440d23c28b4..d6c0eadebe5 100644 --- a/backend/onyx/db/chat.py +++ b/backend/onyx/db/chat.py @@ -34,7 +34,6 @@ from onyx.context.search.models import RetrievalDocs from onyx.context.search.models import SavedSearchDoc from onyx.context.search.models import SearchDoc as ServerSearchDoc -from onyx.context.search.utils import chunks_or_sections_to_search_docs from onyx.db.models import AgentSearchMetrics from onyx.db.models import AgentSubQuery from onyx.db.models import AgentSubQuestion @@ -1147,7 +1146,7 @@ def log_agent_sub_question_results( db_session.add(sub_query_object) db_session.commit() - search_docs = chunks_or_sections_to_search_docs( + search_docs = ServerSearchDoc.chunks_or_sections_to_search_docs( sub_query.retrieved_documents ) for doc in search_docs: diff --git a/backend/onyx/server/query_and_chat/query_backend.py b/backend/onyx/server/query_and_chat/query_backend.py index dc54056fd01..9cbe9584754 100644 --- a/backend/onyx/server/query_and_chat/query_backend.py +++ b/backend/onyx/server/query_and_chat/query_backend.py @@ -14,7 +14,6 @@ from onyx.context.search.preprocessing.access_filters import ( build_access_filters_for_user, ) -from onyx.context.search.utils import chunks_or_sections_to_search_docs from onyx.db.chat import get_chat_messages_by_session from onyx.db.chat import get_chat_session_by_id from onyx.db.chat import get_chat_sessions_by_user @@ -74,7 +73,7 @@ def admin_search( ) matching_chunks = document_index.admin_retrieval(query=query, filters=final_filters) - documents = chunks_or_sections_to_search_docs(matching_chunks) + documents = SearchDoc.chunks_or_sections_to_search_docs(matching_chunks) # Deduplicate documents by id deduplicated_documents: list[SearchDoc] = []