Skip to content

Commit 5f2b76d

Browse files
authored
Merge branch 'main' into delete-all-astra-ds
2 parents f670801 + 4873ada commit 5f2b76d

File tree

10 files changed

+297
-15
lines changed

10 files changed

+297
-15
lines changed

.github/workflows/amazon_bedrock.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ jobs:
7373
# Do not authenticate on PRs from forks and on PRs created by dependabot
7474
- name: AWS authentication
7575
id: aws-auth
76-
if: github.event.pull_request.head.repo.full_name == github.repository && !startsWith(github.event.pull_request.head.ref, 'dependabot/')
77-
uses: aws-actions/configure-aws-credentials@a03048d87541d1d9fcf2ecf528a4a65ba9bd7838
76+
if: github.event_name == 'schedule' || (github.event.pull_request.head.repo.full_name == github.repository && !startsWith(github.event.pull_request.head.ref, 'dependabot/'))
77+
uses: aws-actions/configure-aws-credentials@00943011d9042930efac3dcd3a170e4273319bc8
7878
with:
7979
aws-region: ${{ env.AWS_REGION }}
8080
role-to-assume: ${{ secrets.AWS_CI_ROLE_ARN }}

integrations/mcp/tests/test_mcp_toolset.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@
77
import time
88
from unittest.mock import patch
99

10+
import haystack
1011
import pytest
1112
import pytest_asyncio
1213
from haystack import logging
14+
from haystack.core.pipeline import Pipeline
1315
from haystack.tools import Tool
1416

1517
from haystack_integrations.tools.mcp import MCPToolset
@@ -413,3 +415,72 @@ def subtract(a: int, b: int) -> int:
413415
# Remove the temporary file
414416
if os.path.exists(server_script_path):
415417
os.remove(server_script_path)
418+
419+
def test_pipeline_deserialization_fails_without_github_token(self, monkeypatch):
420+
"""
421+
Test that pipeline deserialization + MCPToolset initialization fails when GitHub
422+
token is not resolved during deserialization.
423+
424+
The issue:
425+
- Setup: Agent pipeline template with MCPToolset with a token from env var (PERSONAL_ACCESS_TOKEN_GITHUB)
426+
- MCPToolset tries to connect immediately during __init__ after validation
427+
- Secrets get resolved during validation, after MCPToolset is initialized
428+
- Connection fails because token can't be resolved in __init__
429+
- Pipeline deserialization fails with DeserializationError
430+
431+
This test demonstrates why we need warmup for MCPToolset on first use rather than during deserialization.
432+
"""
433+
pipeline_yaml = """
434+
components:
435+
agent:
436+
init_parameters:
437+
chat_generator:
438+
init_parameters:
439+
api_base_url:
440+
api_key:
441+
env_vars:
442+
- OPENAI_API_KEY
443+
strict: false
444+
type: env_var
445+
generation_kwargs: {}
446+
max_retries:
447+
model: gpt-4o
448+
organization:
449+
streaming_callback:
450+
timeout:
451+
tools:
452+
tools_strict: false
453+
type: haystack.components.generators.chat.openai.OpenAIChatGenerator
454+
exit_conditions:
455+
- text
456+
max_agent_steps: 100
457+
raise_on_tool_invocation_failure: false
458+
state_schema: {}
459+
streaming_callback:
460+
system_prompt: |-
461+
You are an assistant that summarizes latest issues and PRs on a github repository
462+
that happened within a certain time frame (e.g. last day or last week). Make sure
463+
that you always use the current date as a basis for the time frame. Iterate over
464+
issues and PRs where necessary to get a comprehensive overview.
465+
tools:
466+
data:
467+
server_info:
468+
type: haystack_integrations.tools.mcp.mcp_tool.StreamableHttpServerInfo
469+
url: https://api.githubcopilot.com/mcp/
470+
token:
471+
env_vars:
472+
- PERSONAL_ACCESS_TOKEN_GITHUB
473+
strict: true
474+
type: env_var
475+
timeout: 10
476+
tool_names: [get_issue, get_issue_comments]
477+
type: haystack_integrations.tools.mcp.MCPToolset
478+
type: haystack.components.agents.agent.Agent
479+
480+
connections: []
481+
"""
482+
monkeypatch.setenv("PERSONAL_ACCESS_TOKEN_GITHUB", "SOME_OBVIOUSLY_INVALID_TOKEN")
483+
# Attempt to deserialize the pipeline - this will fail because MCPToolset
484+
# tries to connect immediately and the token isn't available
485+
with pytest.raises(haystack.core.errors.DeserializationError):
486+
Pipeline.loads(pipeline_yaml)

integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py

Lines changed: 96 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def __init__(
8787
Creates a new OpenSearchDocumentStore instance.
8888
8989
The ``embeddings_dim``, ``method``, ``mappings``, and ``settings`` arguments are only used if the index does not
90-
exists and needs to be created. If the index already exists, its current configurations will be used.
90+
exist and needs to be created. If the index already exists, its current configurations will be used.
9191
9292
For more information on connection parameters, see the [official OpenSearch documentation](https://opensearch.org/docs/latest/clients/python-low-level/#connecting-to-opensearch)
9393
@@ -107,7 +107,7 @@ def __init__(
107107
:param settings: The settings of the index to be created. Please see the [official OpenSearch docs](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#index-settings)
108108
for more information. Defaults to {"index.knn": True}
109109
:param create_index: Whether to create the index if it doesn't exist. Defaults to True
110-
:param http_auth: http_auth param passed to the underying connection class.
110+
:param http_auth: http_auth param passed to the underlying connection class.
111111
For basic authentication with default connection class `Urllib3HttpConnection` this can be
112112
- a tuple of (username, password)
113113
- a list of [username, password]
@@ -319,7 +319,8 @@ async def count_documents_async(self) -> int:
319319
assert self._async_client is not None
320320
return (await self._async_client.count(index=self._index))["count"]
321321

322-
def _deserialize_search_hits(self, hits: List[Dict[str, Any]]) -> List[Document]:
322+
@staticmethod
323+
def _deserialize_search_hits(hits: List[Dict[str, Any]]) -> List[Document]:
323324
out = []
324325
for hit in hits:
325326
data = hit["_source"]
@@ -344,12 +345,12 @@ def _prepare_filter_search_request(self, filters: Optional[Dict[str, Any]]) -> D
344345
def _search_documents(self, request_body: Dict[str, Any]) -> List[Document]:
345346
assert self._client is not None
346347
search_results = self._client.search(index=self._index, body=request_body)
347-
return self._deserialize_search_hits(search_results["hits"]["hits"])
348+
return OpenSearchDocumentStore._deserialize_search_hits(search_results["hits"]["hits"])
348349

349350
async def _search_documents_async(self, request_body: Dict[str, Any]) -> List[Document]:
350351
assert self._async_client is not None
351352
search_results = await self._async_client.search(index=self._index, body=request_body)
352-
return self._deserialize_search_hits(search_results["hits"]["hits"])
353+
return OpenSearchDocumentStore._deserialize_search_hits(search_results["hits"]["hits"])
353354

354355
def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
355356
"""
@@ -418,7 +419,8 @@ def _prepare_bulk_write_request(
418419
"stats_only": False,
419420
}
420421

421-
def _process_bulk_write_errors(self, errors: List[Dict[str, Any]], policy: DuplicatePolicy) -> None:
422+
@staticmethod
423+
def _process_bulk_write_errors(errors: List[Dict[str, Any]], policy: DuplicatePolicy) -> None:
422424
if len(errors) == 0:
423425
return
424426

@@ -461,7 +463,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
461463

462464
bulk_params = self._prepare_bulk_write_request(documents=documents, policy=policy, is_async=False)
463465
documents_written, errors = bulk(**bulk_params)
464-
self._process_bulk_write_errors(errors, policy)
466+
OpenSearchDocumentStore._process_bulk_write_errors(errors, policy)
465467
return documents_written
466468

467469
async def write_documents_async(
@@ -478,10 +480,11 @@ async def write_documents_async(
478480
bulk_params = self._prepare_bulk_write_request(documents=documents, policy=policy, is_async=True)
479481
documents_written, errors = await async_bulk(**bulk_params)
480482
# since we call async_bulk with stats_only=False, errors is guaranteed to be a list (not int)
481-
self._process_bulk_write_errors(errors=errors, policy=policy) # type: ignore[arg-type]
483+
OpenSearchDocumentStore._process_bulk_write_errors(errors=errors, policy=policy) # type: ignore[arg-type]
482484
return documents_written
483485

484-
def _deserialize_document(self, hit: Dict[str, Any]) -> Document:
486+
@staticmethod
487+
def _deserialize_document(hit: Dict[str, Any]) -> Document:
485488
"""
486489
Creates a Document from the search hit provided.
487490
This is mostly useful in self.filter_documents().
@@ -525,6 +528,86 @@ async def delete_documents_async(self, document_ids: List[str]) -> None:
525528

526529
await async_bulk(**self._prepare_bulk_delete_request(document_ids=document_ids, is_async=True))
527530

531+
def _prepare_delete_all_request(self, *, is_async: bool) -> Dict[str, Any]:
532+
return {
533+
"index": self._index,
534+
"body": {"query": {"match_all": {}}}, # Delete all documents
535+
"wait_for_completion": False if is_async else True, # block until done (set False for async)
536+
}
537+
538+
def delete_all_documents(self, recreate_index: bool = False) -> None: # noqa: FBT002, FBT001
539+
"""
540+
Deletes all documents in the document store.
541+
542+
:param recreate_index: If True, the index will be deleted and recreated with the original mappings and
543+
settings. If False, all documents will be deleted using the `delete_by_query` API.
544+
"""
545+
self._ensure_initialized()
546+
assert self._client is not None
547+
548+
try:
549+
if recreate_index:
550+
# get the current index mappings and settings
551+
index_name = self._index
552+
body = {
553+
"mappings": self._client.indices.get(self._index)[index_name]["mappings"],
554+
"settings": self._client.indices.get(self._index)[index_name]["settings"],
555+
}
556+
body["settings"]["index"].pop("uuid", None)
557+
body["settings"]["index"].pop("creation_date", None)
558+
body["settings"]["index"].pop("provided_name", None)
559+
body["settings"]["index"].pop("version", None)
560+
self._client.indices.delete(index=self._index)
561+
self._client.indices.create(index=self._index, body=body)
562+
logger.info(
563+
"The index '{index}' recreated with the original mappings and settings.",
564+
index=self._index,
565+
)
566+
567+
else:
568+
result = self._client.delete_by_query(**self._prepare_delete_all_request(is_async=False))
569+
logger.info(
570+
"Deleted all the {n_docs} documents from the index '{index}'.",
571+
index=self._index,
572+
n_docs=result["deleted"],
573+
)
574+
except Exception as e:
575+
msg = f"Failed to delete all documents from OpenSearch: {e!s}"
576+
raise DocumentStoreError(msg) from e
577+
578+
async def delete_all_documents_async(self, recreate_index: bool = False) -> None: # noqa: FBT002, FBT001
579+
"""
580+
Asynchronously deletes all documents in the document store.
581+
582+
:param recreate_index: If True, the index will be deleted and recreated with the original mappings and
583+
settings. If False, all documents will be deleted using the `delete_by_query` API.
584+
"""
585+
self._ensure_initialized()
586+
assert self._async_client is not None
587+
588+
try:
589+
if recreate_index:
590+
# get the current index mappings and settings
591+
index_name = self._index
592+
index_info = await self._async_client.indices.get(self._index)
593+
body = {
594+
"mappings": index_info[index_name]["mappings"],
595+
"settings": index_info[index_name]["settings"],
596+
}
597+
body["settings"]["index"].pop("uuid", None)
598+
body["settings"]["index"].pop("creation_date", None)
599+
body["settings"]["index"].pop("provided_name", None)
600+
body["settings"]["index"].pop("version", None)
601+
602+
await self._async_client.indices.delete(index=self._index)
603+
await self._async_client.indices.create(index=self._index, body=body)
604+
else:
605+
await self._async_client.delete_by_query(**self._prepare_delete_all_request(is_async=True))
606+
607+
except Exception as e:
608+
msg = f"Failed to delete all documents from OpenSearch: {e!s}"
609+
raise DocumentStoreError(msg) from e
610+
528611
def _prepare_bm25_search_request(
529612
self,
530613
*,
@@ -580,7 +663,8 @@ def _prepare_bm25_search_request(
580663

581664
return body
582665

583-
def _postprocess_bm25_search_results(self, *, results: List[Document], scale_score: bool) -> None:
666+
@staticmethod
667+
def _postprocess_bm25_search_results(*, results: List[Document], scale_score: bool) -> None:
584668
if not scale_score:
585669
return
586670

@@ -624,7 +708,7 @@ def _bm25_retrieval(
624708
custom_query=custom_query,
625709
)
626710
documents = self._search_documents(search_params)
627-
self._postprocess_bm25_search_results(results=documents, scale_score=scale_score)
711+
OpenSearchDocumentStore._postprocess_bm25_search_results(results=documents, scale_score=scale_score)
628712
return documents
629713

630714
async def _bm25_retrieval_async(
@@ -663,7 +747,7 @@ async def _bm25_retrieval_async(
663747
custom_query=custom_query,
664748
)
665749
documents = await self._search_documents_async(search_params)
666-
self._postprocess_bm25_search_results(results=documents, scale_score=scale_score)
750+
OpenSearchDocumentStore._postprocess_bm25_search_results(results=documents, scale_score=scale_score)
667751
return documents
668752

669753
def _prepare_embedding_search_request(

integrations/opensearch/tests/test_auth.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
15
from unittest.mock import Mock, patch
26

37
import pytest

integrations/opensearch/tests/test_bm25_retriever.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
22
#
33
# SPDX-License-Identifier: Apache-2.0
4+
45
from unittest.mock import Mock, patch
56

67
import pytest

integrations/opensearch/tests/test_document_store.py

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
22
#
33
# SPDX-License-Identifier: Apache-2.0
4+
45
import random
6+
import time
57
from typing import List
68
from unittest.mock import patch
79

@@ -453,7 +455,7 @@ def test_embedding_retrieval_but_dont_return_embeddings_for_bm25_retrieval(
453455
assert len(results) == 2
454456
assert results[0].embedding is None
455457

456-
def filter_documents_no_embedding_returned(
458+
def test_filter_documents_no_embedding_returned(
457459
self, document_store_embedding_dim_4_no_emb_returned: OpenSearchDocumentStore
458460
):
459461
docs = [
@@ -468,3 +470,56 @@ def filter_documents_no_embedding_returned(
468470
assert results[0].embedding is None
469471
assert results[1].embedding is None
470472
assert results[2].embedding is None
473+
474+
def test_delete_all_documents_index_recreation(self, document_store: OpenSearchDocumentStore):
475+
# populate the index with some documents
476+
docs = [Document(id="1", content="A first document"), Document(id="2", content="Second document")]
477+
document_store.write_documents(docs)
478+
479+
# capture index structure before deletion
480+
assert document_store._client is not None
481+
index_info_before = document_store._client.indices.get(index=document_store._index)
482+
mappings_before = index_info_before[document_store._index]["mappings"]
483+
settings_before = index_info_before[document_store._index]["settings"]
484+
485+
# delete all documents
486+
document_store.delete_all_documents(recreate_index=True)
487+
assert document_store.count_documents() == 0
488+
489+
# verify index structure is preserved
490+
index_info_after = document_store._client.indices.get(index=document_store._index)
491+
mappings_after = index_info_after[document_store._index]["mappings"]
492+
settings_after = index_info_after[document_store._index]["settings"]
493+
494+
assert mappings_after == mappings_before, "delete_all_documents should preserve index mappings"
495+
496+
settings_after["index"].pop("uuid", None)
497+
settings_after["index"].pop("creation_date", None)
498+
settings_before["index"].pop("uuid", None)
499+
settings_before["index"].pop("creation_date", None)
500+
assert settings_after == settings_before, "delete_all_documents should preserve index settings"
501+
502+
new_doc = Document(id="4", content="New document after delete all")
503+
document_store.write_documents([new_doc])
504+
assert document_store.count_documents() == 1
505+
506+
results = document_store.filter_documents()
507+
assert len(results) == 1
508+
assert results[0].content == "New document after delete all"
509+
510+
def test_delete_all_documents_no_index_recreation(self, document_store: OpenSearchDocumentStore):
511+
docs = [Document(id="1", content="A first document"), Document(id="2", content="Second document")]
512+
document_store.write_documents(docs)
513+
assert document_store.count_documents() == 2
514+
515+
document_store.delete_all_documents(recreate_index=False)
516+
time.sleep(2) # need to wait for the deletion to be reflected in count_documents
517+
assert document_store.count_documents() == 0
518+
519+
new_doc = Document(id="3", content="New document after delete all")
520+
document_store.write_documents([new_doc])
521+
assert document_store.count_documents() == 1
522+
523+
results = document_store.filter_documents()
524+
assert len(results) == 1
525+
assert results[0].content == "New document after delete all"

0 commit comments

Comments
 (0)