deepset-ai
diff --git a/‎.github/workflows/amazon_bedrock.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/amazon_bedrock.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎integrations/mcp/tests/test_mcp_toolset.py‎
Lines changed: 71 additions & 0 deletions b/‎integrations/mcp/tests/test_mcp_toolset.py‎
Lines changed: 71 additions & 0 deletions
diff --git a/‎integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py‎
Lines changed: 96 additions & 12 deletions b/‎integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py‎
Lines changed: 96 additions & 12 deletions
diff --git a/‎integrations/opensearch/tests/test_auth.py‎
Lines changed: 4 additions & 0 deletions b/‎integrations/opensearch/tests/test_auth.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎integrations/opensearch/tests/test_bm25_retriever.py‎
Lines changed: 1 addition & 0 deletions b/‎integrations/opensearch/tests/test_bm25_retriever.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎integrations/opensearch/tests/test_document_store.py‎
Lines changed: 56 additions & 1 deletion b/‎integrations/opensearch/tests/test_document_store.py‎
Lines changed: 56 additions & 1 deletion
@@ -73,8 +73,8 @@ jobs:
       # Do not authenticate on PRs from forks and on PRs created by dependabot
       - name: AWS authentication
         id: aws-auth
-        if: github.event.pull_request.head.repo.full_name == github.repository && !startsWith(github.event.pull_request.head.ref, 'dependabot/')
-        uses: aws-actions/configure-aws-credentials@a03048d87541d1d9fcf2ecf528a4a65ba9bd7838
+        if: github.event_name == 'schedule' || (github.event.pull_request.head.repo.full_name == github.repository && !startsWith(github.event.pull_request.head.ref, 'dependabot/'))
+        uses: aws-actions/configure-aws-credentials@00943011d9042930efac3dcd3a170e4273319bc8
         with:
           aws-region: ${{ env.AWS_REGION }}
           role-to-assume: ${{ secrets.AWS_CI_ROLE_ARN }}
 
@@ -7,9 +7,11 @@
 import time
 from unittest.mock import patch
 
+import haystack
 import pytest
 import pytest_asyncio
 from haystack import logging
+from haystack.core.pipeline import Pipeline
 from haystack.tools import Tool
 
 from haystack_integrations.tools.mcp import MCPToolset
@@ -413,3 +415,72 @@ def subtract(a: int, b: int) -> int:
             # Remove the temporary file
             if os.path.exists(server_script_path):
                 os.remove(server_script_path)
+
+    def test_pipeline_deserialization_fails_without_github_token(self, monkeypatch):
+        """
+        Test that pipeline deserialization + MCPToolset initialization fails when GitHub
+        token is not resolved during deserialization.
+
+        The issue:
+        - Setup: Agent pipeline template with MCPToolset with a token from env var (PERSONAL_ACCESS_TOKEN_GITHUB)
+        - MCPToolset tries to connect immediately during __init__ after validation
+        - Secrets get resolved during validation, after MCPToolset is initialized
+        - Connection fails because token can't be resolved in __init__
+        - Pipeline deserialization fails with DeserializationError
+
+        This test demonstrates why we need warmup for MCPToolset on first use rather than during deserialization.
+        """
+        pipeline_yaml = """
+components:
+  agent:
+    init_parameters:
+      chat_generator:
+        init_parameters:
+          api_base_url:
+          api_key:
+            env_vars:
+            - OPENAI_API_KEY
+            strict: false
+            type: env_var
+          generation_kwargs: {}
+          max_retries:
+          model: gpt-4o
+          organization:
+          streaming_callback:
+          timeout:
+          tools:
+          tools_strict: false
+        type: haystack.components.generators.chat.openai.OpenAIChatGenerator
+      exit_conditions:
+      - text
+      max_agent_steps: 100
+      raise_on_tool_invocation_failure: false
+      state_schema: {}
+      streaming_callback:
+      system_prompt: |-
+        You are an assistant that summarizes latest issues and PRs on a github repository
+        that happened within a certain time frame (e.g. last day or last week). Make sure
+        that you always use the current date as a basis for the time frame. Iterate over
+        issues and PRs where necessary to get a comprehensive overview.
+      tools:
+        data:
+          server_info:
+            type: haystack_integrations.tools.mcp.mcp_tool.StreamableHttpServerInfo
+            url: https://api.githubcopilot.com/mcp/
+            token:
+              env_vars:
+              - PERSONAL_ACCESS_TOKEN_GITHUB
+              strict: true
+              type: env_var
+            timeout: 10
+          tool_names: [get_issue, get_issue_comments]
+        type: haystack_integrations.tools.mcp.MCPToolset
+    type: haystack.components.agents.agent.Agent
+
+connections: []
+"""
+        monkeypatch.setenv("PERSONAL_ACCESS_TOKEN_GITHUB", "SOME_OBVIOUSLY_INVALID_TOKEN")
+        # Attempt to deserialize the pipeline - this is expected to fail because MCPToolset
+        # tries to connect immediately while the env var only holds a placeholder token
+        with pytest.raises(haystack.core.errors.DeserializationError):
+            Pipeline.loads(pipeline_yaml)
@@ -87,7 +87,7 @@ def __init__(
         Creates a new OpenSearchDocumentStore instance.
 
         The ``embeddings_dim``, ``method``, ``mappings``, and ``settings`` arguments are only used if the index does not
-        exists and needs to be created. If the index already exists, its current configurations will be used.
+        exist and needs to be created. If the index already exists, its current configurations will be used.
 
         For more information on connection parameters, see the [official OpenSearch documentation](https://opensearch.org/docs/latest/clients/python-low-level/#connecting-to-opensearch)
 
@@ -107,7 +107,7 @@ def __init__(
         :param settings: The settings of the index to be created. Please see the [official OpenSearch docs](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#index-settings)
             for more information. Defaults to {"index.knn": True}
         :param create_index: Whether to create the index if it doesn't exist. Defaults to True
-        :param http_auth: http_auth param passed to the underying connection class.
+        :param http_auth: http_auth param passed to the underlying connection class.
             For basic authentication with default connection class `Urllib3HttpConnection` this can be
             - a tuple of (username, password)
             - a list of [username, password]
@@ -319,7 +319,8 @@ async def count_documents_async(self) -> int:
         assert self._async_client is not None
         return (await self._async_client.count(index=self._index))["count"]
 
-    def _deserialize_search_hits(self, hits: List[Dict[str, Any]]) -> List[Document]:
+    @staticmethod
+    def _deserialize_search_hits(hits: List[Dict[str, Any]]) -> List[Document]:
         out = []
         for hit in hits:
             data = hit["_source"]
@@ -344,12 +345,12 @@ def _prepare_filter_search_request(self, filters: Optional[Dict[str, Any]]) -> D
     def _search_documents(self, request_body: Dict[str, Any]) -> List[Document]:
         assert self._client is not None
         search_results = self._client.search(index=self._index, body=request_body)
-        return self._deserialize_search_hits(search_results["hits"]["hits"])
+        return OpenSearchDocumentStore._deserialize_search_hits(search_results["hits"]["hits"])
 
     async def _search_documents_async(self, request_body: Dict[str, Any]) -> List[Document]:
         assert self._async_client is not None
         search_results = await self._async_client.search(index=self._index, body=request_body)
-        return self._deserialize_search_hits(search_results["hits"]["hits"])
+        return OpenSearchDocumentStore._deserialize_search_hits(search_results["hits"]["hits"])
 
     def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
         """
@@ -418,7 +419,8 @@ def _prepare_bulk_write_request(
             "stats_only": False,
         }
 
-    def _process_bulk_write_errors(self, errors: List[Dict[str, Any]], policy: DuplicatePolicy) -> None:
+    @staticmethod
+    def _process_bulk_write_errors(errors: List[Dict[str, Any]], policy: DuplicatePolicy) -> None:
         if len(errors) == 0:
             return
 
@@ -461,7 +463,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
 
         bulk_params = self._prepare_bulk_write_request(documents=documents, policy=policy, is_async=False)
         documents_written, errors = bulk(**bulk_params)
-        self._process_bulk_write_errors(errors, policy)
+        OpenSearchDocumentStore._process_bulk_write_errors(errors, policy)
         return documents_written
 
     async def write_documents_async(
@@ -478,10 +480,11 @@ async def write_documents_async(
         bulk_params = self._prepare_bulk_write_request(documents=documents, policy=policy, is_async=True)
         documents_written, errors = await async_bulk(**bulk_params)
         # since we call async_bulk with stats_only=False, errors is guaranteed to be a list (not int)
-        self._process_bulk_write_errors(errors=errors, policy=policy)  # type: ignore[arg-type]
+        OpenSearchDocumentStore._process_bulk_write_errors(errors=errors, policy=policy)  # type: ignore[arg-type]
         return documents_written
 
-    def _deserialize_document(self, hit: Dict[str, Any]) -> Document:
+    @staticmethod
+    def _deserialize_document(hit: Dict[str, Any]) -> Document:
         """
         Creates a Document from the search hit provided.
         This is mostly useful in self.filter_documents().
@@ -525,6 +528,86 @@ async def delete_documents_async(self, document_ids: List[str]) -> None:
 
         await async_bulk(**self._prepare_bulk_delete_request(document_ids=document_ids, is_async=True))
 
+    def _prepare_delete_all_request(self, *, is_async: bool) -> Dict[str, Any]:
+        return {  # keyword arguments for the client's `delete_by_query` call
+            "index": self._index,
+            "body": {"query": {"match_all": {}}},  # match every document in the index
+            "wait_for_completion": not is_async,  # sync callers block until done; async callers do not
+        }
+
+    def delete_all_documents(self, recreate_index: bool = False) -> None:  # noqa: FBT002, FBT001
+        """
+        Deletes all documents in the document store.
+
+        :param recreate_index: If True, the index will be deleted and recreated with the original mappings and
+            settings. If False, all documents will be deleted using the `delete_by_query` API.
+        :raises DocumentStoreError: if any of the OpenSearch requests fails.
+        """
+        self._ensure_initialized()
+        assert self._client is not None
+
+        try:
+            if recreate_index:
+                # fetch the current index definition once and reuse it for both mappings and settings
+                index_info = self._client.indices.get(index=self._index)[self._index]
+                body = {
+                    "mappings": index_info["mappings"],
+                    "settings": index_info["settings"],
+                }
+                # these entries describe the existing index instance; drop them before reusing the settings
+                for generated_key in ("uuid", "creation_date", "provided_name", "version"):
+                    body["settings"]["index"].pop(generated_key, None)
+                self._client.indices.delete(index=self._index)
+                self._client.indices.create(index=self._index, body=body)
+                logger.info(
+                    "The index '{index}' recreated with the original mappings and settings.",
+                    index=self._index,
+                )
+
+            else:
+                result = self._client.delete_by_query(**self._prepare_delete_all_request(is_async=False))
+                logger.info(
+                    "Deleted all the {n_docs} documents from the index '{index}'.",
+                    index=self._index,
+                    n_docs=result["deleted"],
+                )
+        except Exception as e:
+            msg = f"Failed to delete all documents from OpenSearch: {e!s}"
+            raise DocumentStoreError(msg) from e
+
+    async def delete_all_documents_async(self, recreate_index: bool = False) -> None:  # noqa: FBT002, FBT001
+        """
+        Asynchronously deletes all documents in the document store.
+
+        :param recreate_index: If True, the index will be deleted and recreated with the original mappings and
+            settings. If False, a `delete_by_query` request is sent with `wait_for_completion=False`, so documents
+            may still be present when this coroutine returns.
+        :raises DocumentStoreError: if any of the OpenSearch requests fails.
+        """
+        self._ensure_initialized()
+        assert self._async_client is not None
+
+        try:
+            if recreate_index:
+                # fetch the current index definition once and reuse it for both mappings and settings
+                index_info = (await self._async_client.indices.get(index=self._index))[self._index]
+                body = {
+                    "mappings": index_info["mappings"],
+                    "settings": index_info["settings"],
+                }
+                # these entries describe the existing index instance; drop them before reusing the settings
+                for generated_key in ("uuid", "creation_date", "provided_name", "version"):
+                    body["settings"]["index"].pop(generated_key, None)
+
+                await self._async_client.indices.delete(index=self._index)
+                await self._async_client.indices.create(index=self._index, body=body)
+            else:
+                await self._async_client.delete_by_query(**self._prepare_delete_all_request(is_async=True))
+
+        except Exception as e:
+            msg = f"Failed to delete all documents from OpenSearch: {e!s}"
+            raise DocumentStoreError(msg) from e
+
     def _prepare_bm25_search_request(
         self,
         *,
@@ -580,7 +663,8 @@ def _prepare_bm25_search_request(
 
         return body
 
-    def _postprocess_bm25_search_results(self, *, results: List[Document], scale_score: bool) -> None:
+    @staticmethod
+    def _postprocess_bm25_search_results(*, results: List[Document], scale_score: bool) -> None:
         if not scale_score:
             return
 
@@ -624,7 +708,7 @@ def _bm25_retrieval(
             custom_query=custom_query,
         )
         documents = self._search_documents(search_params)
-        self._postprocess_bm25_search_results(results=documents, scale_score=scale_score)
+        OpenSearchDocumentStore._postprocess_bm25_search_results(results=documents, scale_score=scale_score)
         return documents
 
     async def _bm25_retrieval_async(
@@ -663,7 +747,7 @@ async def _bm25_retrieval_async(
             custom_query=custom_query,
         )
         documents = await self._search_documents_async(search_params)
-        self._postprocess_bm25_search_results(results=documents, scale_score=scale_score)
+        OpenSearchDocumentStore._postprocess_bm25_search_results(results=documents, scale_score=scale_score)
         return documents
 
     def _prepare_embedding_search_request(
 
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
+#
+# SPDX-License-Identifier: Apache-2.0
+
 from unittest.mock import Mock, patch
 
 import pytest
 
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
 #
 # SPDX-License-Identifier: Apache-2.0
+
 from unittest.mock import Mock, patch
 
 import pytest
 
@@ -1,7 +1,9 @@
 # SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>
 #
 # SPDX-License-Identifier: Apache-2.0
+
 import random
+import time
 from typing import List
 from unittest.mock import patch
 
@@ -453,7 +455,7 @@ def test_embedding_retrieval_but_dont_return_embeddings_for_bm25_retrieval(
         assert len(results) == 2
         assert results[0].embedding is None
 
-    def filter_documents_no_embedding_returned(
+    def test_filter_documents_no_embedding_returned(
         self, document_store_embedding_dim_4_no_emb_returned: OpenSearchDocumentStore
     ):
         docs = [
@@ -468,3 +470,56 @@ def filter_documents_no_embedding_returned(
         assert results[0].embedding is None
         assert results[1].embedding is None
         assert results[2].embedding is None
+
+    def test_delete_all_documents_index_recreation(self, document_store: OpenSearchDocumentStore):
+        # populate the index with some documents
+        docs = [Document(id="1", content="A first document"), Document(id="2", content="Second document")]
+        document_store.write_documents(docs)
+
+        # capture index structure before deletion
+        assert document_store._client is not None
+        index_info_before = document_store._client.indices.get(index=document_store._index)
+        mappings_before = index_info_before[document_store._index]["mappings"]
+        settings_before = index_info_before[document_store._index]["settings"]
+
+        # delete all documents by dropping and recreating the index
+        document_store.delete_all_documents(recreate_index=True)
+        assert document_store.count_documents() == 0
+
+        # verify index structure is preserved
+        index_info_after = document_store._client.indices.get(index=document_store._index)
+        mappings_after = index_info_after[document_store._index]["mappings"]
+        settings_after = index_info_after[document_store._index]["settings"]
+
+        assert mappings_after == mappings_before, "delete_all_documents should preserve index mappings"
+
+        settings_after["index"].pop("uuid", None)  # uuid and creation_date are excluded from the comparison
+        settings_after["index"].pop("creation_date", None)
+        settings_before["index"].pop("uuid", None)
+        settings_before["index"].pop("creation_date", None)
+        assert settings_after == settings_before, "delete_all_documents should preserve index settings"
+
+        new_doc = Document(id="4", content="New document after delete all")
+        document_store.write_documents([new_doc])
+        assert document_store.count_documents() == 1
+
+        results = document_store.filter_documents()
+        assert len(results) == 1
+        assert results[0].content == "New document after delete all"
+
+    def test_delete_all_documents_no_index_recreation(self, document_store: OpenSearchDocumentStore):
+        docs = [Document(id="1", content="A first document"), Document(id="2", content="Second document")]
+        document_store.write_documents(docs)
+        assert document_store.count_documents() == 2
+
+        document_store.delete_all_documents(recreate_index=False)
+        time.sleep(2)  # deletion is not immediately visible to count_documents (presumably until an index refresh)
+        assert document_store.count_documents() == 0
+
+        new_doc = Document(id="3", content="New document after delete all")
+        document_store.write_documents([new_doc])
+        assert document_store.count_documents() == 1
+
+        results = document_store.filter_documents()
+        assert len(results) == 1
+        assert results[0].content == "New document after delete all"
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,7 @@`
`1`	`1`	`# SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]>`
`2`	`2`	`#`
`3`	`3`	`# SPDX-License-Identifier: Apache-2.0`
	`4`	`+`
`4`	`5`	`from unittest.mock import Mock, patch`
`5`	`6`
`6`	`7`	`import pytest`