Skip to content

Commit a46bc60

Browse files
authored
fix: ensure that headers keys are unique in link_content (#10111)
* fix: ensure that headers keys are unique in link_content * Add release note * Fix mypy issue * Rename variable * Add unit tests + improve release note * Revert mypy issue
1 parent dce828c commit a46bc60

File tree

3 files changed

+77
-11
lines changed

3 files changed

+77
-11
lines changed

haystack/components/fetchers/link_content.py

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222

2323
logger = logging.getLogger(__name__)
2424

25-
2625
DEFAULT_USER_AGENT = f"haystack/LinkContentFetcher/{__version__}"
2726

2827
REQUEST_HEADERS = {
@@ -33,6 +32,25 @@
3332
}
3433

3534

35+
def _merge_headers(*args: dict[str, str]) -> dict[str, str]:
    """
    Merge header dicts, comparing header names case-insensitively.

    The dicts are processed from left to right. When the same header name occurs more than once
    (in any casing, within a single dict or across several), the casing and the value of the last
    occurrence win, while the header keeps the position of its first occurrence in the result.
    The input dicts are not modified.

    :param args: The header dicts to merge, in increasing order of precedence.
    :returns: A new dict holding at most one key per case-insensitive header name.
    """
    # Lowercased name -> (key as last spelled, last value). Storing both in one mapping keeps the
    # surviving casing and the surviving value in lockstep.
    merged: dict[str, tuple[str, str]] = {}

    for headers in args:
        for key, value in headers.items():
            merged[key.lower()] = (key, value)

    return dict(merged.values())
52+
53+
3654
def _text_content_handler(response: httpx.Response) -> ByteStream:
3755
"""
3856
Handles text content.
@@ -169,17 +187,24 @@ def __init__( # pylint: disable=too-many-positional-arguments
169187
after=self._switch_user_agent,
170188
)
171189
def get_response(url):
172-
# Build headers with precedence:
173-
# client defaults -> component defaults -> user-provided -> rotating UA
174-
base = dict(self._client.headers)
175-
headers = {**base, **REQUEST_HEADERS, **self.request_headers}
176-
headers["User-Agent"] = self.user_agents[self.current_user_agent_idx] # rotation wins
177-
response = self._client.get(url, headers=headers)
190+
response = self._client.get(url, headers=self._get_headers())
178191
response.raise_for_status()
179192
return response
180193

181194
self._get_response: Callable = get_response
182195

196+
def _get_headers(self):
    """
    Assemble the headers to send with a request.

    The sources are merged case-insensitively, from lowest to highest precedence:

    client defaults -> component defaults -> user-provided -> rotating UA

    NOTE(review): the client defaults are always read from ``self._client``; confirm that this is
    also what callers that issue the request through a different client instance expect.
    """
    client_defaults = dict(self._client.headers)
    return _merge_headers(
        client_defaults,
        REQUEST_HEADERS,
        self.request_headers,
        # Passed last so the currently selected user agent overrides every other source.
        {"User-Agent": self.user_agents[self.current_user_agent_idx]},
    )
207+
183208
def __del__(self):
184209
"""
185210
Clean up resources when the component is deleted.
@@ -378,10 +403,7 @@ async def _get_response_async(self, url: str, client: httpx.AsyncClient) -> http
378403

379404
while attempt <= self.retry_attempts:
380405
try:
381-
base = dict(client.headers)
382-
headers = {**base, **REQUEST_HEADERS, **self.request_headers}
383-
headers["User-Agent"] = self.user_agents[self.current_user_agent_idx]
384-
response = await client.get(url, headers=headers)
406+
response = await client.get(url, headers=self._get_headers())
385407
response.raise_for_status()
386408
return response
387409
except (httpx.HTTPStatusError, httpx.RequestError) as e:
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
---
2+
fixes:
3+
- |
4+
Ensure request header keys are unique in link_content to prevent 400 Bad Request errors.
5+
6+
Some image providers return a 400 Bad Request when using ImageContent.from_url() because the User-Agent
7+
header appears multiple times with different casing (e.g., user-agent, User-Agent).
8+
This update merges header keys case-insensitively, removes duplicates, and
9+
keeps only the last occurrence of each header (both its casing and its value).

test/components/fetchers/test_link_content_fetcher.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,3 +404,38 @@ async def test_request_headers_merging_and_ua_override(self):
404404
assert sent_headers["X-Async"] == "true"
405405
assert sent_headers["Accept-Language"] == "de-DE"
406406
assert sent_headers["User-Agent"] == "ua-async-1" # rotating UA wins
407+
408+
@pytest.mark.asyncio
async def test_duplicated_request_headers_merging(self):
    """
    Headers that differ only in casing must be sent once, using the last casing and value given.
    """
    # Patch the AsyncClient class to control the instance created by LinkContentFetcher
    with patch("haystack.components.fetchers.link_content.httpx.AsyncClient") as AsyncClientMock:
        aclient = AsyncClientMock.return_value
        # NOTE(review): intended as the base headers of the merge; _get_headers reads
        # self._client.headers, so confirm this attribute is still the one consulted.
        aclient.headers = {}

        mock_response = Mock(status_code=200, text="OK", headers={"Content-Type": "text/plain"})
        aclient.get = AsyncMock(return_value=mock_response)

        fetcher = LinkContentFetcher(
            request_headers={
                "x-test-header": "header-1",
                "X-Test-Header": "agent-2",
                "X-TEST-HEADER": "agent-3",
                "X-TeSt-HeAdEr": "good-one",
            }
        )

        _ = (await fetcher.run_async(urls=["https://example.com"]))["streams"]

        assert aclient.get.await_count == 1
        sent_headers = aclient.get.call_args.kwargs["headers"]

        # Lowercased header name -> key as actually sent; used to detect case-insensitive duplicates.
        existing_keys = {}
        for key, value in sent_headers.items():
            lower_key = key.lower()
            # The message is only built on failure, i.e. when lower_key is already present.
            assert lower_key not in existing_keys, (
                f"header {key!r} was sent more than once (also as {existing_keys[lower_key]!r})"
            )
            if lower_key == "x-test-header":
                assert value == "good-one"
            existing_keys[lower_key] = key

        assert "x-test-header" in existing_keys
        assert existing_keys["x-test-header"] == "X-TeSt-HeAdEr"

0 commit comments

Comments
 (0)