Skip to content

Commit f05f5a7

Browse files
authored
Fixed an error that caused the pipe break to not be handled. (#30096) (#30151)
2 parents 20ebf5c + ceaebd8 commit f05f5a7

File tree

3 files changed

+137
-59
lines changed

3 files changed

+137
-59
lines changed

ydb/core/kafka_proxy/actors/kafka_produce_actor.cpp

Lines changed: 95 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,6 @@ TString TKafkaProduceActor::LogPrefix() {
3030
sb << "Init ";
3131
} else if (stateFunc == &TKafkaProduceActor::StateWork) {
3232
sb << "Work ";
33-
} else if (stateFunc == &TKafkaProduceActor::StateAccepting) {
34-
sb << "Accepting ";
3533
} else {
3634
sb << "Unknown ";
3735
}
@@ -229,22 +227,22 @@ void TKafkaProduceActor::Handle(TEvKafka::TEvProduceRequest::TPtr request, const
229227

230228
void TKafkaProduceActor::ProcessRequests(const TActorContext& ctx) {
231229
if (&TKafkaProduceActor::StateWork != CurrentStateFunc()) {
232-
KAFKA_LOG_ERROR("Produce actor: Unexpected state");
233230
return;
234231
}
235232

236233
if (Requests.empty()) {
237234
return;
238235
}
239236

240-
if (EnqueueInitialization()) {
237+
auto canProcess = EnqueueInitialization();
238+
while (canProcess--) {
241239
PendingRequests.push_back(std::make_shared<TPendingRequest>(Requests.front()));
242240
Requests.pop_front();
243241

244242
ProcessRequest(PendingRequests.back(), ctx);
245-
} else {
246-
ProcessInitializationRequests(ctx);
247243
}
244+
245+
ProcessInitializationRequests(ctx);
248246
}
249247

250248
size_t TKafkaProduceActor::EnqueueInitialization() {
@@ -399,12 +397,10 @@ void TKafkaProduceActor::ProcessRequest(TPendingRequest::TPtr pendingRequest, co
399397
if (pendingRequest->WaitResultCookies.empty()) {
400398
// All request for unknown topic or empty request
401399
SendResults(ctx);
402-
} else {
403-
Become(&TKafkaProduceActor::StateAccepting);
404400
}
405401
}
406402

407-
void TKafkaProduceActor::HandleAccepting(TEvPartitionWriter::TEvWriteAccepted::TPtr request, const TActorContext& ctx) {
403+
void TKafkaProduceActor::Handle(TEvPartitionWriter::TEvWriteAccepted::TPtr request, const TActorContext& ctx) {
408404
auto r = request->Get();
409405
auto cookie = r->Cookie;
410406

@@ -422,12 +418,87 @@ void TKafkaProduceActor::HandleAccepting(TEvPartitionWriter::TEvWriteAccepted::T
422418
Become(&TKafkaProduceActor::StateWork);
423419
ProcessRequests(ctx);
424420
} else {
425-
KAFKA_LOG_W("Still in Accepting state after TEvPartitionWriter::TEvWriteAccepted cause cookies are expected: " << JoinSeq(", ", expectedCookies));
421+
KAFKA_LOG_W("Still in accepting after receive TEvPartitionWriter::TEvWriteAccepted cause cookies are expected: " << JoinSeq(", ", expectedCookies));
426422
}
427423
}
428424

429425
void TKafkaProduceActor::Handle(TEvPartitionWriter::TEvInitResult::TPtr request, const TActorContext& /*ctx*/) {
430426
KAFKA_LOG_D("Produce actor: Init " << request->Get()->ToString());
427+
428+
if (!request->Get()->IsSuccess()) {
429+
auto sender = request->Sender;
430+
431+
if (WriterDied(sender, EKafkaErrors::UNKNOWN_SERVER_ERROR, request->Get()->GetError().Reason)) {
432+
KAFKA_LOG_D("Produce actor: Received TEvPartitionWriter::TEvInitResult for " << sender << " with error: " << request->Get()->GetError().Reason);
433+
return;
434+
}
435+
436+
KAFKA_LOG_D("Produce actor: Received TEvPartitionWriter::TEvInitResult with unexpected writer " << sender);
437+
}
438+
}
439+
440+
void TKafkaProduceActor::Handle(TEvPartitionWriter::TEvDisconnected::TPtr request, const TActorContext& /*ctx*/) {
441+
auto sender = request->Sender;
442+
443+
if (WriterDied(sender, EKafkaErrors::NOT_LEADER_OR_FOLLOWER, TStringBuilder() << "Partition writer " << sender << " disconnected")) {
444+
KAFKA_LOG_D("Produce actor: Received TEvPartitionWriter::TEvDisconnected for " << sender);
445+
return;
446+
}
447+
448+
KAFKA_LOG_D("Produce actor: Received TEvPartitionWriter::TEvDisconnected with unexpected writer " << sender);
449+
}
450+
451+
bool TKafkaProduceActor::WriterDied(const TActorId& writerId, EKafkaErrors errorCode, TStringBuf errorMessage) {
452+
auto findAndCleanWriter = [&]() -> std::pair<TString, ui32> {
453+
for (auto it = TransactionalWriters.begin(); it != TransactionalWriters.end(); ++it) {
454+
if (it->second.ActorId == writerId) {
455+
auto id = it->first;
456+
CleanWriter(id, writerId);
457+
TransactionalWriters.erase(it);
458+
return {id.TopicPath, id.PartitionId};
459+
}
460+
}
461+
462+
for (auto& [topicPath, partitionWriters] : NonTransactionalWriters) {
463+
for (auto it = partitionWriters.begin(); it != partitionWriters.end(); ++it) {
464+
if (it->second.ActorId == writerId) {
465+
auto id = it->first;
466+
CleanWriter({topicPath, static_cast<ui32>(id)}, writerId);
467+
partitionWriters.erase(it);
468+
return {topicPath, static_cast<ui32>(id)};
469+
}
470+
}
471+
}
472+
473+
return {"", 0};
474+
};
475+
476+
auto [topicPath, partitionId] = findAndCleanWriter();
477+
if (topicPath.empty()) {
478+
return false;
479+
}
480+
481+
for (auto it = Cookies.begin(); it != Cookies.end();) {
482+
auto cookie = it->first;
483+
auto& info = it->second;
484+
485+
if (info.TopicPath == topicPath && info.PartitionId == partitionId) {
486+
info.Request->Results[info.Position].ErrorCode = errorCode;
487+
info.Request->Results[info.Position].ErrorMessage = errorMessage;
488+
info.Request->WaitAcceptingCookies.erase(cookie);
489+
info.Request->WaitResultCookies.erase(cookie);
490+
491+
if (info.Request->WaitAcceptingCookies.empty() && info.Request->WaitResultCookies.empty()) {
492+
SendResults(ActorContext());
493+
}
494+
495+
it = Cookies.erase(it);
496+
} else {
497+
++it;
498+
}
499+
}
500+
501+
return true;
431502
}
432503

433504
void TKafkaProduceActor::Handle(TEvPartitionWriter::TEvWriteResponse::TPtr request, const TActorContext& ctx) {
@@ -532,7 +603,8 @@ void TKafkaProduceActor::SendResults(const TActorContext& ctx) {
532603
size_t recordsCount = partitionData.Records.has_value() ? partitionData.Records->Records.size() : 0;
533604
partitionResponse.Index = partitionData.Index;
534605
if (EKafkaErrors::NONE_ERROR != result.ErrorCode) {
535-
KAFKA_LOG_ERROR("Produce actor: Partition result with error: ErrorCode=" << static_cast<int>(result.ErrorCode) << ", ErrorMessage=" << result.ErrorMessage << ", #01");
606+
KAFKA_LOG_ERROR("Produce actor: Partition result with error: ErrorCode=" << static_cast<int>(result.ErrorCode)
607+
<< ", ErrorMessage=" << result.ErrorMessage << ", #01");
536608
partitionResponse.ErrorCode = result.ErrorCode;
537609
metricsErrorCode = result.ErrorCode;
538610
partitionResponse.ErrorMessage = result.ErrorMessage;
@@ -584,20 +656,6 @@ void TKafkaProduceActor::SendResults(const TActorContext& ctx) {
584656

585657
Send(Context->ConnectionId, new TEvKafka::TEvResponse(correlationId, response, metricsErrorCode));
586658

587-
if (!pendingRequest->WaitAcceptingCookies.empty()) {
588-
if (!expired) {
589-
TStringBuilder sb;
590-
sb << "Produce actor: All TEvWriteResponse were received, but not all TEvWriteAccepted. Unreceived cookies:";
591-
for(auto cookie : pendingRequest->WaitAcceptingCookies) {
592-
sb << " " << cookie;
593-
}
594-
KAFKA_LOG_W(sb);
595-
}
596-
if (&TKafkaProduceActor::StateAccepting == CurrentStateFunc()) {
597-
Become(&TKafkaProduceActor::StateWork);
598-
}
599-
}
600-
601659
for(auto cookie : pendingRequest->WaitAcceptingCookies) {
602660
Cookies.erase(cookie);
603661
}
@@ -607,6 +665,8 @@ void TKafkaProduceActor::SendResults(const TActorContext& ctx) {
607665

608666
PendingRequests.pop_front();
609667
}
668+
669+
ProcessRequests(ctx);
610670
}
611671

612672
void TKafkaProduceActor::ProcessInitializationRequests(const TActorContext& ctx) {
@@ -681,18 +741,19 @@ void TKafkaProduceActor::SendWriteRequest(const TProduceRequestData::TTopicProdu
681741
auto& result = pendingRequest->Results[position];
682742
if (OK == writer.first) {
683743
auto ownCookie = ++Cookie;
684-
auto& cookieInfo = Cookies[ownCookie];
685-
cookieInfo.TopicPath = topicPath;
686-
cookieInfo.PartitionId = partitionId;
687-
cookieInfo.Position = position;
688-
cookieInfo.RuPerRequest = ruPerRequest;
689-
cookieInfo.Request = pendingRequest;
690-
691-
pendingRequest->WaitAcceptingCookies.insert(ownCookie);
692-
pendingRequest->WaitResultCookies.insert(ownCookie);
693744

694745
auto [error, ev] = Convert(transactionalId.GetOrElse(""), partitionData, topicPath, ownCookie, ClientDC, ruPerRequest);
695746
if (error == EKafkaErrors::NONE_ERROR) {
747+
auto& cookieInfo = Cookies[ownCookie];
748+
cookieInfo.TopicPath = topicPath;
749+
cookieInfo.PartitionId = partitionId;
750+
cookieInfo.Position = position;
751+
cookieInfo.RuPerRequest = ruPerRequest;
752+
cookieInfo.Request = pendingRequest;
753+
754+
pendingRequest->WaitAcceptingCookies.insert(ownCookie);
755+
pendingRequest->WaitResultCookies.insert(ownCookie);
756+
696757
ruPerRequest = false;
697758
KAFKA_LOG_T("Sending TEvPartitionWriter::TEvWriteRequest to " << writer.second << " with cookie " << ownCookie);
698759
Send(writer.second, std::move(ev));

ydb/core/kafka_proxy/actors/kafka_produce_actor.h

Lines changed: 11 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,6 @@ using namespace NKikimrClient;
2424
// Requests are processed in parallel, but it is guaranteed that the recording order will be preserved.
2525
// The order of responses to requests is also guaranteed.
2626
//
27-
// When the request begins to be processed, the actor enters the Accepting state. In this state, responses
28-
// are expected from all TPartitionWriters confirming acceptance of the request (TEvWriteAccepted). After that,
29-
// the actor switches back to the Work state. This guarantees the order of writing to each partition.
30-
//
3127
class TKafkaProduceActor: public NActors::TActorBootstrapped<TKafkaProduceActor> {
3228
struct TPendingRequest;
3329

@@ -52,9 +48,14 @@ class TKafkaProduceActor: public NActors::TActorBootstrapped<TKafkaProduceActor>
5248

5349
// Handlers for many StateFunc
5450
void Handle(TEvKafka::TEvWakeup::TPtr request, const TActorContext& ctx);
51+
52+
void Handle(TEvPartitionWriter::TEvWriteAccepted::TPtr request, const TActorContext& ctx);
5553
void Handle(TEvPartitionWriter::TEvWriteResponse::TPtr request, const TActorContext& ctx);
5654
void Handle(TEvPartitionWriter::TEvInitResult::TPtr request, const TActorContext& ctx);
55+
void Handle(TEvPartitionWriter::TEvDisconnected::TPtr request, const TActorContext& ctx);
56+
5757
void EnqueueRequest(TEvKafka::TEvProduceRequest::TPtr request, const TActorContext& ctx);
58+
5859
void Handle(TEvTxProxySchemeCache::TEvWatchNotifyDeleted::TPtr& ev, const TActorContext& ctx);
5960
void Handle(TEvTxProxySchemeCache::TEvWatchNotifyUpdated::TPtr& ev, const TActorContext& ctx);
6061

@@ -67,8 +68,11 @@ class TKafkaProduceActor: public NActors::TActorBootstrapped<TKafkaProduceActor>
6768
HFunc(TEvTxProxySchemeCache::TEvNavigateKeySetResult, HandleInit);
6869

6970
HFunc(TEvKafka::TEvProduceRequest, EnqueueRequest);
71+
7072
HFunc(TEvPartitionWriter::TEvInitResult, Handle);
73+
HFunc(TEvPartitionWriter::TEvWriteAccepted, Handle);
7174
HFunc(TEvPartitionWriter::TEvWriteResponse, Handle);
75+
HFunc(TEvPartitionWriter::TEvDisconnected, Handle);
7276

7377
HFunc(TEvTxProxySchemeCache::TEvWatchNotifyDeleted, Handle);
7478
HFunc(TEvTxProxySchemeCache::TEvWatchNotifyUpdated, Handle);
@@ -85,29 +89,11 @@ class TKafkaProduceActor: public NActors::TActorBootstrapped<TKafkaProduceActor>
8589
LogEvent(*ev.Get());
8690
switch (ev->GetTypeRewrite()) {
8791
HFunc(TEvKafka::TEvProduceRequest, Handle);
88-
HFunc(TEvPartitionWriter::TEvInitResult, Handle);
89-
HFunc(TEvPartitionWriter::TEvWriteResponse, Handle);
9092

91-
HFunc(TEvTxProxySchemeCache::TEvWatchNotifyDeleted, Handle);
92-
HFunc(TEvTxProxySchemeCache::TEvWatchNotifyUpdated, Handle);
93-
94-
HFunc(TEvKafka::TEvWakeup, Handle);
95-
sFunc(TEvents::TEvPoison, PassAway);
96-
}
97-
}
98-
99-
// StateAccepting - enqueue ProduceRequest parts to PartitionWriters
100-
// This guarantees the order of responses according order of request
101-
void HandleAccepting(TEvPartitionWriter::TEvWriteAccepted::TPtr request, const TActorContext& ctx);
102-
103-
STATEFN(StateAccepting) {
104-
LogEvent(*ev.Get());
105-
switch (ev->GetTypeRewrite()) {
106-
HFunc(TEvPartitionWriter::TEvWriteAccepted, HandleAccepting);
107-
108-
HFunc(TEvKafka::TEvProduceRequest, EnqueueRequest);
10993
HFunc(TEvPartitionWriter::TEvInitResult, Handle);
94+
HFunc(TEvPartitionWriter::TEvWriteAccepted, Handle);
11095
HFunc(TEvPartitionWriter::TEvWriteResponse, Handle);
96+
HFunc(TEvPartitionWriter::TEvDisconnected, Handle);
11197

11298
HFunc(TEvTxProxySchemeCache::TEvWatchNotifyDeleted, Handle);
11399
HFunc(TEvTxProxySchemeCache::TEvWatchNotifyUpdated, Handle);
@@ -117,7 +103,6 @@ class TKafkaProduceActor: public NActors::TActorBootstrapped<TKafkaProduceActor>
117103
}
118104
}
119105

120-
121106
// Logic
122107
void ProcessRequests(const TActorContext& ctx);
123108
void ProcessRequest(std::shared_ptr<TPendingRequest> pendingRequest, const TActorContext& ctx);
@@ -129,6 +114,7 @@ class TKafkaProduceActor: public NActors::TActorBootstrapped<TKafkaProduceActor>
129114
void CleanTopics(const TActorContext& ctx);
130115
void CleanWriters(const TActorContext& ctx);
131116
std::pair<ETopicStatus, TActorId> PartitionWriter(const TTopicPartition& topicPartition, const TProducerInstanceId& producerInstanceId, const TMaybe<TString>& transactionalId, const TActorContext& ctx);
117+
bool WriterDied(const TActorId& writerId, EKafkaErrors errorCode, TStringBuf errorMessage);
132118

133119
TString LogPrefix();
134120
void LogEvent(IEventHandle& ev);

ydb/core/kafka_proxy/ut/ut_produce_actor.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,5 +301,36 @@ namespace {
301301
UNIT_ASSERT(response != nullptr);
302302
UNIT_ASSERT_VALUES_EQUAL(response->ErrorCode, NKafka::EKafkaErrors::REQUEST_TIMED_OUT);
303303
}
304+
305+
Y_UNIT_TEST(OnProduce_andPipeDisconnected) {
306+
i64 producerId = 1;
307+
i32 producerEpoch = 2;
308+
309+
int writeRequestsCounter = 0;
310+
int poisonPillCounter = 0;
311+
312+
auto observer = [&](TAutoPtr<IEventHandle>& input) {
313+
if (input->CastAsLocal<TEvPartitionWriter::TEvWriteRequest>()) {
314+
if (writeRequestsCounter++ == 0) {
315+
auto r = std::make_unique<TEvPartitionWriter::TEvDisconnected>(TEvPartitionWriter::TEvWriteResponse::EErrorCode::InternalError);
316+
Ctx->Runtime->Send(new IEventHandle(input->Sender, input->Recipient, r.release()));
317+
return TTestActorRuntimeBase::EEventAction::DROP;
318+
}
319+
} else if (input->CastAsLocal<TEvents::TEvPoison>()) {
320+
poisonPillCounter++;
321+
}
322+
323+
return TTestActorRuntimeBase::EEventAction::PROCESS;
324+
};
325+
326+
Ctx->Runtime->SetObserverFunc(observer);
327+
328+
SendProduce({}, producerId, producerEpoch);
329+
330+
auto response = Ctx->Runtime->GrabEdgeEvent<NKafka::TEvKafka::TEvResponse>();
331+
UNIT_ASSERT(response);
332+
UNIT_ASSERT_VALUES_EQUAL(response->ErrorCode, NKafka::EKafkaErrors::NOT_LEADER_OR_FOLLOWER);
333+
UNIT_ASSERT_VALUES_EQUAL(std::dynamic_pointer_cast<NKafka::TProduceResponseData>(response->Response)->Responses[0].PartitionResponses[0].ErrorCode, NKafka::EKafkaErrors::NOT_LEADER_OR_FOLLOWER);
334+
}
304335
}
305336
} // anonymous namespace

0 commit comments

Comments
 (0)