-
Notifications
You must be signed in to change notification settings - Fork 667
fix(grpc): Add keepalive and fix reconnect issue #777
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c534c26
32fd222
c1b8ab4
67a44dc
f5c2641
af2ddd4
8f3c2a1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -57,6 +57,7 @@ public class GRPCChannelManager implements BootService, Runnable { | |
| private volatile List<String> grpcServers; | ||
| private volatile int selectedIdx = -1; | ||
| private volatile int reconnectCount = 0; | ||
| private final Object statusLock = new Object(); | ||
|
|
||
| @Override | ||
| public void prepare() { | ||
|
|
@@ -99,7 +100,10 @@ public void shutdown() { | |
|
|
||
| @Override | ||
| public void run() { | ||
| LOGGER.debug("Selected collector grpc service running, reconnect:{}.", reconnect); | ||
| if (reconnect) { | ||
| LOGGER.warn("Selected collector grpc service running, reconnect:{}.", reconnect); | ||
| } | ||
|
|
||
| if (IS_RESOLVE_DNS_PERIODICALLY && reconnect) { | ||
| grpcServers = Arrays.stream(Config.Collector.BACKEND_SERVICE.split(",")) | ||
| .filter(StringUtil::isNotBlank) | ||
|
|
@@ -130,32 +134,28 @@ public void run() { | |
| String server = ""; | ||
| try { | ||
| int index = Math.abs(random.nextInt()) % grpcServers.size(); | ||
| server = grpcServers.get(index); | ||
| String[] ipAndPort = server.split(":"); | ||
|
|
||
| if (index != selectedIdx) { | ||
| selectedIdx = index; | ||
| LOGGER.debug("Connecting to different gRPC server {}. Shutting down existing channel if any.", server); | ||
| createNewChannel(ipAndPort[0], Integer.parseInt(ipAndPort[1])); | ||
| } else { | ||
| // Same server, increment reconnectCount | ||
| reconnectCount++; | ||
wu-sheng marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| server = grpcServers.get(index); | ||
| String[] ipAndPort = server.split(":"); | ||
|
|
||
| if (managedChannel != null) { | ||
| managedChannel.shutdownNow(); | ||
| if (reconnectCount > Config.Agent.FORCE_RECONNECTION_PERIOD) { | ||
| // Reconnect attempts exceeded threshold, force rebuild channel | ||
| LOGGER.warn("Reconnect attempts to {} exceeded threshold ({}), forcing channel rebuild", | ||
| server, Config.Agent.FORCE_RECONNECTION_PERIOD); | ||
| createNewChannel(ipAndPort[0], Integer.parseInt(ipAndPort[1])); | ||
| } else if (managedChannel.isConnected(false)) { | ||
| // Channel appears connected, trust it but keep reconnectCount for monitoring | ||
| LOGGER.debug("Channel to {} appears connected (reconnect attempt: {})", server, reconnectCount); | ||
| notifyConnected(); | ||
| } | ||
|
|
||
| managedChannel = GRPCChannel.newBuilder(ipAndPort[0], Integer.parseInt(ipAndPort[1])) | ||
| .addManagedChannelBuilder(new StandardChannelBuilder()) | ||
| .addManagedChannelBuilder(new TLSChannelBuilder()) | ||
| .addChannelDecorator(new AgentIDDecorator()) | ||
| .addChannelDecorator(new AuthenticationDecorator()) | ||
| .build(); | ||
| reconnectCount = 0; | ||
| reconnect = false; | ||
| notify(GRPCChannelStatus.CONNECTED); | ||
| } else if (managedChannel.isConnected(++reconnectCount > Config.Agent.FORCE_RECONNECTION_PERIOD)) { | ||
| // Reconnect to the same server is automatically done by GRPC, | ||
| // therefore we are responsible to check the connectivity and | ||
| // set the state and notify listeners | ||
| reconnectCount = 0; | ||
| reconnect = false; | ||
| notify(GRPCChannelStatus.CONNECTED); | ||
| // else: Channel is disconnected and under threshold, wait for next retry | ||
| } | ||
|
|
||
| return; | ||
|
|
@@ -184,8 +184,7 @@ public Channel getChannel() { | |
| */ | ||
| public void reportError(Throwable throwable) { | ||
| if (isNetworkError(throwable)) { | ||
| reconnect = true; | ||
| notify(GRPCChannelStatus.DISCONNECT); | ||
| triggerReconnect(); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -199,6 +198,49 @@ private void notify(GRPCChannelStatus status) { | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Create a new gRPC channel to the specified server and reset connection state. | ||
| * <p> | ||
| * Any existing {@code managedChannel} is shut down with {@code shutdownNow()} before the | ||
| * replacement is built. The new channel is assembled from the standard and TLS channel | ||
| * builders and decorated with the agent ID and authentication decorators. Once the build | ||
| * returns, {@code reconnectCount} is reset to 0 and {@code notifyConnected()} is called. | ||
| * <p> | ||
| * NOTE(review): the old channel is shut down before the new one is built, so if the build | ||
| * throws, {@code managedChannel} still refers to the shut-down channel and neither the | ||
| * counter reset nor the notification happens - confirm the caller handles this. | ||
| * | ||
| * @param host host passed to {@code GRPCChannel.newBuilder} | ||
| * @param port port passed to {@code GRPCChannel.newBuilder} | ||
| * @throws Exception declared by this method; presumably propagated from the channel | ||
| * build - TODO confirm against the builder | ||
| */ | ||
| private void createNewChannel(String host, int port) throws Exception { | ||
| if (managedChannel != null) { | ||
| managedChannel.shutdownNow(); | ||
| } | ||
|
|
||
| managedChannel = GRPCChannel.newBuilder(host, port) | ||
| .addManagedChannelBuilder(new StandardChannelBuilder()) | ||
| .addManagedChannelBuilder(new TLSChannelBuilder()) | ||
| .addChannelDecorator(new AgentIDDecorator()) | ||
| .addChannelDecorator(new AuthenticationDecorator()) | ||
| .build(); | ||
|
|
||
| // Reset reconnectCount after actually rebuilding the channel | ||
| reconnectCount = 0; | ||
| notifyConnected(); | ||
| } | ||
|
|
||
| /** | ||
| * Trigger reconnection by setting reconnect flag and notifying listeners. | ||
| * <p> | ||
| * Both steps run while holding {@code statusLock}: {@code reconnect} is set to | ||
| * {@code true}, then {@code GRPCChannelStatus.DISCONNECT} is passed to {@code notify}. | ||
| * Called from {@code reportError} when the reported throwable is a network error. | ||
| */ | ||
| private void triggerReconnect() { | ||
| synchronized (statusLock) { | ||
| reconnect = true; | ||
| notify(GRPCChannelStatus.DISCONNECT); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Notify listeners that connection is established without resetting reconnectCount. | ||
| * This is used when the channel appears connected but we want to keep monitoring | ||
| * reconnect attempts in case it's a false positive (half-open connection). | ||
| */ | ||
| private void notifyConnected() { | ||
| synchronized (statusLock) { | ||
| // Don't reset reconnectCount - connection might still be half-open | ||
|
Comment on lines
+231
to
+238
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am a little confused about that. If you need to check for a half-open connection, why not check it directly?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Isn't there a way you can determine whether the server is reachable? I am a little confused. Still no
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I haven't found a reliable way to directly detect half-open connections before they cause issues. What I observed: The change: // Original code
} else if (managedChannel.isConnected(++reconnectCount > 5)) {
reconnectCount = 0; // Reset here - problem!
reconnect = false;
}
When Now:
if (reconnectCount > Config.Agent.FORCE_RECONNECTION_PERIOD) {
createNewChannel(...); // Force rebuild
} else if (managedChannel.isConnected(false)) {
notifyConnected(); // Don't reset reconnectCount
}
Do you think there's a better approach to detect half-open connections directly?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think once any request is made successfully, the channel is good.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can't you check connection status as Could you recheck the logic? This change and its context do not seem to be consistent with the gRPC concept.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why can't you just add the TRANSIENT_FAILURE status to the isNetworkError method? Then a reconnection could be triggered when the status changes. Please check the runtime behavior more. |
||
| reconnect = false; | ||
| notify(GRPCChannelStatus.CONNECTED); | ||
| } | ||
| } | ||
|
|
||
| private boolean isNetworkError(Throwable throwable) { | ||
| if (throwable instanceof StatusRuntimeException) { | ||
| StatusRuntimeException statusRuntimeException = (StatusRuntimeException) throwable; | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.