Nacos 是如何剔除非健康实例的

Nacos Client 心跳上报

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/**
 * Periodic driver for client heartbeats.
 *
 * <p>On every run it walks the beats held in {@code dom2Beat}, submits a {@link BeatTask}
 * for each beat that is not already flagged as scheduled, and then re-submits itself to
 * run again after {@code clientBeatInterval} milliseconds.
 */
class BeatProcessor implements Runnable {

    @Override
    public void run() {
        try {
            for (BeatInfo pending : dom2Beat.values()) {
                if (!pending.isScheduled()) {
                    // Flag the beat before submitting it so a later pass does not submit it twice.
                    pending.setScheduled(true);
                    executorService.schedule(new BeatTask(pending), 0, TimeUnit.MILLISECONDS);
                }
            }
        } catch (Exception e) {
            NAMING_LOGGER.error("[CLIENT-BEAT] Exception while scheduling beat.", e);
        } finally {
            // Re-arm in finally so a failure in this pass does not stop the heartbeat loop.
            executorService.schedule(this, clientBeatInterval, TimeUnit.MILLISECONDS);
        }
    }
}

/**
 * One-shot task that reports a single heartbeat through {@code serverProxy.sendBeat}.
 *
 * <p>When the call returns a positive value, that value replaces {@code clientBeatInterval}.
 */
class BeatTask implements Runnable {

    BeatInfo beatInfo;

    public BeatTask(BeatInfo beatInfo) {
        this.beatInfo = beatInfo;
    }

    @Override
    public void run() {
        long result;
        try {
            result = serverProxy.sendBeat(beatInfo);
        } finally {
            // Always clear the flag, even when sendBeat throws: BeatProcessor skips every
            // beat whose isScheduled() is still true, so leaving it set would silently stop
            // all further heartbeats for this instance.
            beatInfo.setScheduled(false);
        }
        if (result > 0) {
            clientBeatInterval = result;
        }
    }
}

上述代码就是 Nacos Client 向 Nacos Naming Server 上报自己心跳信息的逻辑:BeatProcessor 定时为每个尚未调度的 BeatInfo 提交一个 BeatTask,再由 BeatTask 调用 serverProxy.sendBeat 完成上报

Nacos Naming Server 处理心跳

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/**
 * Handles a heartbeat sent by a client via {@code PUT /beat}.
 *
 * <p>The response always carries {@code clientBeatInterval}. If no instance matching the
 * beat is known, one is built from the beat payload and registered before the beat is
 * handed to the owning service for processing.
 *
 * @param request the HTTP request; must carry the {@code beat} parameter and the service name
 * @return a JSON object containing {@code clientBeatInterval}
 * @throws Exception if a required parameter is missing or the service cannot be found
 */
@CanDistro
@RequestMapping(value = "/beat", method = RequestMethod.PUT)
public JSONObject beat(HttpServletRequest request) throws Exception {

    JSONObject response = new JSONObject();

    // Tell the client which interval to use between subsequent heartbeats.
    response.put("clientBeatInterval", switchDomain.getClientBeatInterval());

    String namespaceId = WebUtils.optional(request, CommonParams.NAMESPACE_ID, Constants.DEFAULT_NAMESPACE_ID);
    String beatJson = WebUtils.required(request, "beat");
    // Parse the raw beat payload into an RsInfo; it is passed on to the processing task below.
    RsInfo clientBeat = JSON.parseObject(beatJson, RsInfo.class);

    // Neither the server default nor the beat itself says "ephemeral": nothing more to do.
    boolean ephemeralBeat = switchDomain.isDefaultInstanceEphemeral() || clientBeat.isEphemeral();
    if (!ephemeralBeat) {
        return response;
    }

    if (StringUtils.isBlank(clientBeat.getCluster())) {
        clientBeat.setCluster(UtilsAndCommons.DEFAULT_CLUSTER_NAME);
    }
    String serviceName = WebUtils.required(request, CommonParams.SERVICE_NAME);
    String clusterName = clientBeat.getCluster();

    if (Loggers.DEBUG_LOG.isDebugEnabled()) {
        Loggers.DEBUG_LOG.debug("[CLIENT-BEAT] full arguments: beat: {}, serviceName: {}", clientBeat, serviceName);
    }

    // Look up the instance this beat belongs to.
    Instance known = serviceManager.getInstance(
        namespaceId, serviceName, clusterName, clientBeat.getIp(), clientBeat.getPort());

    if (known == null) {
        // No such instance yet: build one from the beat and register it. Per the original
        // note this is either the first beat or a compensation for an instance that was
        // removed earlier after failing to renew.
        Instance fromBeat = new Instance();
        fromBeat.setPort(clientBeat.getPort());
        fromBeat.setIp(clientBeat.getIp());
        fromBeat.setWeight(clientBeat.getWeight());
        fromBeat.setMetadata(clientBeat.getMetadata());
        fromBeat.setClusterName(clusterName);
        fromBeat.setServiceName(serviceName);
        fromBeat.setInstanceId(fromBeat.generateInstanceId());
        fromBeat.setEphemeral(clientBeat.isEphemeral());

        serviceManager.registerInstance(namespaceId, serviceName, fromBeat);
    }

    Service owner = serviceManager.getService(namespaceId, serviceName);
    if (owner == null) {
        throw new NacosException(NacosException.SERVER_ERROR, "service not found: " + serviceName + "@" + namespaceId);
    }

    // Hand the beat over to the service for processing.
    owner.processClientBeat(clientBeat);
    return response;
}

在 beat 接口处理流程的末尾,会调用 service.processClientBeat,创建一个任务来处理本次客户端的心跳包

1
2
3
4
5
6
/**
 * Wraps the given beat in a {@code ClientBeatProcessor} bound to this service and submits
 * it through {@code HealthCheckReactor.scheduleNow}.
 *
 * @param rsInfo the client beat to process
 */
public void processClientBeat(final RsInfo rsInfo) {
    ClientBeatProcessor beatJob = new ClientBeatProcessor();
    beatJob.setService(this);
    beatJob.setRsInfo(rsInfo);
    HealthCheckReactor.scheduleNow(beatJob);
}

该任务执行时会刷新对应实例的最近一次心跳时间;如果实例由非健康状态恢复为健康状态,则通知该服务的所有订阅者

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/**
 * Applies one client beat to the matching instance(s) of the service.
 *
 * <p>Every instance in the beat's cluster whose IP and port equal those of the beat gets
 * its last-beat timestamp refreshed. An unmarked instance that was unhealthy is switched
 * back to healthy, and the push service is told that the service changed.
 */
@Override
public void run() {
    Service service = this.service;
    if (Loggers.EVT_LOG.isDebugEnabled()) {
        Loggers.EVT_LOG.debug("[CLIENT-BEAT] processing beat: {}", rsInfo.toString());
    }

    String beatIp = rsInfo.getIp();
    String beatCluster = rsInfo.getCluster();
    int beatPort = rsInfo.getPort();

    // Resolve the cluster named in the beat, then list its instances
    // (allIPs(true): the ephemeral ones, per the original note).
    Cluster cluster = service.getClusterMap().get(beatCluster);
    List<Instance> candidates = cluster.allIPs(true);

    for (Instance candidate : candidates) {
        boolean sameEndpoint = candidate.getIp().equals(beatIp) && candidate.getPort() == beatPort;
        if (!sameEndpoint) {
            continue;
        }

        if (Loggers.EVT_LOG.isDebugEnabled()) {
            Loggers.EVT_LOG.debug("[CLIENT-BEAT] refresh beat: {}", rsInfo.toString());
        }
        // Record when this instance was last heard from.
        candidate.setLastBeat(System.currentTimeMillis());

        // Marked instances keep their health state; healthy ones need no change.
        if (candidate.isMarked() || candidate.isHealthy()) {
            continue;
        }

        candidate.setHealthy(true);
        Loggers.EVT_LOG.info("service: {} {POS} {IP-ENABLED} valid: {}:{}@{}, region: {}, msg: client beat ok", cluster.getService().getName(), beatIp, beatPort, cluster.getName(), UtilsAndCommons.LOCALHOST_SITE);
        // Notify subscribers of this service that its data changed.
        getPushService().serviceChanged(service.getNamespaceId(), service.getName());
    }
}

剔除非健康实例

这个时候就涉及 Service 这个对象了:在创建 Service 时(见下面的 createEmptyService),会判断实例类型是否为临时实例(对应代码中的 local 参数),如果是,则会调用 service.init() 开启一个定时任务

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
// ServiceManager
/**
 * Creates an empty service for the given namespace and name unless one already exists.
 *
 * <p>When {@code local} is true the service is stored with {@code putService}, initialised
 * via {@code init()}, and registered as a listener on both instance-list keys; otherwise
 * it is handed to {@code addOrReplaceService}.
 *
 * @param namespaceId namespace the service belongs to
 * @param serviceName name of the service
 * @param local       selects the local path (put + init + listen) over addOrReplaceService;
 *                    the original note associates this path with ephemeral instances
 * @throws NacosException propagated from the calls made here, e.g. validation
 */
public void createEmptyService(String namespaceId, String serviceName, boolean local) throws NacosException {
    if (getService(namespaceId, serviceName) != null) {
        return;
    }

    Loggers.SRV_LOG.info("creating empty service {}:{}", namespaceId, serviceName);
    Service created = new Service();
    created.setName(serviceName);
    created.setNamespaceId(namespaceId);
    created.setGroupName(NamingUtils.getGroupName(serviceName));
    created.setLastModifiedMillis(System.currentTimeMillis());
    created.recalculateChecksum();
    // Validate the service now; an exception is thrown if validation fails.
    created.validate();

    if (!local) {
        addOrReplaceService(created);
        return;
    }

    // Local path: store the service, then init() it. Service.init() is what schedules the
    // client-beat check task used to track instance health.
    putService(created);
    created.init();
    String keyForTrue = KeyBuilder.buildInstanceListKey(created.getNamespaceId(), created.getName(), true);
    consistencyService.listen(keyForTrue, created);
    String keyForFalse = KeyBuilder.buildInstanceListKey(created.getNamespaceId(), created.getName(), false);
    consistencyService.listen(keyForFalse, created);
}

// Service
/**
 * Initialises the service: schedules its client-beat check task, then binds every cluster
 * in {@code clusterMap} to this service and initialises it.
 */
public void init() {

    // This is where the periodically executed check task gets scheduled.
    HealthCheckReactor.scheduleCheck(clientBeatCheckTask);

    for (Cluster cluster : clusterMap.values()) {
        cluster.setService(this);
        cluster.init();
    }
}

这个ClientBeatCheckTask根据名字就知道,它是用于处理实例心跳信息检查的,也就是实例健康处理的任务

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
/**
 * Periodic check of the client beats of one service.
 *
 * <p>Each run first flags instances whose last beat is older than
 * {@code ClientBeatProcessor.CLIENT_BEAT_TIMEOUT} as unhealthy. If instance expiry is
 * enabled, it then requests deletion of instances whose last beat is older than the
 * service's IP delete timeout.
 */
public class ClientBeatCheckTask implements Runnable {

    // The service whose instances this task checks.
    private Service service;

    public ClientBeatCheckTask(Service service) {
        this.service = service;
    }

    /** Looks up the push service used to notify subscribers. */
    @JSONField(serialize = false)
    public PushService getPushService() {
        return SpringContext.getAppContext().getBean(PushService.class);
    }

    /** Looks up the mapper used to decide whether this server is responsible for a service. */
    @JSONField(serialize = false)
    public DistroMapper getDistroMapper() {
        return SpringContext.getAppContext().getBean(DistroMapper.class);
    }

    /** Looks up the global configuration bean. */
    public GlobalConfig getGlobalConfig() {
        return SpringContext.getAppContext().getBean(GlobalConfig.class);
    }

    /** Returns the key of this task, which is the name of the checked service. */
    public String taskKey() {
        return service.getName();
    }

    @Override
    public void run() {
        try {
            // Only the server responsible for this service processes it.
            if (!getDistroMapper().responsible(service.getName())) {
                return;
            }

            // allIPs(true): the ephemeral instances of the service, per the original note.
            List<Instance> instances = service.allIPs(true);

            // First set the health status of the instances.
            markTimedOutInstances(instances);

            // Stop here unless instance expiry is switched on.
            if (!getGlobalConfig().isExpireInstance()) {
                return;
            }

            // Then remove obsolete instances.
            removeExpiredInstances(instances);
        } catch (Exception e) {
            Loggers.SRV_LOG.warn("Exception while processing client beat time out.", e);
        }
    }

    /** Flags unmarked, healthy instances as unhealthy once their last beat has timed out. */
    private void markTimedOutInstances(List<Instance> instances) {
        for (Instance instance : instances) {
            boolean timedOut =
                System.currentTimeMillis() - instance.getLastBeat() > ClientBeatProcessor.CLIENT_BEAT_TIMEOUT;
            if (!timedOut || instance.isMarked() || !instance.isHealthy()) {
                continue;
            }

            instance.setHealthy(false);
            Loggers.EVT_LOG.info("{POS} {IP-DISABLED} valid: {}:{}@{}@{}, region: {}, msg: client timeout after {}, last beat: {}",
                instance.getIp(), instance.getPort(), instance.getClusterName(), service.getName(), UtilsAndCommons.LOCALHOST_SITE, ClientBeatProcessor.CLIENT_BEAT_TIMEOUT, instance.getLastBeat());
            // Tell subscribers that the service data changed.
            getPushService().serviceChanged(service.getNamespaceId(), service.getName());
        }
    }

    /** Requests deletion of instances whose last beat is older than the IP delete timeout. */
    private void removeExpiredInstances(List<Instance> instances) {
        for (Instance instance : instances) {
            // An IP is deleted if it has not sent a beat for some time
            // (the original note gives 30 seconds as the default timeout).
            boolean expired = System.currentTimeMillis() - instance.getLastBeat() > service.getIpDeleteTimeout();
            if (expired) {
                Loggers.SRV_LOG.info("[AUTO-DELETE-IP] service: {}, ip: {}", service.getName(), JSON.toJSONString(instance));
                deleteIP(instance);
            }
        }
    }

    /**
     * Sends an asynchronous HTTP DELETE for the instance to this server's own
     * {@code /instance} endpoint on 127.0.0.1. Failures are logged, not rethrown.
     */
    private void deleteIP(Instance instance) {

        try {
            // Build the delete request parameters describing the instance.
            NamingProxy.Request deleteRequest = NamingProxy.Request.newRequest();
            deleteRequest.appendParam("ip", instance.getIp())
                .appendParam("port", String.valueOf(instance.getPort()))
                .appendParam("ephemeral", "true")
                .appendParam("clusterName", instance.getClusterName())
                .appendParam("serviceName", service.getName())
                .appendParam("namespaceId", service.getNamespaceId());

            // Target is this server itself, via the loopback address.
            String target = "http://127.0.0.1:" + RunningConfig.getServerPort() + RunningConfig.getContextPath() + UtilsAndCommons.NACOS_NAMING_CONTEXT + "/instance?" + deleteRequest.toUrl();

            // Fire the delete asynchronously; the handler only reports non-200 responses.
            HttpClient.asyncHttpDelete(target, null, null, new AsyncCompletionHandler() {
                @Override
                public Object onCompleted(Response response) throws Exception {
                    if (response.getStatusCode() == HttpURLConnection.HTTP_OK) {
                        return null;
                    }
                    Loggers.SRV_LOG.error("[IP-DEAD] failed to delete ip automatically, ip: {}, caused {}, resp code: {}", instance.toJSON(), response.getResponseBody(), response.getStatusCode());
                    return null;
                }
            });
        } catch (Exception e) {
            Loggers.SRV_LOG.error("[IP-DEAD] failed to delete ip automatically, ip: {}, error: {}", instance.toJSON(), e);
        }
    }
}