blob: 1365a0172751e3ddf90568191c7b8a29aa42f969 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.admin;
import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.solr.cloud.CloudDescriptor;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.Replica.State;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.handler.RequestHandlerBase;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.common.params.CommonParams.FAILURE;
import static org.apache.solr.common.params.CommonParams.OK;
import static org.apache.solr.common.params.CommonParams.STATUS;
/**
* Health Check Handler for reporting the health of a specific node.
*
* <p>
* By default the handler returns status <code>200 OK</code> if all checks succeed, else it returns
* status <code>503 UNAVAILABLE</code>:
* </p>
* <ol>
* <li>Cores container is active.</li>
* <li>Node connected to zookeeper.</li>
* <li>Node listed in <code>live_nodes</code> in zookeeper.</li>
* </ol>
*
* <p>
* The handler takes an optional request parameter <code>requireHealthyCores=true</code>
* which will also require that all local cores that are part of an <b>active shard</b>
* are done initializing, i.e. not in states <code>RECOVERING</code> or <code>DOWN</code>.
* This parameter is designed to help during rolling restarts, to make sure each node
* is fully initialized and stable before proceeding with restarting the next node, and thus
* reduce the risk of restarting the last live replica of a shard.
* </p>
*/
public class HealthCheckHandler extends RequestHandlerBase {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /** Name of the optional boolean request parameter that turns on the per-core health check. */
  private static final String PARAM_REQUIRE_HEALTHY_CORES = "requireHealthyCores";

  /** Last-published replica states that mark a local core as a candidate for "unhealthy". */
  private static final List<State> UNHEALTHY_STATES = Arrays.asList(State.DOWN, State.RECOVERING);

  /** Container whose health this handler reports; supplied through the constructor. */
  CoreContainer coreContainer;

  /**
   * Creates a handler that reports on the given container.
   *
   * @param coreContainer the container to inspect; a {@code null} value is tolerated here and is
   *                      reported as an error when a request is handled
   */
  public HealthCheckHandler(final CoreContainer coreContainer) {
    this.coreContainer = coreContainer;
  }

  /**
   * No-op: the supplied arguments are ignored.
   *
   * @param args ignored
   */
  @Override
  public final void init(@SuppressWarnings({"rawtypes"}) NamedList args) {
  }

  /**
   * @return the container passed to the constructor
   */
  public CoreContainer getCoreContainer() {
    return this.coreContainer;
  }

  /**
   * Runs the health checks in order and stops at the first one that fails:
   * <ol>
   * <li>the core container is present and not shut down (otherwise <code>SERVER_ERROR</code>),</li>
   * <li>the container is ZooKeeper aware (otherwise <code>BAD_REQUEST</code>),</li>
   * <li>the ZooKeeper client is neither closed nor disconnected,</li>
   * <li>this node's name is contained in the cluster state's live nodes,</li>
   * <li>only if <code>requireHealthyCores=true</code>: {@link #findUnhealthyCores} returns 0.</li>
   * </ol>
   * A failure of checks 3-5 adds <code>status=FAILURE</code> to the response and sets a
   * <code>SERVICE_UNAVAILABLE</code> exception on it. If nothing fails, <code>status=OK</code> is added.
   *
   * @param req the request; only the <code>requireHealthyCores</code> parameter is read
   * @param rsp the response that receives the status entries and, on failure, the exception
   */
  @Override
  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
    CoreContainer cores = getCoreContainer();
    rsp.setHttpCaching(false);

    // Core container should not be null and active (redundant check)
    if (cores == null || cores.isShutDown()) {
      rsp.setException(new SolrException(SolrException.ErrorCode.SERVER_ERROR,
          "CoreContainer is either not initialized or shutting down"));
      return;
    }
    if (!cores.isZooKeeperAware()) {
      //TODO: Support standalone instances
      rsp.setException(new SolrException(SolrException.ErrorCode.BAD_REQUEST,
          "Health check is only available when running in SolrCloud mode"));
      return;
    }

    // Resolve the node name once, from the container that was validated above, so that the log
    // line and the live_nodes check below are guaranteed to talk about the same node.
    final String nodeName = cores.getZkController().getNodeName();
    log.debug("Invoked HealthCheckHandler on [{}]", nodeName);

    ZkStateReader zkStateReader = cores.getZkController().getZkStateReader();
    ClusterState clusterState = zkStateReader.getClusterState();

    // Check for isConnected and isClosed
    if (zkStateReader.getZkClient().isClosed() || !zkStateReader.getZkClient().isConnected()) {
      rsp.add(STATUS, FAILURE);
      rsp.setException(new SolrException(SolrException.ErrorCode.SERVICE_UNAVAILABLE,
          "Host Unavailable: Not connected to zk"));
      return;
    }

    // Fail if not in live_nodes
    if (!clusterState.getLiveNodes().contains(nodeName)) {
      rsp.add(STATUS, FAILURE);
      rsp.setException(new SolrException(SolrException.ErrorCode.SERVICE_UNAVAILABLE,
          "Host Unavailable: Not in live nodes as per zk"));
      return;
    }

    // Optionally require that all cores on this node are active if param 'requireHealthyCores=true'
    if (req.getParams().getBool(PARAM_REQUIRE_HEALTHY_CORES, false)) {
      Collection<CloudDescriptor> coreDescriptors = cores.getCores().stream()
          .map(c -> c.getCoreDescriptor().getCloudDescriptor())
          .collect(Collectors.toList());
      long unhealthyCores = findUnhealthyCores(coreDescriptors, clusterState);
      if (unhealthyCores > 0) {
        rsp.add(STATUS, FAILURE);
        rsp.add("num_cores_unhealthy", unhealthyCores);
        rsp.setException(new SolrException(SolrException.ErrorCode.SERVICE_UNAVAILABLE, unhealthyCores + " out of "
            + cores.getNumAllCores() + " replicas are currently initializing or recovering"));
        return;
      }
      rsp.add("message", "All cores are healthy");
    }

    // All lights green, report healthy
    rsp.add(STATUS, OK);
  }

  /**
   * Counts the given local cores that are considered unhealthy.
   * <p>
   * A core is counted when all of the following hold:
   * </p>
   * <ol>
   * <li>it has not registered, or its last published state is <code>DOWN</code> or
   * <code>RECOVERING</code>;</li>
   * <li>its collection is present in the supplied cluster state;</li>
   * <li>its shard id is a key of that collection's active slices map.</li>
   * </ol>
   * Only the descriptors passed in are examined; replicas that appear in the cluster state but
   * are not in <code>cores</code> are never counted.
   *
   * @param cores list of core cloud descriptors to iterate
   * @param clusterState clusterstate from ZK
   * @return number of descriptors in <code>cores</code> that satisfy all three conditions
   */
  static long findUnhealthyCores(Collection<CloudDescriptor> cores, ClusterState clusterState) {
    return cores.stream()
        // Find candidates locally
        .filter(c -> !c.hasRegistered() || UNHEALTHY_STATES.contains(c.getLastPublished()))
        // Only care about cores for actual collections
        .filter(c -> clusterState.hasCollection(c.getCollectionName()))
        // ...and only those whose shard is listed among the collection's active slices
        .filter(c -> clusterState.getCollection(c.getCollectionName()).getActiveSlicesMap().containsKey(c.getShardId()))
        .count();
  }

  /**
   * @return a fixed, human-readable description of this handler
   */
  @Override
  public String getDescription() {
    return "Health check handler for SolrCloud node";
  }

  /**
   * @return always {@link Category#ADMIN}
   */
  @Override
  public Category getCategory() {
    return Category.ADMIN;
  }

  /**
   * @return always {@link Boolean#TRUE}
   */
  @Override
  public Boolean registerV2() {
    return Boolean.TRUE;
  }
}