001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase; 019 020import java.io.Closeable; 021import java.io.IOException; 022import org.apache.hadoop.conf.Configurable; 023import org.apache.hadoop.conf.Configuration; 024import org.apache.hadoop.hbase.client.RegionInfoBuilder; 025import org.apache.hadoop.hbase.util.Threads; 026import org.apache.yetus.audience.InterfaceAudience; 027import org.slf4j.Logger; 028import org.slf4j.LoggerFactory; 029 030import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.AdminService; 031import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.ClientService; 032import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.MasterService; 033 034/** 035 * This class defines methods that can help with managing HBase clusters from unit tests and system 036 * tests. There are 3 types of cluster deployments: 037 * <ul> 038 * <li><b>MiniHBaseCluster:</b> each server is run in the same JVM in separate threads, used by unit 039 * tests</li> 040 * <li><b>DistributedHBaseCluster:</b> the cluster is pre-deployed, system and integration tests can 041 * interact with the cluster.</li> 042 * <li><b>ProcessBasedLocalHBaseCluster:</b> each server is deployed locally but in separate JVMs. 043 * </li> 044 * </ul> 045 * <p> 046 * HBaseCluster unifies the way tests interact with the cluster, so that the same test can be run 047 * against a mini-cluster during unit test execution, or a distributed cluster having tens/hundreds 048 * of nodes during execution of integration tests. 049 * <p> 050 * HBaseCluster exposes client-side public interfaces to tests, so that tests does not assume 051 * running in a particular mode. Not all the tests are suitable to be run on an actual cluster, and 052 * some tests will still need to mock stuff and introspect internal state. For those use cases from 053 * unit tests, or if more control is needed, you can use the subclasses directly. In that sense, 054 * this class does not abstract away <strong>every</strong> interface that MiniHBaseCluster or 055 * DistributedHBaseCluster provide. 056 */ 057@InterfaceAudience.Public 058public abstract class HBaseCluster implements Closeable, Configurable { 059 // Log is being used in DistributedHBaseCluster class, hence keeping it as package scope 060 static final Logger LOG = LoggerFactory.getLogger(HBaseCluster.class.getName()); 061 protected Configuration conf; 062 063 /** the status of the cluster before we begin */ 064 protected ClusterMetrics initialClusterStatus; 065 066 /** 067 * Construct an HBaseCluster 068 * @param conf Configuration to be used for cluster 069 */ 070 public HBaseCluster(Configuration conf) { 071 setConf(conf); 072 } 073 074 @Override 075 public void setConf(Configuration conf) { 076 this.conf = conf; 077 } 078 079 @Override 080 public Configuration getConf() { 081 return conf; 082 } 083 084 /** 085 * Returns a ClusterMetrics for this HBase cluster. 086 * @see #getInitialClusterMetrics() 087 */ 088 public abstract ClusterMetrics getClusterMetrics() throws IOException; 089 090 /** 091 * Returns a ClusterStatus for this HBase cluster as observed at the starting of the HBaseCluster 092 */ 093 public ClusterMetrics getInitialClusterMetrics() throws IOException { 094 return initialClusterStatus; 095 } 096 097 /** 098 * Returns an {@link MasterService.BlockingInterface} to the active master 099 */ 100 public abstract MasterService.BlockingInterface getMasterAdminService() throws IOException; 101 102 /** 103 * Returns an AdminProtocol interface to the regionserver 104 */ 105 public abstract AdminService.BlockingInterface getAdminProtocol(ServerName serverName) 106 throws IOException; 107 108 /** 109 * Returns a ClientProtocol interface to the regionserver 110 */ 111 public abstract ClientService.BlockingInterface getClientProtocol(ServerName serverName) 112 throws IOException; 113 114 /** 115 * Starts a new region server on the given hostname or if this is a mini/local cluster, starts a 116 * region server locally. 117 * @param hostname the hostname to start the regionserver on 118 * @throws IOException if something goes wrong 119 */ 120 public abstract void startRegionServer(String hostname, int port) throws IOException; 121 122 /** 123 * Kills the region server process if this is a distributed cluster, otherwise this causes the 124 * region server to exit doing basic clean up only. 125 * @throws IOException if something goes wrong 126 */ 127 public abstract void killRegionServer(ServerName serverName) throws IOException; 128 129 /** 130 * Keeping track of killed servers and being able to check if a particular server was killed makes 131 * it possible to do fault tolerance testing for dead servers in a deterministic way. A concrete 132 * example of such case is - killing servers and waiting for all regions of a particular table to 133 * be assigned. We can check for server column in META table and that its value is not one of the 134 * killed servers. 135 */ 136 public abstract boolean isKilledRS(ServerName serverName); 137 138 /** 139 * Stops the given region server, by attempting a gradual stop. 140 * @throws IOException if something goes wrong 141 */ 142 public abstract void stopRegionServer(ServerName serverName) throws IOException; 143 144 /** 145 * Wait for the specified region server to join the cluster 146 * @throws IOException if something goes wrong or timeout occurs 147 */ 148 public void waitForRegionServerToStart(String hostname, int port, long timeout) 149 throws IOException { 150 long start = System.currentTimeMillis(); 151 while ((System.currentTimeMillis() - start) < timeout) { 152 for (ServerName server : getClusterMetrics().getLiveServerMetrics().keySet()) { 153 if (server.getHostname().equals(hostname) && server.getPort() == port) { 154 return; 155 } 156 } 157 Threads.sleep(100); 158 } 159 throw new IOException( 160 "did timeout " + timeout + "ms waiting for region server to start: " + hostname); 161 } 162 163 /** 164 * Wait for the specified region server to stop the thread / process. 165 * @throws IOException if something goes wrong or timeout occurs 166 */ 167 public abstract void waitForRegionServerToStop(ServerName serverName, long timeout) 168 throws IOException; 169 170 /** 171 * Suspend the region server 172 * @param serverName the hostname to suspend the regionserver on 173 * @throws IOException if something goes wrong 174 */ 175 public abstract void suspendRegionServer(ServerName serverName) throws IOException; 176 177 /** 178 * Resume the region server 179 * @param serverName the hostname to resume the regionserver on 180 * @throws IOException if something goes wrong 181 */ 182 public abstract void resumeRegionServer(ServerName serverName) throws IOException; 183 184 /** 185 * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster, silently 186 * logs warning message. 187 * @param hostname the hostname to start the regionserver on 188 * @throws IOException if something goes wrong 189 */ 190 public abstract void startZkNode(String hostname, int port) throws IOException; 191 192 /** 193 * Kills the zookeeper node process if this is a distributed cluster, otherwise, this causes 194 * master to exit doing basic clean up only. 195 * @throws IOException if something goes wrong 196 */ 197 public abstract void killZkNode(ServerName serverName) throws IOException; 198 199 /** 200 * Stops the region zookeeper if this is a distributed cluster, otherwise silently logs warning 201 * message. 202 * @throws IOException if something goes wrong 203 */ 204 public abstract void stopZkNode(ServerName serverName) throws IOException; 205 206 /** 207 * Wait for the specified zookeeper node to join the cluster 208 * @throws IOException if something goes wrong or timeout occurs 209 */ 210 public abstract void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException; 211 212 /** 213 * Wait for the specified zookeeper node to stop the thread / process. 214 * @throws IOException if something goes wrong or timeout occurs 215 */ 216 public abstract void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException; 217 218 /** 219 * Starts a new datanode on the given hostname or if this is a mini/local cluster, silently logs 220 * warning message. 221 * @throws IOException if something goes wrong 222 */ 223 public abstract void startDataNode(ServerName serverName) throws IOException; 224 225 /** 226 * Kills the datanode process if this is a distributed cluster, otherwise, this causes master to 227 * exit doing basic clean up only. 228 * @throws IOException if something goes wrong 229 */ 230 public abstract void killDataNode(ServerName serverName) throws IOException; 231 232 /** 233 * Stops the datanode if this is a distributed cluster, otherwise silently logs warning message. 234 * @throws IOException if something goes wrong 235 */ 236 public abstract void stopDataNode(ServerName serverName) throws IOException; 237 238 /** 239 * Wait for the specified datanode to join the cluster 240 * @throws IOException if something goes wrong or timeout occurs 241 */ 242 public abstract void waitForDataNodeToStart(ServerName serverName, long timeout) 243 throws IOException; 244 245 /** 246 * Wait for the specified datanode to stop the thread / process. 247 * @throws IOException if something goes wrong or timeout occurs 248 */ 249 public abstract void waitForDataNodeToStop(ServerName serverName, long timeout) 250 throws IOException; 251 252 /** 253 * Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs 254 * warning message. 255 * @throws IOException if something goes wrong 256 */ 257 public abstract void startNameNode(ServerName serverName) throws IOException; 258 259 /** 260 * Kills the namenode process if this is a distributed cluster, otherwise, this causes master to 261 * exit doing basic clean up only. 262 * @throws IOException if something goes wrong 263 */ 264 public abstract void killNameNode(ServerName serverName) throws IOException; 265 266 /** 267 * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message. 268 * @throws IOException if something goes wrong 269 */ 270 public abstract void stopNameNode(ServerName serverName) throws IOException; 271 272 /** 273 * Wait for the specified namenode to join the cluster 274 * @throws IOException if something goes wrong or timeout occurs 275 */ 276 public abstract void waitForNameNodeToStart(ServerName serverName, long timeout) 277 throws IOException; 278 279 /** 280 * Wait for the specified namenode to stop 281 * @throws IOException if something goes wrong or timeout occurs 282 */ 283 public abstract void waitForNameNodeToStop(ServerName serverName, long timeout) 284 throws IOException; 285 286 /** 287 * Starts a new journalnode on the given hostname or if this is a mini/local cluster, silently 288 * logs warning message. 289 * @throws IOException if something goes wrong 290 */ 291 public abstract void startJournalNode(ServerName serverName) throws IOException; 292 293 /** 294 * Kills the journalnode process if this is a distributed cluster, otherwise, this causes master 295 * to exit doing basic clean up only. 296 * @throws IOException if something goes wrong 297 */ 298 public abstract void killJournalNode(ServerName serverName) throws IOException; 299 300 /** 301 * Stops the journalnode if this is a distributed cluster, otherwise silently logs warning 302 * message. 303 * @throws IOException if something goes wrong 304 */ 305 public abstract void stopJournalNode(ServerName serverName) throws IOException; 306 307 /** 308 * Wait for the specified journalnode to join the cluster 309 * @throws IOException if something goes wrong or timeout occurs 310 */ 311 public abstract void waitForJournalNodeToStart(ServerName serverName, long timeout) 312 throws IOException; 313 314 /** 315 * Wait for the specified journalnode to stop 316 * @throws IOException if something goes wrong or timeout occurs 317 */ 318 public abstract void waitForJournalNodeToStop(ServerName serverName, long timeout) 319 throws IOException; 320 321 /** 322 * Starts a new master on the given hostname or if this is a mini/local cluster, starts a master 323 * locally. 324 * @param hostname the hostname to start the master on 325 * @throws IOException if something goes wrong 326 */ 327 public abstract void startMaster(String hostname, int port) throws IOException; 328 329 /** 330 * Kills the master process if this is a distributed cluster, otherwise, this causes master to 331 * exit doing basic clean up only. 332 * @throws IOException if something goes wrong 333 */ 334 public abstract void killMaster(ServerName serverName) throws IOException; 335 336 /** 337 * Stops the given master, by attempting a gradual stop. 338 * @throws IOException if something goes wrong 339 */ 340 public abstract void stopMaster(ServerName serverName) throws IOException; 341 342 /** 343 * Wait for the specified master to stop the thread / process. 344 * @throws IOException if something goes wrong or timeout occurs 345 */ 346 public abstract void waitForMasterToStop(ServerName serverName, long timeout) throws IOException; 347 348 /** 349 * Blocks until there is an active master and that master has completed initialization. 350 * @return true if an active master becomes available. false if there are no masters left. 351 * @throws IOException if something goes wrong or timeout occurs 352 */ 353 public boolean waitForActiveAndReadyMaster() throws IOException { 354 return waitForActiveAndReadyMaster(Long.MAX_VALUE); 355 } 356 357 /** 358 * Blocks until there is an active master and that master has completed initialization. 359 * @param timeout the timeout limit in ms 360 * @return true if an active master becomes available. false if there are no masters left. 361 */ 362 public abstract boolean waitForActiveAndReadyMaster(long timeout) throws IOException; 363 364 /** 365 * Wait for HBase Cluster to shut down. 366 */ 367 public abstract void waitUntilShutDown() throws IOException; 368 369 /** 370 * Shut down the HBase cluster 371 */ 372 public abstract void shutdown() throws IOException; 373 374 /** 375 * Restores the cluster to it's initial state if this is a real cluster, otherwise does nothing. 376 * This is a best effort restore. If the servers are not reachable, or insufficient permissions, 377 * etc. restoration might be partial. 378 * @return whether restoration is complete 379 */ 380 public boolean restoreInitialStatus() throws IOException { 381 return restoreClusterMetrics(getInitialClusterMetrics()); 382 } 383 384 /** 385 * Restores the cluster to given state if this is a real cluster, otherwise does nothing. This is 386 * a best effort restore. If the servers are not reachable, or insufficient permissions, etc. 387 * restoration might be partial. 388 * @return whether restoration is complete 389 */ 390 public boolean restoreClusterMetrics(ClusterMetrics desiredStatus) throws IOException { 391 return true; 392 } 393 394 /** 395 * Get the ServerName of region server serving the first hbase:meta region 396 */ 397 public ServerName getServerHoldingMeta() throws IOException { 398 return getServerHoldingRegion(TableName.META_TABLE_NAME, 399 RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName()); 400 } 401 402 /** 403 * Get the ServerName of region server serving the specified region 404 * @param regionName Name of the region in bytes 405 * @param tn Table name that has the region. 406 * @return ServerName that hosts the region or null 407 */ 408 public abstract ServerName getServerHoldingRegion(final TableName tn, byte[] regionName) 409 throws IOException; 410 411 /** 412 * @return whether we are interacting with a distributed cluster as opposed to an in-process 413 * mini/local cluster. 414 */ 415 public boolean isDistributedCluster() { 416 return false; 417 } 418 419 /** 420 * Closes all the resources held open for this cluster. Note that this call does not shutdown the 421 * cluster. 422 * @see #shutdown() 423 */ 424 @Override 425 public abstract void close() throws IOException; 426 427 /** 428 * Wait for the namenode. 429 */ 430 public void waitForNamenodeAvailable() throws InterruptedException { 431 } 432 433 public void waitForDatanodesRegistered(int nbDN) throws Exception { 434 } 435}