001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
020import java.io.Closeable;
021import java.io.IOException;
022import org.apache.hadoop.conf.Configurable;
023import org.apache.hadoop.conf.Configuration;
024import org.apache.hadoop.hbase.client.RegionInfoBuilder;
025import org.apache.hadoop.hbase.util.Threads;
026import org.apache.yetus.audience.InterfaceAudience;
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.AdminService;
031import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.ClientService;
032import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.MasterService;
033
034/**
035 * This class defines methods that can help with managing HBase clusters from unit tests and system
036 * tests. There are 3 types of cluster deployments:
037 * <ul>
038 * <li><b>MiniHBaseCluster:</b> each server is run in the same JVM in separate threads, used by unit
039 * tests</li>
040 * <li><b>DistributedHBaseCluster:</b> the cluster is pre-deployed, system and integration tests can
041 * interact with the cluster.</li>
042 * <li><b>ProcessBasedLocalHBaseCluster:</b> each server is deployed locally but in separate JVMs.
043 * </li>
044 * </ul>
045 * <p>
046 * HBaseCluster unifies the way tests interact with the cluster, so that the same test can be run
047 * against a mini-cluster during unit test execution, or a distributed cluster having tens/hundreds
048 * of nodes during execution of integration tests.
049 * <p>
050 * HBaseCluster exposes client-side public interfaces to tests, so that tests does not assume
051 * running in a particular mode. Not all the tests are suitable to be run on an actual cluster, and
052 * some tests will still need to mock stuff and introspect internal state. For those use cases from
053 * unit tests, or if more control is needed, you can use the subclasses directly. In that sense,
054 * this class does not abstract away <strong>every</strong> interface that MiniHBaseCluster or
055 * DistributedHBaseCluster provide.
056 */
057@InterfaceAudience.Public
058public abstract class HBaseCluster implements Closeable, Configurable {
059  // Log is being used in DistributedHBaseCluster class, hence keeping it as package scope
060  static final Logger LOG = LoggerFactory.getLogger(HBaseCluster.class.getName());
061  protected Configuration conf;
062
063  /** the status of the cluster before we begin */
064  protected ClusterMetrics initialClusterStatus;
065
066  /**
067   * Construct an HBaseCluster
068   * @param conf Configuration to be used for cluster
069   */
070  public HBaseCluster(Configuration conf) {
071    setConf(conf);
072  }
073
074  @Override
075  public void setConf(Configuration conf) {
076    this.conf = conf;
077  }
078
079  @Override
080  public Configuration getConf() {
081    return conf;
082  }
083
084  /**
085   * Returns a ClusterMetrics for this HBase cluster.
086   * @see #getInitialClusterMetrics()
087   */
088  public abstract ClusterMetrics getClusterMetrics() throws IOException;
089
090  /**
091   * Returns a ClusterStatus for this HBase cluster as observed at the starting of the HBaseCluster
092   */
093  public ClusterMetrics getInitialClusterMetrics() throws IOException {
094    return initialClusterStatus;
095  }
096
097  /**
098   * Returns an {@link MasterService.BlockingInterface} to the active master
099   */
100  public abstract MasterService.BlockingInterface getMasterAdminService() throws IOException;
101
102  /**
103   * Returns an AdminProtocol interface to the regionserver
104   */
105  public abstract AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
106    throws IOException;
107
108  /**
109   * Returns a ClientProtocol interface to the regionserver
110   */
111  public abstract ClientService.BlockingInterface getClientProtocol(ServerName serverName)
112    throws IOException;
113
114  /**
115   * Starts a new region server on the given hostname or if this is a mini/local cluster, starts a
116   * region server locally.
117   * @param hostname the hostname to start the regionserver on
118   * @throws IOException if something goes wrong
119   */
120  public abstract void startRegionServer(String hostname, int port) throws IOException;
121
122  /**
123   * Kills the region server process if this is a distributed cluster, otherwise this causes the
124   * region server to exit doing basic clean up only.
125   * @throws IOException if something goes wrong
126   */
127  public abstract void killRegionServer(ServerName serverName) throws IOException;
128
129  /**
130   * Keeping track of killed servers and being able to check if a particular server was killed makes
131   * it possible to do fault tolerance testing for dead servers in a deterministic way. A concrete
132   * example of such case is - killing servers and waiting for all regions of a particular table to
133   * be assigned. We can check for server column in META table and that its value is not one of the
134   * killed servers.
135   */
136  public abstract boolean isKilledRS(ServerName serverName);
137
138  /**
139   * Stops the given region server, by attempting a gradual stop.
140   * @throws IOException if something goes wrong
141   */
142  public abstract void stopRegionServer(ServerName serverName) throws IOException;
143
144  /**
145   * Wait for the specified region server to join the cluster
146   * @throws IOException if something goes wrong or timeout occurs
147   */
148  public void waitForRegionServerToStart(String hostname, int port, long timeout)
149    throws IOException {
150    long start = System.currentTimeMillis();
151    while ((System.currentTimeMillis() - start) < timeout) {
152      for (ServerName server : getClusterMetrics().getLiveServerMetrics().keySet()) {
153        if (server.getHostname().equals(hostname) && server.getPort() == port) {
154          return;
155        }
156      }
157      Threads.sleep(100);
158    }
159    throw new IOException(
160      "did timeout " + timeout + "ms waiting for region server to start: " + hostname);
161  }
162
163  /**
164   * Wait for the specified region server to stop the thread / process.
165   * @throws IOException if something goes wrong or timeout occurs
166   */
167  public abstract void waitForRegionServerToStop(ServerName serverName, long timeout)
168    throws IOException;
169
170  /**
171   * Suspend the region server
172   * @param serverName the hostname to suspend the regionserver on
173   * @throws IOException if something goes wrong
174   */
175  public abstract void suspendRegionServer(ServerName serverName) throws IOException;
176
177  /**
178   * Resume the region server
179   * @param serverName the hostname to resume the regionserver on
180   * @throws IOException if something goes wrong
181   */
182  public abstract void resumeRegionServer(ServerName serverName) throws IOException;
183
184  /**
185   * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster, silently
186   * logs warning message.
187   * @param hostname the hostname to start the regionserver on
188   * @throws IOException if something goes wrong
189   */
190  public abstract void startZkNode(String hostname, int port) throws IOException;
191
192  /**
193   * Kills the zookeeper node process if this is a distributed cluster, otherwise, this causes
194   * master to exit doing basic clean up only.
195   * @throws IOException if something goes wrong
196   */
197  public abstract void killZkNode(ServerName serverName) throws IOException;
198
199  /**
200   * Stops the region zookeeper if this is a distributed cluster, otherwise silently logs warning
201   * message.
202   * @throws IOException if something goes wrong
203   */
204  public abstract void stopZkNode(ServerName serverName) throws IOException;
205
206  /**
207   * Wait for the specified zookeeper node to join the cluster
208   * @throws IOException if something goes wrong or timeout occurs
209   */
210  public abstract void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException;
211
212  /**
213   * Wait for the specified zookeeper node to stop the thread / process.
214   * @throws IOException if something goes wrong or timeout occurs
215   */
216  public abstract void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException;
217
218  /**
219   * Starts a new datanode on the given hostname or if this is a mini/local cluster, silently logs
220   * warning message.
221   * @throws IOException if something goes wrong
222   */
223  public abstract void startDataNode(ServerName serverName) throws IOException;
224
225  /**
226   * Kills the datanode process if this is a distributed cluster, otherwise, this causes master to
227   * exit doing basic clean up only.
228   * @throws IOException if something goes wrong
229   */
230  public abstract void killDataNode(ServerName serverName) throws IOException;
231
232  /**
233   * Stops the datanode if this is a distributed cluster, otherwise silently logs warning message.
234   * @throws IOException if something goes wrong
235   */
236  public abstract void stopDataNode(ServerName serverName) throws IOException;
237
238  /**
239   * Wait for the specified datanode to join the cluster
240   * @throws IOException if something goes wrong or timeout occurs
241   */
242  public abstract void waitForDataNodeToStart(ServerName serverName, long timeout)
243    throws IOException;
244
245  /**
246   * Wait for the specified datanode to stop the thread / process.
247   * @throws IOException if something goes wrong or timeout occurs
248   */
249  public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
250    throws IOException;
251
252  /**
253   * Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs
254   * warning message.
255   * @throws IOException if something goes wrong
256   */
257  public abstract void startNameNode(ServerName serverName) throws IOException;
258
259  /**
260   * Kills the namenode process if this is a distributed cluster, otherwise, this causes master to
261   * exit doing basic clean up only.
262   * @throws IOException if something goes wrong
263   */
264  public abstract void killNameNode(ServerName serverName) throws IOException;
265
266  /**
267   * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message.
268   * @throws IOException if something goes wrong
269   */
270  public abstract void stopNameNode(ServerName serverName) throws IOException;
271
272  /**
273   * Wait for the specified namenode to join the cluster
274   * @throws IOException if something goes wrong or timeout occurs
275   */
276  public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
277    throws IOException;
278
279  /**
280   * Wait for the specified namenode to stop
281   * @throws IOException if something goes wrong or timeout occurs
282   */
283  public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
284    throws IOException;
285
286  /**
287   * Starts a new journalnode on the given hostname or if this is a mini/local cluster, silently
288   * logs warning message.
289   * @throws IOException if something goes wrong
290   */
291  public abstract void startJournalNode(ServerName serverName) throws IOException;
292
293  /**
294   * Kills the journalnode process if this is a distributed cluster, otherwise, this causes master
295   * to exit doing basic clean up only.
296   * @throws IOException if something goes wrong
297   */
298  public abstract void killJournalNode(ServerName serverName) throws IOException;
299
300  /**
301   * Stops the journalnode if this is a distributed cluster, otherwise silently logs warning
302   * message.
303   * @throws IOException if something goes wrong
304   */
305  public abstract void stopJournalNode(ServerName serverName) throws IOException;
306
307  /**
308   * Wait for the specified journalnode to join the cluster
309   * @throws IOException if something goes wrong or timeout occurs
310   */
311  public abstract void waitForJournalNodeToStart(ServerName serverName, long timeout)
312    throws IOException;
313
314  /**
315   * Wait for the specified journalnode to stop
316   * @throws IOException if something goes wrong or timeout occurs
317   */
318  public abstract void waitForJournalNodeToStop(ServerName serverName, long timeout)
319    throws IOException;
320
321  /**
322   * Starts a new master on the given hostname or if this is a mini/local cluster, starts a master
323   * locally.
324   * @param hostname the hostname to start the master on
325   * @throws IOException if something goes wrong
326   */
327  public abstract void startMaster(String hostname, int port) throws IOException;
328
329  /**
330   * Kills the master process if this is a distributed cluster, otherwise, this causes master to
331   * exit doing basic clean up only.
332   * @throws IOException if something goes wrong
333   */
334  public abstract void killMaster(ServerName serverName) throws IOException;
335
336  /**
337   * Stops the given master, by attempting a gradual stop.
338   * @throws IOException if something goes wrong
339   */
340  public abstract void stopMaster(ServerName serverName) throws IOException;
341
342  /**
343   * Wait for the specified master to stop the thread / process.
344   * @throws IOException if something goes wrong or timeout occurs
345   */
346  public abstract void waitForMasterToStop(ServerName serverName, long timeout) throws IOException;
347
348  /**
349   * Blocks until there is an active master and that master has completed initialization.
350   * @return true if an active master becomes available. false if there are no masters left.
351   * @throws IOException if something goes wrong or timeout occurs
352   */
353  public boolean waitForActiveAndReadyMaster() throws IOException {
354    return waitForActiveAndReadyMaster(Long.MAX_VALUE);
355  }
356
357  /**
358   * Blocks until there is an active master and that master has completed initialization.
359   * @param timeout the timeout limit in ms
360   * @return true if an active master becomes available. false if there are no masters left.
361   */
362  public abstract boolean waitForActiveAndReadyMaster(long timeout) throws IOException;
363
364  /**
365   * Wait for HBase Cluster to shut down.
366   */
367  public abstract void waitUntilShutDown() throws IOException;
368
369  /**
370   * Shut down the HBase cluster
371   */
372  public abstract void shutdown() throws IOException;
373
374  /**
375   * Restores the cluster to it's initial state if this is a real cluster, otherwise does nothing.
376   * This is a best effort restore. If the servers are not reachable, or insufficient permissions,
377   * etc. restoration might be partial.
378   * @return whether restoration is complete
379   */
380  public boolean restoreInitialStatus() throws IOException {
381    return restoreClusterMetrics(getInitialClusterMetrics());
382  }
383
384  /**
385   * Restores the cluster to given state if this is a real cluster, otherwise does nothing. This is
386   * a best effort restore. If the servers are not reachable, or insufficient permissions, etc.
387   * restoration might be partial.
388   * @return whether restoration is complete
389   */
390  public boolean restoreClusterMetrics(ClusterMetrics desiredStatus) throws IOException {
391    return true;
392  }
393
394  /**
395   * Get the ServerName of region server serving the first hbase:meta region
396   */
397  public ServerName getServerHoldingMeta() throws IOException {
398    return getServerHoldingRegion(TableName.META_TABLE_NAME,
399      RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName());
400  }
401
402  /**
403   * Get the ServerName of region server serving the specified region
404   * @param regionName Name of the region in bytes
405   * @param tn         Table name that has the region.
406   * @return ServerName that hosts the region or null
407   */
408  public abstract ServerName getServerHoldingRegion(final TableName tn, byte[] regionName)
409    throws IOException;
410
411  /**
412   * @return whether we are interacting with a distributed cluster as opposed to an in-process
413   *         mini/local cluster.
414   */
415  public boolean isDistributedCluster() {
416    return false;
417  }
418
419  /**
420   * Closes all the resources held open for this cluster. Note that this call does not shutdown the
421   * cluster.
422   * @see #shutdown()
423   */
424  @Override
425  public abstract void close() throws IOException;
426
427  /**
428   * Wait for the namenode.
429   */
430  public void waitForNamenodeAvailable() throws InterruptedException {
431  }
432
433  public void waitForDatanodesRegistered(int nbDN) throws Exception {
434  }
435}