| <!DOCTYPE HTML> |
| <html lang="en"> |
| <head> |
| <!-- Generated by javadoc (17) --> |
| <title>Source code</title> |
| <meta name="viewport" content="width=device-width, initial-scale=1"> |
| <meta name="description" content="source: package: org.apache.hadoop.hbase.mapreduce, class: TableInputFormatBase"> |
| <meta name="generator" content="javadoc/SourceToHTMLConverter"> |
| <link rel="stylesheet" type="text/css" href="../../../../../../stylesheet.css" title="Style"> |
| </head> |
| <body class="source-page"> |
| <main role="main"> |
| <div class="source-container"> |
| <pre><span class="source-line-no">001</span><span id="line-1">/*</span> |
| <span class="source-line-no">002</span><span id="line-2"> * Licensed to the Apache Software Foundation (ASF) under one</span> |
| <span class="source-line-no">003</span><span id="line-3"> * or more contributor license agreements. See the NOTICE file</span> |
| <span class="source-line-no">004</span><span id="line-4"> * distributed with this work for additional information</span> |
| <span class="source-line-no">005</span><span id="line-5"> * regarding copyright ownership. The ASF licenses this file</span> |
| <span class="source-line-no">006</span><span id="line-6"> * to you under the Apache License, Version 2.0 (the</span> |
| <span class="source-line-no">007</span><span id="line-7"> * "License"); you may not use this file except in compliance</span> |
| <span class="source-line-no">008</span><span id="line-8"> * with the License. You may obtain a copy of the License at</span> |
| <span class="source-line-no">009</span><span id="line-9"> *</span> |
| <span class="source-line-no">010</span><span id="line-10"> * http://www.apache.org/licenses/LICENSE-2.0</span> |
| <span class="source-line-no">011</span><span id="line-11"> *</span> |
| <span class="source-line-no">012</span><span id="line-12"> * Unless required by applicable law or agreed to in writing, software</span> |
| <span class="source-line-no">013</span><span id="line-13"> * distributed under the License is distributed on an "AS IS" BASIS,</span> |
| <span class="source-line-no">014</span><span id="line-14"> * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span> |
| <span class="source-line-no">015</span><span id="line-15"> * See the License for the specific language governing permissions and</span> |
| <span class="source-line-no">016</span><span id="line-16"> * limitations under the License.</span> |
| <span class="source-line-no">017</span><span id="line-17"> */</span> |
| <span class="source-line-no">018</span><span id="line-18">package org.apache.hadoop.hbase.mapreduce;</span> |
| <span class="source-line-no">019</span><span id="line-19"></span> |
| <span class="source-line-no">020</span><span id="line-20">import java.io.Closeable;</span> |
| <span class="source-line-no">021</span><span id="line-21">import java.io.IOException;</span> |
| <span class="source-line-no">022</span><span id="line-22">import java.net.InetAddress;</span> |
| <span class="source-line-no">023</span><span id="line-23">import java.net.InetSocketAddress;</span> |
| <span class="source-line-no">024</span><span id="line-24">import java.net.UnknownHostException;</span> |
| <span class="source-line-no">025</span><span id="line-25">import java.util.ArrayList;</span> |
| <span class="source-line-no">026</span><span id="line-26">import java.util.HashMap;</span> |
| <span class="source-line-no">027</span><span id="line-27">import java.util.List;</span> |
| <span class="source-line-no">028</span><span id="line-28">import org.apache.hadoop.hbase.HConstants;</span> |
| <span class="source-line-no">029</span><span id="line-29">import org.apache.hadoop.hbase.HRegionLocation;</span> |
| <span class="source-line-no">030</span><span id="line-30">import org.apache.hadoop.hbase.TableName;</span> |
| <span class="source-line-no">031</span><span id="line-31">import org.apache.hadoop.hbase.client.Admin;</span> |
| <span class="source-line-no">032</span><span id="line-32">import org.apache.hadoop.hbase.client.Connection;</span> |
| <span class="source-line-no">033</span><span id="line-33">import org.apache.hadoop.hbase.client.RegionLocator;</span> |
| <span class="source-line-no">034</span><span id="line-34">import org.apache.hadoop.hbase.client.Result;</span> |
| <span class="source-line-no">035</span><span id="line-35">import org.apache.hadoop.hbase.client.Scan;</span> |
| <span class="source-line-no">036</span><span id="line-36">import org.apache.hadoop.hbase.client.Table;</span> |
| <span class="source-line-no">037</span><span id="line-37">import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException;</span> |
| <span class="source-line-no">038</span><span id="line-38">import org.apache.hadoop.hbase.io.ImmutableBytesWritable;</span> |
| <span class="source-line-no">039</span><span id="line-39">import org.apache.hadoop.hbase.util.Addressing;</span> |
| <span class="source-line-no">040</span><span id="line-40">import org.apache.hadoop.hbase.util.Bytes;</span> |
| <span class="source-line-no">041</span><span id="line-41">import org.apache.hadoop.hbase.util.Pair;</span> |
| <span class="source-line-no">042</span><span id="line-42">import org.apache.hadoop.hbase.util.Strings;</span> |
| <span class="source-line-no">043</span><span id="line-43">import org.apache.hadoop.mapreduce.InputFormat;</span> |
| <span class="source-line-no">044</span><span id="line-44">import org.apache.hadoop.mapreduce.InputSplit;</span> |
| <span class="source-line-no">045</span><span id="line-45">import org.apache.hadoop.mapreduce.JobContext;</span> |
| <span class="source-line-no">046</span><span id="line-46">import org.apache.hadoop.mapreduce.RecordReader;</span> |
| <span class="source-line-no">047</span><span id="line-47">import org.apache.hadoop.mapreduce.TaskAttemptContext;</span> |
| <span class="source-line-no">048</span><span id="line-48">import org.apache.hadoop.net.DNS;</span> |
| <span class="source-line-no">049</span><span id="line-49">import org.apache.yetus.audience.InterfaceAudience;</span> |
| <span class="source-line-no">050</span><span id="line-50">import org.slf4j.Logger;</span> |
| <span class="source-line-no">051</span><span id="line-51">import org.slf4j.LoggerFactory;</span> |
| <span class="source-line-no">052</span><span id="line-52"></span> |
| <span class="source-line-no">053</span><span id="line-53">/**</span> |
| <span class="source-line-no">054</span><span id="line-54"> * A base for {@link TableInputFormat}s. Receives a {@link Connection}, a {@link TableName}, an</span> |
| <span class="source-line-no">055</span><span id="line-55"> * {@link Scan} instance that defines the input columns etc. Subclasses may use other</span> |
| <span class="source-line-no">056</span><span id="line-56"> * TableRecordReader implementations. Subclasses MUST ensure initializeTable(Connection, TableName)</span> |
| <span class="source-line-no">057</span><span id="line-57"> * is called for an instance to function properly. Each of the entry points to this class used by</span> |
| <span class="source-line-no">058</span><span id="line-58"> * the MapReduce framework, {@link #createRecordReader(InputSplit, TaskAttemptContext)} and</span> |
| <span class="source-line-no">059</span><span id="line-59"> * {@link #getSplits(JobContext)}, will call {@link #initialize(JobContext)} as a convenient</span> |
| <span class="source-line-no">060</span><span id="line-60"> * centralized location to handle retrieving the necessary configuration information. If your</span> |
| <span class="source-line-no">061</span><span id="line-61"> * subclass overrides either of these methods, either call the parent version or call initialize</span> |
| <span class="source-line-no">062</span><span id="line-62"> * yourself.</span> |
| <span class="source-line-no">063</span><span id="line-63"> * <p></span> |
| <span class="source-line-no">064</span><span id="line-64"> * An example of a subclass:</span> |
| <span class="source-line-no">065</span><span id="line-65"> *</span> |
| <span class="source-line-no">066</span><span id="line-66"> * <pre></span> |
| <span class="source-line-no">067</span><span id="line-67"> * class ExampleTIF extends TableInputFormatBase {</span> |
| <span class="source-line-no">068</span><span id="line-68"> *</span> |
| <span class="source-line-no">069</span><span id="line-69"> * {@literal @}Override</span> |
| <span class="source-line-no">070</span><span id="line-70"> * protected void initialize(JobContext context) throws IOException {</span> |
| <span class="source-line-no">071</span><span id="line-71"> * // We are responsible for the lifecycle of this connection until we hand it over in</span> |
| <span class="source-line-no">072</span><span id="line-72"> * // initializeTable.</span> |
| <span class="source-line-no">073</span><span id="line-73"> * Connection connection = ConnectionFactory.createConnection(HBaseConfiguration.create(</span> |
| <span class="source-line-no">074</span><span id="line-74"> * context.getConfiguration()));</span> |
| <span class="source-line-no">075</span><span id="line-75"> * TableName tableName = TableName.valueOf("exampleTable");</span> |
| <span class="source-line-no">076</span><span id="line-76"> * // mandatory. once passed here, TableInputFormatBase will handle closing the connection.</span> |
| <span class="source-line-no">077</span><span id="line-77"> * initializeTable(connection, tableName);</span> |
| <span class="source-line-no">078</span><span id="line-78"> * byte[][] inputColumns = new byte [][] { Bytes.toBytes("columnA"),</span> |
| <span class="source-line-no">079</span><span id="line-79"> * Bytes.toBytes("columnB") };</span> |
| <span class="source-line-no">080</span><span id="line-80"> * // optional, by default we'll get everything for the table.</span> |
| <span class="source-line-no">081</span><span id="line-81"> * Scan scan = new Scan();</span> |
| <span class="source-line-no">082</span><span id="line-82"> * for (byte[] family : inputColumns) {</span> |
| <span class="source-line-no">083</span><span id="line-83"> * scan.addFamily(family);</span> |
| <span class="source-line-no">084</span><span id="line-84"> * }</span> |
| <span class="source-line-no">085</span><span id="line-85"> * Filter exampleFilter = new RowFilter(CompareOp.EQUAL, new RegexStringComparator("aa.*"));</span> |
| <span class="source-line-no">086</span><span id="line-86"> * scan.setFilter(exampleFilter);</span> |
| <span class="source-line-no">087</span><span id="line-87"> * setScan(scan);</span> |
| <span class="source-line-no">088</span><span id="line-88"> * }</span> |
| <span class="source-line-no">089</span><span id="line-89"> * }</span> |
| <span class="source-line-no">090</span><span id="line-90"> * </pre></span> |
| <span class="source-line-no">091</span><span id="line-91"> *</span> |
| <span class="source-line-no">092</span><span id="line-92"> * The number of InputSplits (mappers) matches the number of regions in a table by default. Set</span> |
| <span class="source-line-no">093</span><span id="line-93"> * "hbase.mapreduce.tableinput.mappers.per.region" to specify how many mappers per region; setting</span> |
| <span class="source-line-no">094</span><span id="line-94"> * this property disables the autobalance below. Set "hbase.mapreduce.tif.input.autobalance" to</span> |
| <span class="source-line-no">095</span><span id="line-95"> * enable autobalance: HBase will assign mappers based on average region size. Regions whose size is</span> |
| <span class="source-line-no">096</span><span id="line-96"> * larger than the average may be assigned more mappers, and smaller ones may be grouped together to</span> |
| <span class="source-line-no">097</span><span id="line-97"> * use one mapper. If the actual average region size is too big, like 50G, it is not good to assign</span> |
| <span class="source-line-no">098</span><span id="line-98"> * only 1 mapper to those large regions. Use "hbase.mapreduce.tif.ave.regionsize" to set the max</span> |
| <span class="source-line-no">099</span><span id="line-99"> * average region size when autobalance is enabled; the default max average region size is 8G.</span> |
| <span class="source-line-no">100</span><span id="line-100"> */</span> |
| <span class="source-line-no">101</span><span id="line-101">@InterfaceAudience.Public</span> |
| <span class="source-line-no">102</span><span id="line-102">public abstract class TableInputFormatBase extends InputFormat<ImmutableBytesWritable, Result> {</span> |
| <span class="source-line-no">103</span><span id="line-103"></span> |
| <span class="source-line-no">104</span><span id="line-104"> private static final Logger LOG = LoggerFactory.getLogger(TableInputFormatBase.class);</span> |
| <span class="source-line-no">105</span><span id="line-105"></span> |
| <span class="source-line-no">106</span><span id="line-106"> private static final String NOT_INITIALIZED = "The input format instance has not been properly "</span> |
| <span class="source-line-no">107</span><span id="line-107"> + "initialized. Ensure you call initializeTable either in your constructor or initialize "</span> |
| <span class="source-line-no">108</span><span id="line-108"> + "method";</span> |
| <span class="source-line-no">109</span><span id="line-109"> private static final String INITIALIZATION_ERROR = "Cannot create a record reader because of a"</span> |
| <span class="source-line-no">110</span><span id="line-110"> + " previous error. Please look at the previous logs lines from"</span> |
| <span class="source-line-no">111</span><span id="line-111"> + " the task's full log for more details.";</span> |
| <span class="source-line-no">112</span><span id="line-112"></span> |
| <span class="source-line-no">113</span><span id="line-113"> /** Configuration key: whether auto-balance is enabled when setting the number of mappers. */</span> |
| <span class="source-line-no">114</span><span id="line-114"> public static final String MAPREDUCE_INPUT_AUTOBALANCE = "hbase.mapreduce.tif.input.autobalance";</span> |
| <span class="source-line-no">115</span><span id="line-115"> /**</span> |
| <span class="source-line-no">116</span><span id="line-116"> * Configuration key for the max average region size used by auto-balance, which splits input by</span> |
| <span class="source-line-no">117</span><span id="line-117"> * average region size; set it when the calculated average region size would be too big.</span> |
| <span class="source-line-no">118</span><span id="line-118"> */</span> |
| <span class="source-line-no">119</span><span id="line-119"> public static final String MAX_AVERAGE_REGION_SIZE = "hbase.mapreduce.tif.ave.regionsize";</span> |
| <span class="source-line-no">120</span><span id="line-120"></span> |
| <span class="source-line-no">121</span><span id="line-121"> /** Configuration key: number of mappers for each region; all regions get the same number. */</span> |
| <span class="source-line-no">122</span><span id="line-122"> public static final String NUM_MAPPERS_PER_REGION =</span> |
| <span class="source-line-no">123</span><span id="line-123"> "hbase.mapreduce.tableinput.mappers.per.region";</span> |
| <span class="source-line-no">124</span><span id="line-124"></span> |
| <span class="source-line-no">125</span><span id="line-125"> /**</span> |
| <span class="source-line-no">126</span><span id="line-126"> * Holds the details for the internal scanner.</span> |
| <span class="source-line-no">127</span><span id="line-127"> * @see Scan</span> |
| <span class="source-line-no">128</span><span id="line-128"> */</span> |
| <span class="source-line-no">129</span><span id="line-129"> private Scan scan = null;</span> |
| <span class="source-line-no">130</span><span id="line-130"> /** The {@link Admin}. */</span> |
| <span class="source-line-no">131</span><span id="line-131"> private Admin admin;</span> |
| <span class="source-line-no">132</span><span id="line-132"> /** The {@link Table} to scan. */</span> |
| <span class="source-line-no">133</span><span id="line-133"> private Table table;</span> |
| <span class="source-line-no">134</span><span id="line-134"> /** The {@link RegionLocator} of the table. */</span> |
| <span class="source-line-no">135</span><span id="line-135"> private RegionLocator regionLocator;</span> |
| <span class="source-line-no">136</span><span id="line-136"> /** The reader scanning the table, can be a custom one. */</span> |
| <span class="source-line-no">137</span><span id="line-137"> private TableRecordReader tableRecordReader = null;</span> |
| <span class="source-line-no">138</span><span id="line-138"> /** The underlying {@link Connection} of the table. */</span> |
| <span class="source-line-no">139</span><span id="line-139"> private Connection connection;</span> |
| <span class="source-line-no">140</span><span id="line-140"> /** Used to generate splits based on region size. */</span> |
| <span class="source-line-no">141</span><span id="line-141"> private RegionSizeCalculator regionSizeCalculator;</span> |
| <span class="source-line-no">142</span><span id="line-142"></span> |
| <span class="source-line-no">143</span><span id="line-143"> /** The reverse DNS lookup cache mapping: IPAddress => HostName */</span> |
| <span class="source-line-no">144</span><span id="line-144"> private HashMap<InetAddress, String> reverseDNSCacheMap = new HashMap<>();</span> |
| <span class="source-line-no">145</span><span id="line-145"></span> |
| <span class="source-line-no">146</span><span id="line-146"> /**</span> |
| <span class="source-line-no">147</span><span id="line-147"> * Builds a record reader for the given split, backed by the provided {@link TableRecordReader}</span> |
| <span class="source-line-no">148</span><span id="line-148"> * or, if none was provided, a default one. The scan is narrowed to the split's start/end rows.</span> |
| <span class="source-line-no">149</span><span id="line-149"> * @param split The split to work with; it is cast to {@link TableSplit}.</span> |
| <span class="source-line-no">150</span><span id="line-150"> * @param context The current context, passed to initialize if the table is not set yet.</span> |
| <span class="source-line-no">151</span><span id="line-151"> * @return A reader that delegates to the {@link TableRecordReader} and calls closeTable() on close.</span> |
| <span class="source-line-no">152</span><span id="line-152"> * @throws IOException When initialization fails or no table is available afterwards.</span> |
| <span class="source-line-no">153</span><span id="line-153"> * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(</span> |
| <span class="source-line-no">154</span><span id="line-154"> * org.apache.hadoop.mapreduce.InputSplit, org.apache.hadoop.mapreduce.TaskAttemptContext)</span> |
| <span class="source-line-no">155</span><span id="line-155"> */</span> |
| <span class="source-line-no">156</span><span id="line-156"> @Override</span> |
| <span class="source-line-no">157</span><span id="line-157"> public RecordReader<ImmutableBytesWritable, Result> createRecordReader(InputSplit split,</span> |
| <span class="source-line-no">158</span><span id="line-158"> TaskAttemptContext context) throws IOException {</span> |
| <span class="source-line-no">159</span><span id="line-159"> // Initialize lazily, just in case a subclass is relying on JobConfigurable magic.</span> |
| <span class="source-line-no">160</span><span id="line-160"> if (table == null) {</span> |
| <span class="source-line-no">161</span><span id="line-161"> initialize(context);</span> |
| <span class="source-line-no">162</span><span id="line-162"> }</span> |
| <span class="source-line-no">163</span><span id="line-163"> // null check in case our child overrides getTable to not throw.</span> |
| <span class="source-line-no">164</span><span id="line-164"> try {</span> |
| <span class="source-line-no">165</span><span id="line-165"> if (getTable() == null) {</span> |
| <span class="source-line-no">166</span><span id="line-166"> // initialize() must not have been implemented in the subclass.</span> |
| <span class="source-line-no">167</span><span id="line-167"> throw new IOException(INITIALIZATION_ERROR);</span> |
| <span class="source-line-no">168</span><span id="line-168"> }</span> |
| <span class="source-line-no">169</span><span id="line-169"> } catch (IllegalStateException exception) {</span> |
| <span class="source-line-no">170</span><span id="line-170"> throw new IOException(INITIALIZATION_ERROR, exception);</span> |
| <span class="source-line-no">171</span><span id="line-171"> }</span> |
| <span class="source-line-no">172</span><span id="line-172"> TableSplit tSplit = (TableSplit) split;</span> |
| <span class="source-line-no">173</span><span id="line-173"> LOG.info("Input split length: " + Strings.humanReadableInt(tSplit.getLength()) + " bytes.");</span> |
| <span class="source-line-no">174</span><span id="line-174"> final TableRecordReader trr =</span> |
| <span class="source-line-no">175</span><span id="line-175"> this.tableRecordReader != null ? this.tableRecordReader : new TableRecordReader();</span> |
| <span class="source-line-no">176</span><span id="line-176"> Scan sc = new Scan(this.scan);</span> |
| <span class="source-line-no">177</span><span id="line-177"> sc.withStartRow(tSplit.getStartRow());</span> |
| <span class="source-line-no">178</span><span id="line-178"> sc.withStopRow(tSplit.getEndRow());</span> |
| <span class="source-line-no">179</span><span id="line-179"> trr.setScan(sc);</span> |
| <span class="source-line-no">180</span><span id="line-180"> trr.setTable(getTable());</span> |
| <span class="source-line-no">181</span><span id="line-181"> return new RecordReader<ImmutableBytesWritable, Result>() {</span> |
| <span class="source-line-no">182</span><span id="line-182"></span> |
| <span class="source-line-no">183</span><span id="line-183"> @Override</span> |
| <span class="source-line-no">184</span><span id="line-184"> public void close() throws IOException {</span> |
| <span class="source-line-no">185</span><span id="line-185"> trr.close();</span> |
| <span class="source-line-no">186</span><span id="line-186"> closeTable();</span> |
| <span class="source-line-no">187</span><span id="line-187"> }</span> |
| <span class="source-line-no">188</span><span id="line-188"></span> |
| <span class="source-line-no">189</span><span id="line-189"> @Override</span> |
| <span class="source-line-no">190</span><span id="line-190"> public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {</span> |
| <span class="source-line-no">191</span><span id="line-191"> return trr.getCurrentKey();</span> |
| <span class="source-line-no">192</span><span id="line-192"> }</span> |
| <span class="source-line-no">193</span><span id="line-193"></span> |
| <span class="source-line-no">194</span><span id="line-194"> @Override</span> |
| <span class="source-line-no">195</span><span id="line-195"> public Result getCurrentValue() throws IOException, InterruptedException {</span> |
| <span class="source-line-no">196</span><span id="line-196"> return trr.getCurrentValue();</span> |
| <span class="source-line-no">197</span><span id="line-197"> }</span> |
| <span class="source-line-no">198</span><span id="line-198"></span> |
| <span class="source-line-no">199</span><span id="line-199"> @Override</span> |
| <span class="source-line-no">200</span><span id="line-200"> public float getProgress() throws IOException, InterruptedException {</span> |
| <span class="source-line-no">201</span><span id="line-201"> return trr.getProgress();</span> |
| <span class="source-line-no">202</span><span id="line-202"> }</span> |
| <span class="source-line-no">203</span><span id="line-203"></span> |
| <span class="source-line-no">204</span><span id="line-204"> @Override</span> |
| <span class="source-line-no">205</span><span id="line-205"> public void initialize(InputSplit inputsplit, TaskAttemptContext context)</span> |
| <span class="source-line-no">206</span><span id="line-206"> throws IOException, InterruptedException {</span> |
| <span class="source-line-no">207</span><span id="line-207"> trr.initialize(inputsplit, context);</span> |
| <span class="source-line-no">208</span><span id="line-208"> }</span> |
| <span class="source-line-no">209</span><span id="line-209"></span> |
| <span class="source-line-no">210</span><span id="line-210"> @Override</span> |
| <span class="source-line-no">211</span><span id="line-211"> public boolean nextKeyValue() throws IOException, InterruptedException {</span> |
| <span class="source-line-no">212</span><span id="line-212"> return trr.nextKeyValue();</span> |
| <span class="source-line-no">213</span><span id="line-213"> }</span> |
| <span class="source-line-no">214</span><span id="line-214"> };</span> |
| <span class="source-line-no">215</span><span id="line-215"> }</span> |
| <span class="source-line-no">216</span><span id="line-216"> /** Returns the result of {@link RegionLocator#getStartEndKeys()} for the current locator. */</span> |
| <span class="source-line-no">217</span><span id="line-217"> protected Pair<byte[][], byte[][]> getStartEndKeys() throws IOException {</span> |
| <span class="source-line-no">218</span><span id="line-218"> return getRegionLocator().getStartEndKeys();</span> |
| <span class="source-line-no">219</span><span id="line-219"> }</span> |
| <span class="source-line-no">220</span><span id="line-220"></span> |
| <span class="source-line-no">221</span><span id="line-221"> /**</span> |
| <span class="source-line-no">222</span><span id="line-222"> * Calculates the splits that will serve as input for the map tasks, one per region by default.</span> |
| <span class="source-line-no">223</span><span id="line-223"> * @param context The current job context.</span> |
| <span class="source-line-no">224</span><span id="line-224"> * @return The list of input splits: per-region uniform, auto-balanced, or one per region.</span> |
| <span class="source-line-no">225</span><span id="line-225"> * @throws IOException When no table is available or creating the list of splits fails.</span> |
| <span class="source-line-no">226</span><span id="line-226"> * @see org.apache.hadoop.mapreduce.InputFormat#getSplits( org.apache.hadoop.mapreduce.JobContext)</span> |
| <span class="source-line-no">227</span><span id="line-227"> */</span> |
| <span class="source-line-no">228</span><span id="line-228"> @Override</span> |
| <span class="source-line-no">229</span><span id="line-229"> public List<InputSplit> getSplits(JobContext context) throws IOException {</span> |
| <span class="source-line-no">230</span><span id="line-230"> boolean closeOnFinish = false;</span> |
| <span class="source-line-no">231</span><span id="line-231"></span> |
| <span class="source-line-no">232</span><span id="line-232"> // Subclass may rely on JobConfigurable magic; a table we initialize here is closed on exit.</span> |
| <span class="source-line-no">233</span><span id="line-233"> if (table == null) {</span> |
| <span class="source-line-no">234</span><span id="line-234"> initialize(context);</span> |
| <span class="source-line-no">235</span><span id="line-235"> closeOnFinish = true;</span> |
| <span class="source-line-no">236</span><span id="line-236"> }</span> |
| <span class="source-line-no">237</span><span id="line-237"></span> |
| <span class="source-line-no">238</span><span id="line-238"> // null check in case our child overrides getTable to not throw.</span> |
| <span class="source-line-no">239</span><span id="line-239"> try {</span> |
| <span class="source-line-no">240</span><span id="line-240"> if (getTable() == null) {</span> |
| <span class="source-line-no">241</span><span id="line-241"> // initialize() must not have been implemented in the subclass.</span> |
| <span class="source-line-no">242</span><span id="line-242"> throw new IOException(INITIALIZATION_ERROR);</span> |
| <span class="source-line-no">243</span><span id="line-243"> }</span> |
| <span class="source-line-no">244</span><span id="line-244"> } catch (IllegalStateException exception) {</span> |
| <span class="source-line-no">245</span><span id="line-245"> throw new IOException(INITIALIZATION_ERROR, exception);</span> |
| <span class="source-line-no">246</span><span id="line-246"> }</span> |
| <span class="source-line-no">247</span><span id="line-247"></span> |
| <span class="source-line-no">248</span><span id="line-248"> try {</span> |
| <span class="source-line-no">249</span><span id="line-249"> List<InputSplit> splits = oneInputSplitPerRegion();</span> |
| <span class="source-line-no">250</span><span id="line-250"></span> |
| <span class="source-line-no">251</span><span id="line-251"> // set same number of mappers for each region; this takes precedence over auto-balance</span> |
| <span class="source-line-no">252</span><span id="line-252"> if (context.getConfiguration().get(NUM_MAPPERS_PER_REGION) != null) {</span> |
| <span class="source-line-no">253</span><span id="line-253"> int nSplitsPerRegion = context.getConfiguration().getInt(NUM_MAPPERS_PER_REGION, 1);</span> |
| <span class="source-line-no">254</span><span id="line-254"> List<InputSplit> res = new ArrayList<>();</span> |
| <span class="source-line-no">255</span><span id="line-255"> for (int i = 0; i < splits.size(); i++) {</span> |
| <span class="source-line-no">256</span><span id="line-256"> List<InputSplit> tmp = createNInputSplitsUniform(splits.get(i), nSplitsPerRegion);</span> |
| <span class="source-line-no">257</span><span id="line-257"> res.addAll(tmp);</span> |
| <span class="source-line-no">258</span><span id="line-258"> }</span> |
| <span class="source-line-no">259</span><span id="line-259"> return res;</span> |
| <span class="source-line-no">260</span><span id="line-260"> }</span> |
| <span class="source-line-no">261</span><span id="line-261"></span> |
| <span class="source-line-no">262</span><span id="line-262"> // The default value of "hbase.mapreduce.tif.input.autobalance" is false.</span> |
| <span class="source-line-no">263</span><span id="line-263"> if (context.getConfiguration().getBoolean(MAPREDUCE_INPUT_AUTOBALANCE, false)) {</span> |
| <span class="source-line-no">264</span><span id="line-264"> long maxAveRegionSize =</span> |
| <span class="source-line-no">265</span><span id="line-265"> context.getConfiguration().getLong(MAX_AVERAGE_REGION_SIZE, 8L * 1073741824); // 8GB</span> |
| <span class="source-line-no">266</span><span id="line-266"> return calculateAutoBalancedSplits(splits, maxAveRegionSize);</span> |
| <span class="source-line-no">267</span><span id="line-267"> }</span> |
| <span class="source-line-no">268</span><span id="line-268"></span> |
| <span class="source-line-no">269</span><span id="line-269"> // return one mapper per region</span> |
| <span class="source-line-no">270</span><span id="line-270"> return splits;</span> |
| <span class="source-line-no">271</span><span id="line-271"> } finally {</span> |
| <span class="source-line-no">272</span><span id="line-272"> if (closeOnFinish) {</span> |
| <span class="source-line-no">273</span><span id="line-273"> closeTable();</span> |
| <span class="source-line-no">274</span><span id="line-274"> }</span> |
| <span class="source-line-no">275</span><span id="line-275"> }</span> |
| <span class="source-line-no">276</span><span id="line-276"> }</span> |
| <span class="source-line-no">277</span><span id="line-277"></span> |
| <span class="source-line-no">278</span><span id="line-278"> /**</span> |
| <span class="source-line-no">279</span><span id="line-279"> * Create one InputSplit per region, clipped to the start and stop rows of the scan.</span> |
| <span class="source-line-no">280</span><span id="line-280"> * @return One InputSplit for every included region that overlaps the scan range</span> |
| <span class="source-line-no">281</span><span id="line-281"> * @throws IOException if no region can be located, or region lookup or reverse DNS fails</span> |
| <span class="source-line-no">282</span><span id="line-282"> */</span> |
| <span class="source-line-no">283</span><span id="line-283"> private List<InputSplit> oneInputSplitPerRegion() throws IOException {</span> |
| <span class="source-line-no">284</span><span id="line-284"> if (regionSizeCalculator == null) {</span> |
| <span class="source-line-no">285</span><span id="line-285"> // Initialize here rather than with the other resources because this involves</span> |
| <span class="source-line-no">286</span><span id="line-286"> // a full scan of meta, which can be heavy. We might as well only do it if/when necessary.</span> |
| <span class="source-line-no">287</span><span id="line-287"> regionSizeCalculator = createRegionSizeCalculator(getRegionLocator(), getAdmin());</span> |
| <span class="source-line-no">288</span><span id="line-288"> }</span> |
| <span class="source-line-no">289</span><span id="line-289"></span> |
| <span class="source-line-no">290</span><span id="line-290"> TableName tableName = getTable().getName();</span> |
| <span class="source-line-no">291</span><span id="line-291"></span> |
| <span class="source-line-no">292</span><span id="line-292"> Pair<byte[][], byte[][]> keys = getStartEndKeys();</span> |
| <span class="source-line-no">293</span><span id="line-293"> if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {</span> |
| <span class="source-line-no">294</span><span id="line-294"> HRegionLocation regLoc =</span> |
| <span class="source-line-no">295</span><span id="line-295"> getRegionLocator().getRegionLocation(HConstants.EMPTY_BYTE_ARRAY, false);</span> |
| <span class="source-line-no">296</span><span id="line-296"> if (null == regLoc) {</span> |
| <span class="source-line-no">297</span><span id="line-297"> throw new IOException("Expecting at least one region.");</span> |
| <span class="source-line-no">298</span><span id="line-298"> }</span> |
| <span class="source-line-no">299</span><span id="line-299"> List<InputSplit> splits = new ArrayList<>(1);</span> |
| <span class="source-line-no">300</span><span id="line-300"> long regionSize = regionSizeCalculator.getRegionSize(regLoc.getRegion().getRegionName());</span> |
| <span class="source-line-no">301</span><span id="line-301"> // In the table input format for single table we do not need to</span> |
| <span class="source-line-no">302</span><span id="line-302"> // store the scan object in table split because it can be memory intensive and redundant</span> |
| <span class="source-line-no">303</span><span id="line-303"> // information to what is already stored in conf SCAN. See HBASE-25212</span> |
| <span class="source-line-no">304</span><span id="line-304"> TableSplit split =</span> |
| <span class="source-line-no">305</span><span id="line-305"> new TableSplit(tableName, null, HConstants.EMPTY_BYTE_ARRAY, HConstants.EMPTY_BYTE_ARRAY,</span> |
| <span class="source-line-no">306</span><span id="line-306"> regLoc.getHostnamePort().split(Addressing.HOSTNAME_PORT_SEPARATOR)[0], regionSize);</span> |
| <span class="source-line-no">307</span><span id="line-307"> splits.add(split);</span> |
| <span class="source-line-no">308</span><span id="line-308"> return splits;</span> |
| <span class="source-line-no">309</span><span id="line-309"> }</span> |
| <span class="source-line-no">310</span><span id="line-310"> List<InputSplit> splits = new ArrayList<>(keys.getFirst().length);</span> |
| <span class="source-line-no">311</span><span id="line-311"> for (int i = 0; i < keys.getFirst().length; i++) {</span> |
| <span class="source-line-no">312</span><span id="line-312"> if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {</span> |
| <span class="source-line-no">313</span><span id="line-313"> continue;</span> |
| <span class="source-line-no">314</span><span id="line-314"> }</span> |
| <span class="source-line-no">315</span><span id="line-315"></span> |
| <span class="source-line-no">316</span><span id="line-316"> byte[] startRow = scan.getStartRow();</span> |
| <span class="source-line-no">317</span><span id="line-317"> byte[] stopRow = scan.getStopRow();</span> |
| <span class="source-line-no">318</span><span id="line-318"> // determine if the given start and stop keys fall into the region</span> |
| <span class="source-line-no">319</span><span id="line-319"> if (</span> |
| <span class="source-line-no">320</span><span id="line-320"> (startRow.length == 0 || keys.getSecond()[i].length == 0</span> |
| <span class="source-line-no">321</span><span id="line-321"> || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0)</span> |
| <span class="source-line-no">322</span><span id="line-322"> && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)</span> |
| <span class="source-line-no">323</span><span id="line-323"> ) {</span> |
| <span class="source-line-no">324</span><span id="line-324"> byte[] splitStart =</span> |
| <span class="source-line-no">325</span><span id="line-325"> startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0</span> |
| <span class="source-line-no">326</span><span id="line-326"> ? keys.getFirst()[i]</span> |
| <span class="source-line-no">327</span><span id="line-327"> : startRow;</span> |
| <span class="source-line-no">328</span><span id="line-328"> byte[] splitStop =</span> |
| <span class="source-line-no">329</span><span id="line-329"> (stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0)</span> |
| <span class="source-line-no">330</span><span id="line-330"> && keys.getSecond()[i].length > 0 ? keys.getSecond()[i] : stopRow;</span> |
| <span class="source-line-no">331</span><span id="line-331"></span> |
| <span class="source-line-no">332</span><span id="line-332"> HRegionLocation location = getRegionLocator().getRegionLocation(keys.getFirst()[i], false);</span> |
| <span class="source-line-no">333</span><span id="line-333"> // The below InetSocketAddress creation does a name resolution.</span> |
| <span class="source-line-no">334</span><span id="line-334"> InetSocketAddress isa = new InetSocketAddress(location.getHostname(), location.getPort());</span> |
| <span class="source-line-no">335</span><span id="line-335"> if (isa.isUnresolved()) {</span> |
| <span class="source-line-no">336</span><span id="line-336"> LOG.warn("Failed resolve " + isa);</span> |
| <span class="source-line-no">337</span><span id="line-337"> }</span> |
| <span class="source-line-no">338</span><span id="line-338"> // An unresolved address has no InetAddress (null); use the plain hostname instead of failing.</span> |
| <span class="source-line-no">339</span><span id="line-339"> String regionLocation =</span> |
| <span class="source-line-no">340</span><span id="line-340"> isa.isUnresolved() ? location.getHostname() : reverseDNS(isa.getAddress());</span> |
| <span class="source-line-no">341</span><span id="line-341"></span> |
| <span class="source-line-no">342</span><span id="line-342"> byte[] regionName = location.getRegion().getRegionName();</span> |
| <span class="source-line-no">343</span><span id="line-343"> String encodedRegionName = location.getRegion().getEncodedName();</span> |
| <span class="source-line-no">344</span><span id="line-344"> long regionSize = regionSizeCalculator.getRegionSize(regionName);</span> |
| <span class="source-line-no">345</span><span id="line-345"> // In the table input format for single table we do not need to</span> |
| <span class="source-line-no">346</span><span id="line-346"> // store the scan object in table split because it can be memory intensive and redundant</span> |
| <span class="source-line-no">347</span><span id="line-347"> // information to what is already stored in conf SCAN. See HBASE-25212</span> |
| <span class="source-line-no">348</span><span id="line-348"> TableSplit split = new TableSplit(tableName, null, splitStart, splitStop, regionLocation,</span> |
| <span class="source-line-no">349</span><span id="line-349"> encodedRegionName, regionSize);</span> |
| <span class="source-line-no">350</span><span id="line-350"> splits.add(split);</span> |
| <span class="source-line-no">351</span><span id="line-351"> if (LOG.isDebugEnabled()) {</span> |
| <span class="source-line-no">352</span><span id="line-352"> LOG.debug("getSplits: split -> " + i + " -> " + split);</span> |
| <span class="source-line-no">353</span><span id="line-353"> }</span> |
| <span class="source-line-no">354</span><span id="line-354"> }</span> |
| <span class="source-line-no">355</span><span id="line-355"> }</span> |
| <span class="source-line-no">356</span><span id="line-356"> return splits;</span> |
| <span class="source-line-no">357</span><span id="line-357"> }</span> |
| <span class="source-line-no">358</span><span id="line-358"></span> |
| <span class="source-line-no">359</span><span id="line-359"> /**</span> |
| <span class="source-line-no">360</span><span id="line-360"> * Create n splits for one InputSplit. For now only uniform distribution is supported.</span> |
| <span class="source-line-no">361</span><span id="line-361"> * @param split A TableSplit corresponding to a range of rowkeys</span> |
| <span class="source-line-no">362</span><span id="line-362"> * @param n Number of ranges after splitting (1 means no split); values below 1 are treated as 1</span> |
| <span class="source-line-no">363</span><span id="line-363"> * @return A list of TableSplit, the size of the list is {@code n}</span> |
| <span class="source-line-no">364</span><span id="line-364"> * @throws IllegalArgumentIOException if {@code split} is null or not a TableSplit</span> |
| <span class="source-line-no">365</span><span id="line-365"> */</span> |
| <span class="source-line-no">366</span><span id="line-366"> protected List<InputSplit> createNInputSplitsUniform(InputSplit split, int n)</span> |
| <span class="source-line-no">367</span><span id="line-367"> throws IllegalArgumentIOException {</span> |
| <span class="source-line-no">368</span><span id="line-368"> if (!(split instanceof TableSplit)) { // instanceof is false for null, so null is rejected too</span> |
| <span class="source-line-no">369</span><span id="line-369"> throw new IllegalArgumentIOException(</span> |
| <span class="source-line-no">370</span><span id="line-370"> "InputSplit for CreateNSplitsPerRegion can not be null "</span> |
| <span class="source-line-no">371</span><span id="line-371"> + "and should be instance of TableSplit");</span> |
| <span class="source-line-no">372</span><span id="line-372"> }</span> |
| <span class="source-line-no">373</span><span id="line-373"> // if n < 1, then still continue using n = 1</span> |
| <span class="source-line-no">374</span><span id="line-374"> n = n < 1 ? 1 : n;</span> |
| <span class="source-line-no">375</span><span id="line-375"> List<InputSplit> res = new ArrayList<>(n);</span> |
| <span class="source-line-no">376</span><span id="line-376"> if (n == 1) {</span> |
| <span class="source-line-no">377</span><span id="line-377"> res.add(split);</span> |
| <span class="source-line-no">378</span><span id="line-378"> return res;</span> |
| <span class="source-line-no">379</span><span id="line-379"> }</span> |
| <span class="source-line-no">380</span><span id="line-380"></span> |
| <span class="source-line-no">381</span><span id="line-381"> // Collect Region related information</span> |
| <span class="source-line-no">382</span><span id="line-382"> TableSplit ts = (TableSplit) split;</span> |
| <span class="source-line-no">383</span><span id="line-383"> TableName tableName = ts.getTable();</span> |
| <span class="source-line-no">384</span><span id="line-384"> String regionLocation = ts.getRegionLocation();</span> |
| <span class="source-line-no">385</span><span id="line-385"> String encodedRegionName = ts.getEncodedRegionName();</span> |
| <span class="source-line-no">386</span><span id="line-386"> long regionSize = ts.getLength();</span> |
| <span class="source-line-no">387</span><span id="line-387"> byte[] startRow = ts.getStartRow();</span> |
| <span class="source-line-no">388</span><span id="line-388"> byte[] endRow = ts.getEndRow();</span> |
| <span class="source-line-no">389</span><span id="line-389"></span> |
| <span class="source-line-no">390</span><span id="line-390"> // Special case: startRow or endRow is empty, so substitute explicit boundary keys</span> |
| <span class="source-line-no">391</span><span id="line-391"> if (startRow.length == 0 && endRow.length == 0) {</span> |
| <span class="source-line-no">392</span><span id="line-392"> startRow = new byte[1];</span> |
| <span class="source-line-no">393</span><span id="line-393"> endRow = new byte[1];</span> |
| <span class="source-line-no">394</span><span id="line-394"> startRow[0] = 0;</span> |
| <span class="source-line-no">395</span><span id="line-395"> endRow[0] = -1;</span> |
| <span class="source-line-no">396</span><span id="line-396"> }</span> |
| <span class="source-line-no">397</span><span id="line-397"> if (startRow.length == 0 && endRow.length != 0) {</span> |
| <span class="source-line-no">398</span><span id="line-398"> startRow = new byte[1];</span> |
| <span class="source-line-no">399</span><span id="line-399"> startRow[0] = 0;</span> |
| <span class="source-line-no">400</span><span id="line-400"> }</span> |
| <span class="source-line-no">401</span><span id="line-401"> if (startRow.length != 0 && endRow.length == 0) {</span> |
| <span class="source-line-no">402</span><span id="line-402"> endRow = new byte[startRow.length];</span> |
| <span class="source-line-no">403</span><span id="line-403"> for (int k = 0; k < startRow.length; k++) {</span> |
| <span class="source-line-no">404</span><span id="line-404"> endRow[k] = -1;</span> |
| <span class="source-line-no">405</span><span id="line-405"> }</span> |
| <span class="source-line-no">406</span><span id="line-406"> }</span> |
| <span class="source-line-no">407</span><span id="line-407"></span> |
| <span class="source-line-no">408</span><span id="line-408"> // Split Region into n chunks evenly</span> |
| <span class="source-line-no">409</span><span id="line-409"> byte[][] splitKeys = Bytes.split(startRow, endRow, true, n - 1);</span> |
| <span class="source-line-no">410</span><span id="line-410"> for (int i = 0; i < splitKeys.length - 1; i++) {</span> |
| <span class="source-line-no">411</span><span id="line-411"> // In the table input format for single table we do not need to</span> |
| <span class="source-line-no">412</span><span id="line-412"> // store the scan object in table split because it can be memory intensive and redundant</span> |
| <span class="source-line-no">413</span><span id="line-413"> // information to what is already stored in conf SCAN. See HBASE-25212</span> |
| <span class="source-line-no">414</span><span id="line-414"> // notice that the regionSize parameter may be not very accurate</span> |
| <span class="source-line-no">415</span><span id="line-415"> TableSplit tsplit = new TableSplit(tableName, null, splitKeys[i], splitKeys[i + 1],</span> |
| <span class="source-line-no">416</span><span id="line-416"> regionLocation, encodedRegionName, regionSize / n);</span> |
| <span class="source-line-no">417</span><span id="line-417"> res.add(tsplit);</span> |
| <span class="source-line-no">418</span><span id="line-418"> }</span> |
| <span class="source-line-no">419</span><span id="line-419"> return res;</span> |
| <span class="source-line-no">420</span><span id="line-420"> }</span> |
| <span class="source-line-no">421</span><span id="line-421"></span> |
| <span class="source-line-no">422</span><span id="line-422"> /**</span> |
| <span class="source-line-no">423</span><span id="line-423"> * Calculates the number of MapReduce input splits for the map tasks. The number of MapReduce</span> |
| <span class="source-line-no">424</span><span id="line-424"> * input splits depends on the average region size. Make it 'public' for testing</span> |
| <span class="source-line-no">425</span><span id="line-425"> * @param splits The list of input splits before balance.</span> |
| <span class="source-line-no">426</span><span id="line-426"> * @param maxAverageRegionSize max Average region size for one mapper</span> |
| <span class="source-line-no">427</span><span id="line-427"> * @return The list of input splits.</span> |
| <span class="source-line-no">428</span><span id="line-428"> * @throws IOException When creating the list of splits fails.</span> |
| <span class="source-line-no">429</span><span id="line-429"> * @see org.apache.hadoop.mapreduce.InputFormat#getSplits( org.apache.hadoop.mapreduce.JobContext)</span> |
| <span class="source-line-no">430</span><span id="line-430"> */</span> |
| <span class="source-line-no">431</span><span id="line-431"> public List<InputSplit> calculateAutoBalancedSplits(List<InputSplit> splits,</span> |
| <span class="source-line-no">432</span><span id="line-432"> long maxAverageRegionSize) throws IOException {</span> |
| <span class="source-line-no">433</span><span id="line-433"> if (splits.isEmpty()) {</span> |
| <span class="source-line-no">434</span><span id="line-434"> return splits;</span> |
| <span class="source-line-no">435</span><span id="line-435"> }</span> |
| <span class="source-line-no">436</span><span id="line-436"> List<InputSplit> resultList = new ArrayList<>();</span> |
| <span class="source-line-no">437</span><span id="line-437"> long totalRegionSize = 0;</span> |
| <span class="source-line-no">438</span><span id="line-438"> for (int i = 0; i < splits.size(); i++) {</span> |
| <span class="source-line-no">439</span><span id="line-439"> TableSplit ts = (TableSplit) splits.get(i);</span> |
| <span class="source-line-no">440</span><span id="line-440"> totalRegionSize += ts.getLength();</span> |
| <span class="source-line-no">441</span><span id="line-441"> }</span> |
| <span class="source-line-no">442</span><span id="line-442"> long averageRegionSize = totalRegionSize / splits.size();</span> |
| <span class="source-line-no">443</span><span id="line-443"> // totalRegionSize may have overflowed, and the averageRegionSize must be positive.</span> |
| <span class="source-line-no">444</span><span id="line-444"> if (averageRegionSize <= 0) {</span> |
| <span class="source-line-no">445</span><span id="line-445"> LOG.warn("The averageRegionSize is not positive: " + averageRegionSize + ", "</span> |
| <span class="source-line-no">446</span><span id="line-446"> + "set it to Long.MAX_VALUE / " + splits.size());</span> |
| <span class="source-line-no">447</span><span id="line-447"> averageRegionSize = Long.MAX_VALUE / splits.size();</span> |
| <span class="source-line-no">448</span><span id="line-448"> }</span> |
| <span class="source-line-no">449</span><span id="line-449"> // if averageRegionSize is too big, cap it at the caller-supplied maxAverageRegionSize</span> |
| <span class="source-line-no">450</span><span id="line-450"> if (averageRegionSize > maxAverageRegionSize) {</span> |
| <span class="source-line-no">451</span><span id="line-451"> averageRegionSize = maxAverageRegionSize;</span> |
| <span class="source-line-no">452</span><span id="line-452"> }</span> |
| <span class="source-line-no">453</span><span id="line-453"> // if averageRegionSize is too small, we do not need to allocate more mappers for those 'large'</span> |
| <span class="source-line-no">454</span><span id="line-454"> // region</span> |
| <span class="source-line-no">455</span><span id="line-455"> // set default as 16M = (default hdfs block size) / 4;</span> |
| <span class="source-line-no">456</span><span id="line-456"> if (averageRegionSize < 16 * 1048576) {</span> |
| <span class="source-line-no">457</span><span id="line-457"> return splits;</span> |
| <span class="source-line-no">458</span><span id="line-458"> }</span> |
| <span class="source-line-no">459</span><span id="line-459"> for (int i = 0; i < splits.size(); i++) {</span> |
| <span class="source-line-no">460</span><span id="line-460"> TableSplit ts = (TableSplit) splits.get(i);</span> |
| <span class="source-line-no">461</span><span id="line-461"> TableName tableName = ts.getTable();</span> |
| <span class="source-line-no">462</span><span id="line-462"> String regionLocation = ts.getRegionLocation();</span> |
| <span class="source-line-no">463</span><span id="line-463"> String encodedRegionName = ts.getEncodedRegionName();</span> |
| <span class="source-line-no">464</span><span id="line-464"> long regionSize = ts.getLength();</span> |
| <span class="source-line-no">465</span><span id="line-465"></span> |
| <span class="source-line-no">466</span><span id="line-466"> if (regionSize >= averageRegionSize) {</span> |
| <span class="source-line-no">467</span><span id="line-467"> // split this region into n input splits; n grows with ln(regionSize / averageRegionSize)</span> |
| <span class="source-line-no">468</span><span id="line-468"> int n =</span> |
| <span class="source-line-no">469</span><span id="line-469"> (int) Math.round(Math.log(((double) regionSize) / ((double) averageRegionSize)) + 1.0);</span> |
| <span class="source-line-no">470</span><span id="line-470"> List<InputSplit> temp = createNInputSplitsUniform(ts, n);</span> |
| <span class="source-line-no">471</span><span id="line-471"> resultList.addAll(temp);</span> |
| <span class="source-line-no">472</span><span id="line-472"> } else {</span> |
| <span class="source-line-no">473</span><span id="line-473"> // if the total size of several small continuous regions less than the average region size,</span> |
| <span class="source-line-no">474</span><span id="line-474"> // combine them into one MapReduce input split.</span> |
| <span class="source-line-no">475</span><span id="line-475"> long totalSize = regionSize;</span> |
| <span class="source-line-no">476</span><span id="line-476"> byte[] splitStartKey = ts.getStartRow();</span> |
| <span class="source-line-no">477</span><span id="line-477"> byte[] splitEndKey = ts.getEndRow();</span> |
| <span class="source-line-no">478</span><span id="line-478"> int j = i + 1;</span> |
| <span class="source-line-no">479</span><span id="line-479"> while (j < splits.size()) {</span> |
| <span class="source-line-no">480</span><span id="line-480"> TableSplit nextRegion = (TableSplit) splits.get(j);</span> |
| <span class="source-line-no">481</span><span id="line-481"> long nextRegionSize = nextRegion.getLength();</span> |
| <span class="source-line-no">482</span><span id="line-482"> if (</span> |
| <span class="source-line-no">483</span><span id="line-483"> totalSize + nextRegionSize <= averageRegionSize</span> |
| <span class="source-line-no">484</span><span id="line-484"> && Bytes.equals(splitEndKey, nextRegion.getStartRow())</span> |
| <span class="source-line-no">485</span><span id="line-485"> ) {</span> |
| <span class="source-line-no">486</span><span id="line-486"> totalSize = totalSize + nextRegionSize;</span> |
| <span class="source-line-no">487</span><span id="line-487"> splitEndKey = nextRegion.getEndRow();</span> |
| <span class="source-line-no">488</span><span id="line-488"> j++;</span> |
| <span class="source-line-no">489</span><span id="line-489"> } else {</span> |
| <span class="source-line-no">490</span><span id="line-490"> break;</span> |
| <span class="source-line-no">491</span><span id="line-491"> }</span> |
| <span class="source-line-no">492</span><span id="line-492"> }</span> |
| <span class="source-line-no">493</span><span id="line-493"> i = j - 1;</span> |
| <span class="source-line-no">494</span><span id="line-494"> // In the table input format for single table we do not need to</span> |
| <span class="source-line-no">495</span><span id="line-495"> // store the scan object in table split because it can be memory intensive and redundant</span> |
| <span class="source-line-no">496</span><span id="line-496"> // information to what is already stored in conf SCAN. See HBASE-25212</span> |
| <span class="source-line-no">497</span><span id="line-497"> TableSplit t = new TableSplit(tableName, null, splitStartKey, splitEndKey, regionLocation,</span> |
| <span class="source-line-no">498</span><span id="line-498"> encodedRegionName, totalSize);</span> |
| <span class="source-line-no">499</span><span id="line-499"> resultList.add(t);</span> |
| <span class="source-line-no">500</span><span id="line-500"> }</span> |
| <span class="source-line-no">501</span><span id="line-501"> }</span> |
| <span class="source-line-no">502</span><span id="line-502"> return resultList;</span> |
| <span class="source-line-no">503</span><span id="line-503"> }</span> |
| <span class="source-line-no">504</span><span id="line-504"></span> |
| <span class="source-line-no">505</span><span id="line-505"> String reverseDNS(InetAddress ipAddress) throws UnknownHostException {</span> |
| <span class="source-line-no">506</span><span id="line-506"> // Answer from the cache when this address has been looked up before.</span> |
| <span class="source-line-no">507</span><span id="line-507"> String cached = this.reverseDNSCacheMap.get(ipAddress);</span> |
| <span class="source-line-no">508</span><span id="line-508"> if (cached != null) {</span> |
| <span class="source-line-no">509</span><span id="line-509"> return cached;</span> |
| <span class="source-line-no">510</span><span id="line-510"> }</span> |
| <span class="source-line-no">511</span><span id="line-511"> String pointerName;</span> |
| <span class="source-line-no">512</span><span id="line-512"> try {</span> |
| <span class="source-line-no">513</span><span id="line-513"> pointerName = DNS.reverseDns(ipAddress, null);</span> |
| <span class="source-line-no">514</span><span id="line-514"> } catch (Exception e) {</span> |
| <span class="source-line-no">515</span><span id="line-515"> // The jndi reverse lookup failed (it also copes poorly with ipv6); fall back to InetAddress.</span> |
| <span class="source-line-no">516</span><span id="line-516"> pointerName = InetAddress.getByName(ipAddress.getHostAddress()).getHostName();</span> |
| <span class="source-line-no">517</span><span id="line-517"> }</span> |
| <span class="source-line-no">518</span><span id="line-518"> if (pointerName == null) {</span> |
| <span class="source-line-no">519</span><span id="line-519"> throw new UnknownHostException("No host found for " + ipAddress);</span> |
| <span class="source-line-no">520</span><span id="line-520"> }</span> |
| <span class="source-line-no">521</span><span id="line-521"> String resolved = Strings.domainNamePointerToHostName(pointerName);</span> |
| <span class="source-line-no">522</span><span id="line-522"> this.reverseDNSCacheMap.put(ipAddress, resolved);</span> |
| <span class="source-line-no">523</span><span id="line-523"> return resolved;</span> |
| <span class="source-line-no">524</span><span id="line-524"> }</span> |
| <span class="source-line-no">525</span><span id="line-525"></span> |
| <span class="source-line-no">526</span><span id="line-526"> /**</span> |
| <span class="source-line-no">527</span><span id="line-527"> * Test if the given region is to be included in the InputSplit while splitting the regions of a</span> |
| <span class="source-line-no">528</span><span id="line-528"> * table.</span> |
| <span class="source-line-no">529</span><span id="line-529"> * <p></span> |
| <span class="source-line-no">530</span><span id="line-530"> * This optimization is effective when there is a specific reason to exclude an entire region</span> |
| <span class="source-line-no">531</span><span id="line-531"> * from the M-R job (and hence, not contributing to the InputSplit), given the start and end keys</span> |
| <span class="source-line-no">532</span><span id="line-532"> * of the same. <br></span> |
| <span class="source-line-no">533</span><span id="line-533"> * Useful when we need to remember the last-processed top record and revisit the [last, current)</span> |
| <span class="source-line-no">534</span><span id="line-534"> * interval for M-R processing, continuously. In addition to reducing InputSplits, reduces the</span> |
| <span class="source-line-no">535</span><span id="line-535"> * load on the region server as well, due to the ordering of the keys. <br></span> |
| <span class="source-line-no">536</span><span id="line-536"> * <br></span> |
| <span class="source-line-no">537</span><span id="line-537"> * Note: It is possible that <code>endKey.length == 0</code>, for the last (recent) region.</span> |
| <span class="source-line-no">538</span><span id="line-538"> * <br></span> |
| <span class="source-line-no">539</span><span id="line-539"> * Override this method, if you want to bulk exclude regions altogether from M-R. By default, no</span> |
| <span class="source-line-no">540</span><span id="line-540"> * region is excluded (i.e. all regions are included).</span> |
| <span class="source-line-no">541</span><span id="line-541"> * @param startKey Start key of the region</span> |
| <span class="source-line-no">542</span><span id="line-542"> * @param endKey End key of the region</span> |
| <span class="source-line-no">543</span><span id="line-543"> * @return true, if this region needs to be included as part of the input (default).</span> |
| <span class="source-line-no">544</span><span id="line-544"> */</span> |
| <span class="source-line-no">545</span><span id="line-545"> protected boolean includeRegionInSplit(final byte[] startKey, final byte[] endKey) {</span> |
| <span class="source-line-no">546</span><span id="line-546"> return true;</span> |
| <span class="source-line-no">547</span><span id="line-547"> }</span> |
| <span class="source-line-no">548</span><span id="line-548"></span> |
| <span class="source-line-no">549</span><span id="line-549"> /**</span> |
| <span class="source-line-no">550</span><span id="line-550"> * Gives subclasses the {@link RegionLocator}; throws IllegalStateException while it is unset.</span> |
| <span class="source-line-no">551</span><span id="line-551"> */</span> |
| <span class="source-line-no">552</span><span id="line-552"> protected RegionLocator getRegionLocator() {</span> |
| <span class="source-line-no">553</span><span id="line-553"> if (regionLocator != null) {</span> |
| <span class="source-line-no">554</span><span id="line-554"> return regionLocator;</span> |
| <span class="source-line-no">555</span><span id="line-555"> }</span> |
| <span class="source-line-no">556</span><span id="line-556"> throw new IllegalStateException(NOT_INITIALIZED);</span> |
| <span class="source-line-no">557</span><span id="line-557"> }</span> |
| <span class="source-line-no">558</span><span id="line-558"></span> |
| <span class="source-line-no">559</span><span id="line-559"> /**</span> |
| <span class="source-line-no">560</span><span id="line-560"> * Gives subclasses the {@link Table}; throws IllegalStateException while it is unset.</span> |
| <span class="source-line-no">561</span><span id="line-561"> */</span> |
| <span class="source-line-no">562</span><span id="line-562"> protected Table getTable() {</span> |
| <span class="source-line-no">563</span><span id="line-563"> if (table != null) {</span> |
| <span class="source-line-no">564</span><span id="line-564"> return table;</span> |
| <span class="source-line-no">565</span><span id="line-565"> }</span> |
| <span class="source-line-no">566</span><span id="line-566"> throw new IllegalStateException(NOT_INITIALIZED);</span> |
| <span class="source-line-no">567</span><span id="line-567"> }</span> |
| <span class="source-line-no">568</span><span id="line-568"></span> |
| <span class="source-line-no">569</span><span id="line-569"> /**</span> |
| <span class="source-line-no">570</span><span id="line-570"> * Gives subclasses the {@link Admin}; throws IllegalStateException while it is unset.</span> |
| <span class="source-line-no">571</span><span id="line-571"> */</span> |
| <span class="source-line-no">572</span><span id="line-572"> protected Admin getAdmin() {</span> |
| <span class="source-line-no">573</span><span id="line-573"> if (admin != null) {</span> |
| <span class="source-line-no">574</span><span id="line-574"> return admin;</span> |
| <span class="source-line-no">575</span><span id="line-575"> }</span> |
| <span class="source-line-no">576</span><span id="line-576"> throw new IllegalStateException(NOT_INITIALIZED);</span> |
| <span class="source-line-no">577</span><span id="line-577"> }</span> |
| <span class="source-line-no">578</span><span id="line-578"></span> |
| <span class="source-line-no">579</span><span id="line-579"> /**</span> |
| <span class="source-line-no">580</span><span id="line-580"> * Allows subclasses to set the table, region locator, admin and connection from a Connection.</span> |
| <span class="source-line-no">581</span><span id="line-581"> * @param connection The Connection to the HBase cluster. MUST be unmanaged. We will close.</span> |
| <span class="source-line-no">582</span><span id="line-582"> * @param tableName The {@link TableName} of the table to process.</span> |
| <span class="source-line-no">583</span><span id="line-583"> */</span> |
| <span class="source-line-no">584</span><span id="line-584"> protected void initializeTable(Connection connection, TableName tableName) throws IOException {</span> |
| <span class="source-line-no">585</span><span id="line-585"> if (this.table != null || this.connection != null) {</span> |
| <span class="source-line-no">586</span><span id="line-586"> LOG.warn("initializeTable called multiple times. Overwriting connection and table "</span> |
| <span class="source-line-no">587</span><span id="line-587"> + "reference; TableInputFormatBase will not close these old references when done.");</span> |
| <span class="source-line-no">588</span><span id="line-588"> }</span> |
| <span class="source-line-no">589</span><span id="line-589"> this.table = connection.getTable(tableName);</span> |
| <span class="source-line-no">590</span><span id="line-590"> this.regionLocator = connection.getRegionLocator(tableName);</span> |
| <span class="source-line-no">591</span><span id="line-591"> this.admin = connection.getAdmin();</span> |
| <span class="source-line-no">592</span><span id="line-592"> this.connection = connection;</span> |
| <span class="source-line-no">593</span><span id="line-593"> this.regionSizeCalculator = null; // recreated lazily by oneInputSplitPerRegion when null</span> |
| <span class="source-line-no">594</span><span id="line-594"> }</span> |
| <span class="source-line-no">595</span><span id="line-595"> /** Creates a new {@link RegionSizeCalculator} from the given locator and admin. */</span> |
| <span class="source-line-no">596</span><span id="line-596"> @InterfaceAudience.Private</span> |
| <span class="source-line-no">597</span><span id="line-597"> protected RegionSizeCalculator createRegionSizeCalculator(RegionLocator locator, Admin admin)</span> |
| <span class="source-line-no">598</span><span id="line-598"> throws IOException {</span> |
| <span class="source-line-no">599</span><span id="line-599"> return new RegionSizeCalculator(locator, admin);</span> |
| <span class="source-line-no">600</span><span id="line-600"> }</span> |
| <span class="source-line-no">601</span><span id="line-601"></span> |
| <span class="source-line-no">602</span><span id="line-602"> /**</span> |
| <span class="source-line-no">603</span><span id="line-603"> * Gets the scan defining what is read (columns etc.); a new empty Scan is created if none is set.</span> |
| <span class="source-line-no">604</span><span id="line-604"> * @return The internal scan instance, created on demand by this call when it was still null.</span> |
| <span class="source-line-no">605</span><span id="line-605"> */</span> |
| <span class="source-line-no">606</span><span id="line-606"> public Scan getScan() {</span> |
| <span class="source-line-no">607</span><span id="line-607"> if (this.scan == null) this.scan = new Scan();</span> |
| <span class="source-line-no">608</span><span id="line-608"> return scan;</span> |
| <span class="source-line-no">609</span><span id="line-609"> }</span> |
| <span class="source-line-no">610</span><span id="line-610"></span> |
| <span class="source-line-no">611</span><span id="line-611"> /**</span> |
| <span class="source-line-no">612</span><span id="line-612"> * Replaces the scan that defines what is read (columns etc.).</span> |
| <span class="source-line-no">613</span><span id="line-613"> * @param scan The scan to use from now on; the reference is stored as-is, without copying.</span> |
| <span class="source-line-no">614</span><span id="line-614"> */</span> |
| <span class="source-line-no">615</span><span id="line-615"> public void setScan(Scan scan) {</span> |
| <span class="source-line-no">616</span><span id="line-616"> this.scan = scan;</span> |
| <span class="source-line-no">617</span><span id="line-617"> }</span> |
| <span class="source-line-no">618</span><span id="line-618"></span> |
| <span class="source-line-no">619</span><span id="line-619"> /**</span> |
| <span class="source-line-no">620</span><span id="line-620"> * Allows subclasses to set the {@link TableRecordReader}; the reference is stored as-is.</span> |
| <span class="source-line-no">621</span><span id="line-621"> * @param tableRecordReader A different {@link TableRecordReader} implementation.</span> |
| <span class="source-line-no">622</span><span id="line-622"> */</span> |
| <span class="source-line-no">623</span><span id="line-623"> protected void setTableRecordReader(TableRecordReader tableRecordReader) {</span> |
| <span class="source-line-no">624</span><span id="line-624"> this.tableRecordReader = tableRecordReader;</span> |
| <span class="source-line-no">625</span><span id="line-625"> }</span> |
| <span class="source-line-no">626</span><span id="line-626"></span> |
| <span class="source-line-no">627</span><span id="line-627"> /**</span> |
| <span class="source-line-no">628</span><span id="line-628"> * Handle subclass-specific setup; the implementation here is empty. Each of the entry points used</span> |
| <span class="source-line-no">629</span><span id="line-629"> * by the MapReduce framework, {@link #createRecordReader(InputSplit, TaskAttemptContext)} and</span> |
| <span class="source-line-no">630</span><span id="line-630"> * {@link #getSplits(JobContext)}, will call {@link #initialize(JobContext)} as a convenient</span> |
| <span class="source-line-no">631</span><span id="line-631"> * centralized location to handle retrieving the necessary configuration information and calling</span> |
| <span class="source-line-no">632</span><span id="line-632"> * {@link #initializeTable(Connection, TableName)}. Subclasses should implement their initialize</span> |
| <span class="source-line-no">633</span><span id="line-633"> * call such that it is safe to call multiple times. The current TableInputFormatBase</span> |
| <span class="source-line-no">634</span><span id="line-634"> * implementation relies on a non-null table reference to decide if an initialize call is needed,</span> |
| <span class="source-line-no">635</span><span id="line-635"> * but this behavior may change in the future. In particular, it is critical that initializeTable</span> |
| <span class="source-line-no">636</span><span id="line-636"> * not be called multiple times since this will leak Connection instances.</span> |
| <span class="source-line-no">637</span><span id="line-637"> */</span> |
| <span class="source-line-no">638</span><span id="line-638"> protected void initialize(JobContext context) throws IOException {</span> |
| <span class="source-line-no">639</span><span id="line-639"> }</span> |
| <span class="source-line-no">640</span><span id="line-640"></span> |
| <span class="source-line-no">641</span><span id="line-641"> /**</span> |
| <span class="source-line-no">642</span><span id="line-642"> * Closes what {@link #initializeTable(Connection, TableName)} set up (admin, table, region locator,</span> |
| <span class="source-line-no">643</span><span id="line-643"> * connection), then nulls them and regionSizeCalculator; if closing throws, nothing is nulled.</span> |
| <span class="source-line-no">644</span><span id="line-644"> */</span> |
| <span class="source-line-no">645</span><span id="line-645"> protected void closeTable() throws IOException {</span> |
| <span class="source-line-no">646</span><span id="line-646"> close(admin, table, regionLocator, connection);</span> |
| <span class="source-line-no">647</span><span id="line-647"> admin = null;</span> |
| <span class="source-line-no">648</span><span id="line-648"> table = null;</span> |
| <span class="source-line-no">649</span><span id="line-649"> regionLocator = null;</span> |
| <span class="source-line-no">650</span><span id="line-650"> connection = null;</span> |
| <span class="source-line-no">651</span><span id="line-651"> regionSizeCalculator = null;</span> |
| <span class="source-line-no">652</span><span id="line-652"> }</span> |
| <span class="source-line-no">653</span><span id="line-653"> /** Closes every non-null argument even if one fails; throws the first IOException, rest suppressed. */</span> |
| <span class="source-line-no">654</span><span id="line-654"> private void close(Closeable... closables) throws IOException {</span> |
| <span class="source-line-no">655</span><span id="line-655"> IOException first = null;</span> |
| <span class="source-line-no">656</span><span id="line-656"> for (Closeable c : closables) {</span> |
| <span class="source-line-no">657</span><span id="line-657"> try { if (c != null) c.close(); }</span> |
| <span class="source-line-no">658</span><span id="line-658"> catch (IOException e) { if (first == null) first = e; else if (e != first) first.addSuppressed(e); }</span> |
| <span class="source-line-no">659</span><span id="line-659"> }</span> |
| <span class="source-line-no">660</span><span id="line-660"> if (first != null) throw first;</span> |
| <span class="source-line-no">661</span><span id="line-661"> }</span> |
| <span class="source-line-no">662</span><span id="line-662">}</span> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </pre> |
| </div> |
| </main> |
| </body> |
| </html> |