| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.hadoop.hdfs.server.datanode; |
| |
| import java.io.BufferedInputStream; |
| import java.io.DataInputStream; |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.FileNotFoundException; |
| import java.io.FileOutputStream; |
| import java.io.FilenameFilter; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.RandomAccessFile; |
| import java.nio.channels.FileChannel; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Random; |
| |
| import javax.management.NotCompliantMBeanException; |
| import javax.management.ObjectName; |
| import javax.management.StandardMBean; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.DF; |
| import org.apache.hadoop.fs.DU; |
| import org.apache.hadoop.fs.FileUtil; |
| import org.apache.hadoop.hdfs.protocol.Block; |
| import org.apache.hadoop.hdfs.protocol.BlockListAsLongs; |
| import org.apache.hadoop.hdfs.protocol.DatanodeInfo; |
| import org.apache.hadoop.hdfs.protocol.FSConstants; |
| import org.apache.hadoop.hdfs.protocol.LocatedBlock; |
| import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException; |
| import org.apache.hadoop.hdfs.server.datanode.metrics.FSDatasetMBean; |
| import org.apache.hadoop.hdfs.server.protocol.InterDatanodeProtocol; |
| import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo; |
| import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock; |
| import org.apache.hadoop.metrics.util.MBeanUtil; |
| import org.apache.hadoop.util.DataChecksum; |
| import org.apache.hadoop.util.DiskChecker; |
| import org.apache.hadoop.util.StringUtils; |
| import org.apache.hadoop.util.DiskChecker.DiskErrorException; |
| import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException; |
| import org.apache.hadoop.hdfs.server.common.GenerationStamp; |
| import org.apache.hadoop.hdfs.server.common.HdfsConstants.ReplicaState; |
| import org.apache.hadoop.io.IOUtils; |
| |
| /************************************************** |
| * FSDataset manages a set of data blocks. Each block |
| * has a unique name and an extent on disk. |
| * |
| ***************************************************/ |
| public class FSDataset implements FSConstants, FSDatasetInterface { |
| |
| |
| /** |
| * A node type that can be built into a tree reflecting the |
| * hierarchy of blocks on the local disk. |
| */ |
| class FSDir { |
| File dir; |
| int numBlocks = 0; |
| FSDir children[]; |
| int lastChildIdx = 0; |
| /** |
| */ |
| public FSDir(File dir) |
| throws IOException { |
| this.dir = dir; |
| this.children = null; |
| if (!dir.exists()) { |
| if (!dir.mkdirs()) { |
| throw new IOException("Mkdirs failed to create " + |
| dir.toString()); |
| } |
| } else { |
| File[] files = dir.listFiles(); |
| int numChildren = 0; |
| for (int idx = 0; idx < files.length; idx++) { |
| if (files[idx].isDirectory()) { |
| numChildren++; |
| } else if (Block.isBlockFilename(files[idx])) { |
| numBlocks++; |
| } |
| } |
| if (numChildren > 0) { |
| children = new FSDir[numChildren]; |
| int curdir = 0; |
| for (int idx = 0; idx < files.length; idx++) { |
| if (files[idx].isDirectory()) { |
| children[curdir] = new FSDir(files[idx]); |
| curdir++; |
| } |
| } |
| } |
| } |
| } |
| |
| public File addBlock(Block b, File src) throws IOException { |
| //First try without creating subdirectories |
| File file = addBlock(b, src, false, false); |
| return (file != null) ? file : addBlock(b, src, true, true); |
| } |
| |
| private File addBlock(Block b, File src, boolean createOk, |
| boolean resetIdx) throws IOException { |
| if (numBlocks < maxBlocksPerDir) { |
| File dest = new File(dir, b.getBlockName()); |
| File metaData = getMetaFile( src, b ); |
| File newmeta = getMetaFile(dest, b); |
| if ( ! metaData.renameTo( newmeta ) || |
| ! src.renameTo( dest ) ) { |
| throw new IOException( "could not move files for " + b + |
| " from " + src + " to " + |
| dest.getAbsolutePath() + " or from" |
| + metaData + " to " + newmeta); |
| } |
| if (DataNode.LOG.isDebugEnabled()) { |
| DataNode.LOG.debug("addBlock: Moved " + metaData + " to " + newmeta); |
| DataNode.LOG.debug("addBlock: Moved " + src + " to " + dest); |
| } |
| |
| numBlocks += 1; |
| return dest; |
| } |
| |
| if (lastChildIdx < 0 && resetIdx) { |
| //reset so that all children will be checked |
| lastChildIdx = random.nextInt(children.length); |
| } |
| |
| if (lastChildIdx >= 0 && children != null) { |
| //Check if any child-tree has room for a block. |
| for (int i=0; i < children.length; i++) { |
| int idx = (lastChildIdx + i)%children.length; |
| File file = children[idx].addBlock(b, src, false, resetIdx); |
| if (file != null) { |
| lastChildIdx = idx; |
| return file; |
| } |
| } |
| lastChildIdx = -1; |
| } |
| |
| if (!createOk) { |
| return null; |
| } |
| |
| if (children == null || children.length == 0) { |
| children = new FSDir[maxBlocksPerDir]; |
| for (int idx = 0; idx < maxBlocksPerDir; idx++) { |
| children[idx] = new FSDir(new File(dir, DataStorage.BLOCK_SUBDIR_PREFIX+idx)); |
| } |
| } |
| |
| //now pick a child randomly for creating a new set of subdirs. |
| lastChildIdx = random.nextInt(children.length); |
| return children[ lastChildIdx ].addBlock(b, src, true, false); |
| } |
| |
| void getVolumeMap(ReplicasMap volumeMap, FSVolume volume) |
| throws IOException { |
| if (children != null) { |
| for (int i = 0; i < children.length; i++) { |
| children[i].getVolumeMap(volumeMap, volume); |
| } |
| } |
| |
| recoverTempUnlinkedBlock(); |
| volume.addToReplicasMap(volumeMap, dir, true); |
| } |
| |
| /** |
| * Recover unlinked tmp files on datanode restart. If the original block |
| * does not exist, then the tmp file is renamed to be the |
| * original file name; otherwise the tmp file is deleted. |
| */ |
| private void recoverTempUnlinkedBlock() throws IOException { |
| File files[] = dir.listFiles(); |
| for (File file : files) { |
| if (!FSDataset.isUnlinkTmpFile(file)) { |
| continue; |
| } |
| File blockFile = getOrigFile(file); |
| if (blockFile.exists()) { |
| // |
| // If the original block file still exists, then no recovery |
| // is needed. |
| // |
| if (!file.delete()) { |
| throw new IOException("Unable to cleanup unlinked tmp file " + |
| file); |
| } |
| } else { |
| if (!file.renameTo(blockFile)) { |
| throw new IOException("Unable to cleanup detached file " + |
| file); |
| } |
| } |
| } |
| } |
| |
| /** |
| * check if a data diretory is healthy |
| * @throws DiskErrorException |
| */ |
| public void checkDirTree() throws DiskErrorException { |
| DiskChecker.checkDir(dir); |
| |
| if (children != null) { |
| for (int i = 0; i < children.length; i++) { |
| children[i].checkDirTree(); |
| } |
| } |
| } |
| |
| void clearPath(File f) { |
| String root = dir.getAbsolutePath(); |
| String dir = f.getAbsolutePath(); |
| if (dir.startsWith(root)) { |
| String[] dirNames = dir.substring(root.length()). |
| split(File.separator + "subdir"); |
| if (clearPath(f, dirNames, 1)) |
| return; |
| } |
| clearPath(f, null, -1); |
| } |
| |
| /* |
| * dirNames is an array of string integers derived from |
| * usual directory structure data/subdirN/subdirXY/subdirM ... |
| * If dirName array is non-null, we only check the child at |
| * the children[dirNames[idx]]. This avoids iterating over |
| * children in common case. If directory structure changes |
| * in later versions, we need to revisit this. |
| */ |
| private boolean clearPath(File f, String[] dirNames, int idx) { |
| if ((dirNames == null || idx == dirNames.length) && |
| dir.compareTo(f) == 0) { |
| numBlocks--; |
| return true; |
| } |
| |
| if (dirNames != null) { |
| //guess the child index from the directory name |
| if (idx > (dirNames.length - 1) || children == null) { |
| return false; |
| } |
| int childIdx; |
| try { |
| childIdx = Integer.parseInt(dirNames[idx]); |
| } catch (NumberFormatException ignored) { |
| // layout changed? we could print a warning. |
| return false; |
| } |
| return (childIdx >= 0 && childIdx < children.length) ? |
| children[childIdx].clearPath(f, dirNames, idx+1) : false; |
| } |
| |
| //guesses failed. back to blind iteration. |
| if (children != null) { |
| for(int i=0; i < children.length; i++) { |
| if (children[i].clearPath(f, null, -1)){ |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
| public String toString() { |
| return "FSDir{" + |
| "dir=" + dir + |
| ", children=" + (children == null ? null : Arrays.asList(children)) + |
| "}"; |
| } |
| } |
| |
| class FSVolume { |
| private File currentDir; |
| private FSDir dataDir; // directory store Finalized replica |
| private File rbwDir; // directory store RBW replica |
| private File tmpDir; // directory store Temporary replica |
| private DF usage; |
| private DU dfsUsage; |
| private long reserved; |
| |
| |
| FSVolume(File currentDir, Configuration conf) throws IOException { |
| this.reserved = conf.getLong("dfs.datanode.du.reserved", 0); |
| this.currentDir = currentDir; |
| File parent = currentDir.getParentFile(); |
| final File finalizedDir = new File( |
| currentDir, DataStorage.STORAGE_DIR_FINALIZED); |
| |
| // Files that were being written when the datanode was last shutdown |
| // are now moved back to the data directory. It is possible that |
| // in the future, we might want to do some sort of datanode-local |
| // recovery for these blocks. For example, crc validation. |
| // |
| this.tmpDir = new File(parent, "tmp"); |
| if (tmpDir.exists()) { |
| FileUtil.fullyDelete(tmpDir); |
| } |
| this.rbwDir = new File(currentDir, DataStorage.STORAGE_DIR_RBW); |
| if (rbwDir.exists() && !supportAppends) { |
| FileUtil.fullyDelete(rbwDir); |
| } |
| this.dataDir = new FSDir(finalizedDir); |
| if (!rbwDir.mkdirs()) { // create rbw directory if not exist |
| if (!rbwDir.isDirectory()) { |
| throw new IOException("Mkdirs failed to create " + rbwDir.toString()); |
| } |
| } |
| if (!tmpDir.mkdirs()) { |
| if (!tmpDir.isDirectory()) { |
| throw new IOException("Mkdirs failed to create " + tmpDir.toString()); |
| } |
| } |
| this.usage = new DF(parent, conf); |
| this.dfsUsage = new DU(parent, conf); |
| this.dfsUsage.start(); |
| } |
| |
| File getCurrentDir() { |
| return currentDir; |
| } |
| |
| void decDfsUsed(long value) { |
| // The caller to this method (BlockFileDeleteTask.run()) does |
| // not have locked FSDataset.this yet. |
| synchronized(FSDataset.this) { |
| dfsUsage.decDfsUsed(value); |
| } |
| } |
| |
| long getDfsUsed() throws IOException { |
| return dfsUsage.getUsed(); |
| } |
| |
| /** |
| * Calculate the capacity of the filesystem, after removing any |
| * reserved capacity. |
| * @return the unreserved number of bytes left in this filesystem. May be zero. |
| */ |
| long getCapacity() throws IOException { |
| long remaining = usage.getCapacity() - reserved; |
| return remaining > 0 ? remaining : 0; |
| } |
| |
| long getAvailable() throws IOException { |
| long remaining = getCapacity()-getDfsUsed(); |
| long available = usage.getAvailable(); |
| if (remaining>available) { |
| remaining = available; |
| } |
| return (remaining > 0) ? remaining : 0; |
| } |
| |
| String getMount() throws IOException { |
| return usage.getMount(); |
| } |
| |
| File getDir() { |
| return dataDir.dir; |
| } |
| |
| /** |
| * Temporary files. They get moved to the finalized block directory when |
| * the block is finalized. |
| */ |
| File createTmpFile(Block b) throws IOException { |
| File f = new File(tmpDir, b.getBlockName()); |
| return FSDataset.createTmpFile(b, f); |
| } |
| |
| /** |
| * RBW files. They get moved to the finalized block directory when |
| * the block is finalized. |
| */ |
| File createRbwFile(Block b) throws IOException { |
| File f = new File(rbwDir, b.getBlockName()); |
| return FSDataset.createTmpFile(b, f); |
| } |
| |
| File addBlock(Block b, File f) throws IOException { |
| File blockFile = dataDir.addBlock(b, f); |
| File metaFile = getMetaFile( blockFile , b); |
| dfsUsage.incDfsUsed(b.getNumBytes()+metaFile.length()); |
| return blockFile; |
| } |
| |
| void checkDirs() throws DiskErrorException { |
| dataDir.checkDirTree(); |
| DiskChecker.checkDir(tmpDir); |
| DiskChecker.checkDir(rbwDir); |
| } |
| |
| void getVolumeMap(ReplicasMap volumeMap) throws IOException { |
| // add finalized replicas |
| dataDir.getVolumeMap(volumeMap, this); |
| // add rbw replicas |
| addToReplicasMap(volumeMap, rbwDir, false); |
| } |
| |
| /** |
| * Add replicas under the given directory to the volume map |
| * @param volumeMap the replicas map |
| * @param dir an input directory |
| * @param isFinalized true if the directory has finalized replicas; |
| * false if the directory has rbw replicas |
| */ |
| private void addToReplicasMap(ReplicasMap volumeMap, |
| File dir, boolean isFinalized) { |
| File blockFiles[] = dir.listFiles(); |
| for (File blockFile : blockFiles) { |
| if (!Block.isBlockFilename(blockFile)) |
| continue; |
| |
| long genStamp = getGenerationStampFromFile(blockFiles, blockFile); |
| long blockId = Block.filename2id(blockFile.getName()); |
| ReplicaInfo newReplica = null; |
| if (isFinalized) { |
| newReplica = new FinalizedReplica(blockId, |
| blockFile.length(), genStamp, this, blockFile.getParentFile()); |
| } else { |
| newReplica = new ReplicaWaitingToBeRecovered(blockId, |
| validateIntegrity(blockFile, genStamp), |
| genStamp, this, blockFile.getParentFile()); |
| } |
| |
| ReplicaInfo oldReplica = volumeMap.add(newReplica); |
| if (oldReplica != null) { |
| DataNode.LOG.warn("Two block files with the same block id exist " + |
| "on disk: " + oldReplica.getBlockFile() + |
| " and " + blockFile ); |
| } |
| } |
| } |
| |
| /** |
| * Find out the number of bytes in the block that match its crc. |
| * |
| * This algorithm assumes that data corruption caused by unexpected |
| * datanode shutdown occurs only in the last crc chunk. So it checks |
| * only the last chunk. |
| * |
| * @param blockFile the block file |
| * @param genStamp generation stamp of the block |
| * @return the number of valid bytes |
| */ |
| private long validateIntegrity(File blockFile, long genStamp) { |
| DataInputStream checksumIn = null; |
| InputStream blockIn = null; |
| try { |
| File metaFile = new File(getMetaFileName(blockFile.toString(), genStamp)); |
| long blockFileLen = blockFile.length(); |
| long metaFileLen = metaFile.length(); |
| int crcHeaderLen = DataChecksum.getChecksumHeaderSize(); |
| if (!blockFile.exists() || blockFileLen == 0 || |
| !metaFile.exists() || metaFileLen < (long)crcHeaderLen) { |
| return 0; |
| } |
| checksumIn = new DataInputStream( |
| new BufferedInputStream(new FileInputStream(metaFile), |
| BUFFER_SIZE)); |
| |
| // read and handle the common header here. For now just a version |
| BlockMetadataHeader header = BlockMetadataHeader.readHeader(checksumIn); |
| short version = header.getVersion(); |
| if (version != FSDataset.METADATA_VERSION) { |
| DataNode.LOG.warn("Wrong version (" + version + ") for metadata file " |
| + metaFile + " ignoring ..."); |
| } |
| DataChecksum checksum = header.getChecksum(); |
| int bytesPerChecksum = checksum.getBytesPerChecksum(); |
| int checksumSize = checksum.getChecksumSize(); |
| long numChunks = Math.min( |
| (blockFileLen + bytesPerChecksum - 1)/bytesPerChecksum, |
| (metaFileLen - crcHeaderLen)/checksumSize); |
| if (numChunks == 0) { |
| return 0; |
| } |
| IOUtils.skipFully(checksumIn, (numChunks-1)*checksumSize); |
| blockIn = new FileInputStream(blockFile); |
| long lastChunkStartPos = (numChunks-1)*bytesPerChecksum; |
| IOUtils.skipFully(blockIn, lastChunkStartPos); |
| int lastChunkSize = (int)Math.min( |
| bytesPerChecksum, blockFileLen-lastChunkStartPos); |
| byte[] buf = new byte[lastChunkSize+checksumSize]; |
| checksumIn.readFully(buf, lastChunkSize, checksumSize); |
| IOUtils.readFully(blockIn, buf, 0, lastChunkSize); |
| |
| checksum.update(buf, 0, lastChunkSize); |
| if (checksum.compare(buf, lastChunkSize)) { // last chunk matches crc |
| return lastChunkStartPos + lastChunkSize; |
| } else { // last chunck is corrupt |
| return lastChunkStartPos; |
| } |
| } catch (IOException e) { |
| DataNode.LOG.warn(e); |
| return 0; |
| } finally { |
| IOUtils.closeStream(checksumIn); |
| IOUtils.closeStream(blockIn); |
| } |
| } |
| |
| void clearPath(File f) { |
| dataDir.clearPath(f); |
| } |
| |
| public String toString() { |
| return getDir().getAbsolutePath(); |
| } |
| } |
| |
| static class FSVolumeSet { |
| FSVolume[] volumes = null; |
| int curVolume = 0; |
| |
| FSVolumeSet(FSVolume[] volumes) { |
| this.volumes = volumes; |
| } |
| |
| private int numberOfVolumes() { |
| return volumes.length; |
| } |
| |
| synchronized FSVolume getNextVolume(long blockSize) throws IOException { |
| |
| if(volumes.length < 1) { |
| throw new DiskOutOfSpaceException("No more available volumes"); |
| } |
| |
| // since volumes could've been removed because of the failure |
| // make sure we are not out of bounds |
| if(curVolume >= volumes.length) { |
| curVolume = 0; |
| } |
| |
| int startVolume = curVolume; |
| |
| while (true) { |
| FSVolume volume = volumes[curVolume]; |
| curVolume = (curVolume + 1) % volumes.length; |
| if (volume.getAvailable() > blockSize) { return volume; } |
| if (curVolume == startVolume) { |
| throw new DiskOutOfSpaceException("Insufficient space for an additional block"); |
| } |
| } |
| } |
| |
| long getDfsUsed() throws IOException { |
| long dfsUsed = 0L; |
| for (int idx = 0; idx < volumes.length; idx++) { |
| dfsUsed += volumes[idx].getDfsUsed(); |
| } |
| return dfsUsed; |
| } |
| |
| long getCapacity() throws IOException { |
| long capacity = 0L; |
| for (int idx = 0; idx < volumes.length; idx++) { |
| capacity += volumes[idx].getCapacity(); |
| } |
| return capacity; |
| } |
| |
| long getRemaining() throws IOException { |
| long remaining = 0L; |
| for (int idx = 0; idx < volumes.length; idx++) { |
| remaining += volumes[idx].getAvailable(); |
| } |
| return remaining; |
| } |
| |
| synchronized void getVolumeMap(ReplicasMap volumeMap) throws IOException { |
| for (int idx = 0; idx < volumes.length; idx++) { |
| volumes[idx].getVolumeMap(volumeMap); |
| } |
| } |
| |
| /** |
| * goes over all the volumes and checkDir eachone of them |
| * if one throws DiskErrorException - removes from the list of active |
| * volumes. |
| * @return list of all the removed volumes |
| */ |
| synchronized List<FSVolume> checkDirs() { |
| |
| ArrayList<FSVolume> removed_vols = null; |
| |
| for (int idx = 0; idx < volumes.length; idx++) { |
| FSVolume fsv = volumes[idx]; |
| try { |
| fsv.checkDirs(); |
| } catch (DiskErrorException e) { |
| DataNode.LOG.warn("Removing failed volume " + fsv + ": ",e); |
| if(removed_vols == null) { |
| removed_vols = new ArrayList<FSVolume>(1); |
| } |
| removed_vols.add(volumes[idx]); |
| volumes[idx] = null; //remove the volume |
| } |
| } |
| |
| // repair array - copy non null elements |
| int removed_size = (removed_vols==null)? 0 : removed_vols.size(); |
| if(removed_size > 0) { |
| FSVolume fsvs[] = new FSVolume [volumes.length-removed_size]; |
| for(int idx=0,idy=0; idx<volumes.length; idx++) { |
| if(volumes[idx] != null) { |
| fsvs[idy] = volumes[idx]; |
| idy++; |
| } |
| } |
| volumes = fsvs; // replace array of volumes |
| DataNode.LOG.info("Completed FSVolumeSet.checkDirs. Removed " |
| + removed_vols.size() + " volumes. List of current volumes: " |
| + this); |
| } |
| |
| return removed_vols; |
| } |
| |
| public String toString() { |
| StringBuffer sb = new StringBuffer(); |
| for (int idx = 0; idx < volumes.length; idx++) { |
| sb.append(volumes[idx].toString()); |
| if (idx != volumes.length - 1) { sb.append(","); } |
| } |
| return sb.toString(); |
| } |
| |
| public boolean isValid(FSVolume volume) { |
| for (int idx = 0; idx < volumes.length; idx++) { |
| if (volumes[idx] == volume) { |
| return true; |
| } |
| } |
| return false; |
| } |
| } |
| |
| ////////////////////////////////////////////////////// |
| // |
| // FSDataSet |
| // |
| ////////////////////////////////////////////////////// |
| |
//Find better place?
/** Suffix that ends the name of a block's metadata file. */
public static final String METADATA_EXTENSION = ".meta";
/** Version that the header of a metadata file is compared against. */
public static final short METADATA_VERSION = 1;
/** Suffix appended to a file name to form its unlink tmp file name. */
static final String UNLINK_BLOCK_SUFFIX = ".unlinked";
| |
| private static boolean isUnlinkTmpFile(File f) { |
| String name = f.getName(); |
| return name.endsWith(UNLINK_BLOCK_SUFFIX); |
| } |
| |
| static File getUnlinkTmpFile(File f) { |
| return new File(f.getParentFile(), f.getName()+UNLINK_BLOCK_SUFFIX); |
| } |
| |
| private static File getOrigFile(File unlinkTmpFile) { |
| String fileName = unlinkTmpFile.getName(); |
| return new File(unlinkTmpFile.getParentFile(), |
| fileName.substring(0, fileName.length()-UNLINK_BLOCK_SUFFIX.length())); |
| } |
| |
| static String getMetaFileName(String blockFileName, long genStamp) { |
| return blockFileName + "_" + genStamp + METADATA_EXTENSION; |
| } |
| |
| static File getMetaFile(File f , Block b) { |
| return new File(getMetaFileName(f.getAbsolutePath(), |
| b.getGenerationStamp())); |
| } |
| protected File getMetaFile(Block b) throws IOException { |
| return getMetaFile(getBlockFile(b), b); |
| } |
| |
| /** Find the metadata file for the specified block file. |
| * Return the generation stamp from the name of the metafile. |
| */ |
| private static long getGenerationStampFromFile(File[] listdir, File blockFile) { |
| String blockName = blockFile.getName(); |
| for (int j = 0; j < listdir.length; j++) { |
| String path = listdir[j].getName(); |
| if (!path.startsWith(blockName)) { |
| continue; |
| } |
| if (blockFile == listdir[j]) { |
| continue; |
| } |
| return Block.getGenerationStamp(listdir[j].getName()); |
| } |
| DataNode.LOG.warn("Block " + blockFile + |
| " does not have a metafile!"); |
| return GenerationStamp.GRANDFATHER_GENERATION_STAMP; |
| } |
| |
| /** Find the corresponding meta data file from a given block file */ |
| private static File findMetaFile(final File blockFile) throws IOException { |
| final String prefix = blockFile.getName() + "_"; |
| final File parent = blockFile.getParentFile(); |
| File[] matches = parent.listFiles(new FilenameFilter() { |
| public boolean accept(File dir, String name) { |
| return dir.equals(parent) |
| && name.startsWith(prefix) && name.endsWith(METADATA_EXTENSION); |
| } |
| }); |
| |
| if (matches == null || matches.length == 0) { |
| throw new IOException("Meta file not found, blockFile=" + blockFile); |
| } |
| else if (matches.length > 1) { |
| throw new IOException("Found more than one meta files: " |
| + Arrays.asList(matches)); |
| } |
| return matches[0]; |
| } |
| |
| /** Find the corresponding meta data file from a given block file */ |
| private static long parseGenerationStamp(File blockFile, File metaFile |
| ) throws IOException { |
| String metaname = metaFile.getName(); |
| String gs = metaname.substring(blockFile.getName().length() + 1, |
| metaname.length() - METADATA_EXTENSION.length()); |
| try { |
| return Long.parseLong(gs); |
| } catch(NumberFormatException nfe) { |
| throw (IOException)new IOException("blockFile=" + blockFile |
| + ", metaFile=" + metaFile).initCause(nfe); |
| } |
| } |
| |
| /** Return the block file for the given ID */ |
| public File findBlockFile(long blockId) { |
| return getFile(blockId); |
| } |
| |
| /** {@inheritDoc} */ |
| public synchronized Block getStoredBlock(long blkid) throws IOException { |
| File blockfile = findBlockFile(blkid); |
| if (blockfile == null) { |
| return null; |
| } |
| File metafile = findMetaFile(blockfile); |
| return new Block(blkid, blockfile.length(), |
| parseGenerationStamp(blockfile, metafile)); |
| } |
| |
| /** |
| * Returns a clone of a replica stored in data-node memory. |
| * Should be primarily used for testing. |
| * @param blockId |
| * @return |
| */ |
| synchronized ReplicaInfo fetchReplicaInfo(long blockId) { |
| ReplicaInfo r = volumeMap.get(blockId); |
| if(r == null) |
| return null; |
| switch(r.getState()) { |
| case FINALIZED: |
| return new FinalizedReplica((FinalizedReplica)r); |
| case RBW: |
| return new ReplicaBeingWritten((ReplicaBeingWritten)r); |
| case RWR: |
| return new ReplicaWaitingToBeRecovered((ReplicaWaitingToBeRecovered)r); |
| case RUR: |
| return new ReplicaUnderRecovery((ReplicaUnderRecovery)r); |
| case TEMPORARY: |
| return new ReplicaInPipeline((ReplicaInPipeline)r); |
| } |
| return null; |
| } |
| |
| public boolean metaFileExists(Block b) throws IOException { |
| return getMetaFile(b).exists(); |
| } |
| |
| public long getMetaDataLength(Block b) throws IOException { |
| File checksumFile = getMetaFile( b ); |
| return checksumFile.length(); |
| } |
| |
| public MetaDataInputStream getMetaDataInputStream(Block b) |
| throws IOException { |
| File checksumFile = getMetaFile( b ); |
| return new MetaDataInputStream(new FileInputStream(checksumFile), |
| checksumFile.length()); |
| } |
| |
| static File createTmpFile(Block b, File f) throws IOException { |
| if (f.exists()) { |
| throw new IOException("Unexpected problem in creating temporary file for "+ |
| b + ". File " + f + " should not be present, but is."); |
| } |
| // Create the zero-length temp file |
| // |
| boolean fileCreated = false; |
| try { |
| fileCreated = f.createNewFile(); |
| } catch (IOException ioe) { |
| throw (IOException)new IOException(DISK_ERROR +f).initCause(ioe); |
| } |
| if (!fileCreated) { |
| throw new IOException("Unexpected problem in creating temporary file for "+ |
| b + ". File " + f + " should be creatable, but is already present."); |
| } |
| return f; |
| } |
| |
/** The set of volumes this dataset stores blocks on. */
FSVolumeSet volumes;
/**
 * Limit on block files held directly in one directory, read from
 * "dfs.datanode.numblocks"; also the number of subdirectories created
 * when a full directory needs children.
 */
private int maxBlocksPerDir = 0;
/** In-memory map of replicas, filled from the volumes at construction. */
ReplicasMap volumeMap = new ReplicasMap();
/** Random source used by FSDir to choose a subdirectory. */
static Random random = new Random();
// NOTE(review): presumably performs disk operations asynchronously,
// going by the type name -- confirm against FSDatasetAsyncDiskService.
FSDatasetAsyncDiskService asyncDiskService;

// Used for synchronizing access to usage stats
private Object statsLock = new Object();

/**
 * Read from "dfs.support.append". When false, an existing rbw directory
 * is deleted while a volume is being set up.
 */
boolean supportAppends = false;
| |
| /** |
| * An FSDataset has a directory where it loads its data files. |
| */ |
| public FSDataset(DataStorage storage, Configuration conf) throws IOException { |
| this.maxBlocksPerDir = conf.getInt("dfs.datanode.numblocks", 64); |
| this.supportAppends = conf.getBoolean("dfs.support.append", false); |
| FSVolume[] volArray = new FSVolume[storage.getNumStorageDirs()]; |
| for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) { |
| volArray[idx] = new FSVolume(storage.getStorageDir(idx).getCurrentDir(), conf); |
| } |
| volumes = new FSVolumeSet(volArray); |
| volumes.getVolumeMap(volumeMap); |
| File[] roots = new File[storage.getNumStorageDirs()]; |
| for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) { |
| roots[idx] = storage.getStorageDir(idx).getCurrentDir(); |
| } |
| asyncDiskService = new FSDatasetAsyncDiskService(roots); |
| registerMBean(storage.getStorageID()); |
| } |
| |
| /** |
| * Return the total space used by dfs datanode |
| */ |
| public long getDfsUsed() throws IOException { |
| synchronized(statsLock) { |
| return volumes.getDfsUsed(); |
| } |
| } |
| /** |
| * Return true - if there are still valid volumes |
| * on the DataNode |
| */ |
| public boolean hasEnoughResource(){ |
| return volumes.numberOfVolumes() >= MIN_NUM_OF_VALID_VOLUMES; |
| } |
| |
| /** |
| * Return total capacity, used and unused |
| */ |
| public long getCapacity() throws IOException { |
| synchronized(statsLock) { |
| return volumes.getCapacity(); |
| } |
| } |
| |
| /** |
| * Return how many bytes can still be stored in the FSDataset |
| */ |
| public long getRemaining() throws IOException { |
| synchronized(statsLock) { |
| return volumes.getRemaining(); |
| } |
| } |
| |
| /** |
| * Find the block's on-disk length |
| */ |
| public long getLength(Block b) throws IOException { |
| return getBlockFile(b).length(); |
| } |
| |
| /** |
| * Get File name for a given block. |
| */ |
| public synchronized File getBlockFile(Block b) throws IOException { |
| File f = validateBlockFile(b); |
| if(f == null) { |
| if (InterDatanodeProtocol.LOG.isDebugEnabled()) { |
| InterDatanodeProtocol.LOG.debug("b=" + b + ", volumeMap=" + volumeMap); |
| } |
| throw new IOException("Block " + b + " is not valid."); |
| } |
| return f; |
| } |
| |
| public synchronized InputStream getBlockInputStream(Block b) throws IOException { |
| return new FileInputStream(getBlockFile(b)); |
| } |
| |
| public synchronized InputStream getBlockInputStream(Block b, long seekOffset) throws IOException { |
| |
| File blockFile = getBlockFile(b); |
| RandomAccessFile blockInFile = new RandomAccessFile(blockFile, "r"); |
| if (seekOffset > 0) { |
| blockInFile.seek(seekOffset); |
| } |
| return new FileInputStream(blockInFile.getFD()); |
| } |
| |
| /** |
| * Get the meta info of a block stored in volumeMap |
| * @param b block |
| * @return the meta replica information |
| * @throws IOException if no entry is in the map or |
| * there is a generation stamp mismatch |
| */ |
| private ReplicaInfo getReplicaInfo(Block b) throws IOException { |
| ReplicaInfo info = volumeMap.get(b); |
| if (info == null) { |
| throw new IOException("Block " + b + " does not exist in volumeMap."); |
| } |
| return info; |
| } |
| |
| /** |
| * Returns handles to the block file and its metadata file |
| */ |
| public synchronized BlockInputStreams getTmpInputStreams(Block b, |
| long blkOffset, long ckoff) throws IOException { |
| |
| ReplicaInfo info = getReplicaInfo(b); |
| File blockFile = info.getBlockFile(); |
| RandomAccessFile blockInFile = new RandomAccessFile(blockFile, "r"); |
| if (blkOffset > 0) { |
| blockInFile.seek(blkOffset); |
| } |
| File metaFile = info.getMetaFile(); |
| RandomAccessFile metaInFile = new RandomAccessFile(metaFile, "r"); |
| if (ckoff > 0) { |
| metaInFile.seek(ckoff); |
| } |
| return new BlockInputStreams(new FileInputStream(blockInFile.getFD()), |
| new FileInputStream(metaInFile.getFD())); |
| } |
| |
| /** |
| * Make a copy of the block if this block is linked to an existing |
| * snapshot. This ensures that modifying this block does not modify |
| * data in any existing snapshots. |
| * @param block Block |
| * @param numLinks Unlink if the number of links exceed this value |
| * @throws IOException |
| * @return - true if the specified block was unlinked or the block |
| * is not in any snapshot. |
| */ |
| public boolean unlinkBlock(Block block, int numLinks) throws IOException { |
| ReplicaInfo info = null; |
| |
| synchronized (this) { |
| info = getReplicaInfo(block); |
| } |
| return info.unlinkBlock(numLinks); |
| } |
| |
| static private void truncateBlock(File blockFile, File metaFile, |
| long oldlen, long newlen) throws IOException { |
| DataNode.LOG.info("truncateBlock: blockFile=" + blockFile |
| + ", metaFile=" + metaFile |
| + ", oldlen=" + oldlen |
| + ", newlen=" + newlen); |
| |
| if (newlen == oldlen) { |
| return; |
| } |
| if (newlen > oldlen) { |
| throw new IOException("Cannout truncate block to from oldlen (=" + oldlen |
| + ") to newlen (=" + newlen + ")"); |
| } |
| |
| DataChecksum dcs = BlockMetadataHeader.readHeader(metaFile).getChecksum(); |
| int checksumsize = dcs.getChecksumSize(); |
| int bpc = dcs.getBytesPerChecksum(); |
| long n = (newlen - 1)/bpc + 1; |
| long newmetalen = BlockMetadataHeader.getHeaderSize() + n*checksumsize; |
| long lastchunkoffset = (n - 1)*bpc; |
| int lastchunksize = (int)(newlen - lastchunkoffset); |
| byte[] b = new byte[Math.max(lastchunksize, checksumsize)]; |
| |
| RandomAccessFile blockRAF = new RandomAccessFile(blockFile, "rw"); |
| try { |
| //truncate blockFile |
| blockRAF.setLength(newlen); |
| |
| //read last chunk |
| blockRAF.seek(lastchunkoffset); |
| blockRAF.readFully(b, 0, lastchunksize); |
| } finally { |
| blockRAF.close(); |
| } |
| |
| //compute checksum |
| dcs.update(b, 0, lastchunksize); |
| dcs.writeValue(b, 0, false); |
| |
| //update metaFile |
| RandomAccessFile metaRAF = new RandomAccessFile(metaFile, "rw"); |
| try { |
| metaRAF.setLength(newmetalen); |
| metaRAF.seek(newmetalen - checksumsize); |
| metaRAF.write(b, 0, checksumsize); |
| } finally { |
| metaRAF.close(); |
| } |
| } |
| |
| private final static String DISK_ERROR = "Possible disk error on file creation: "; |
| /** Get the cause of an I/O exception if caused by a possible disk error |
| * @param ioe an I/O exception |
| * @return cause if the I/O exception is caused by a possible disk error; |
| * null otherwise. |
| */ |
| static IOException getCauseIfDiskError(IOException ioe) { |
| if (ioe.getMessage()!=null && ioe.getMessage().startsWith(DISK_ERROR)) { |
| return (IOException)ioe.getCause(); |
| } else { |
| return null; |
| } |
| } |
| |
| @Override // FSDatasetInterface |
| public synchronized ReplicaInPipelineInterface append(Block b, |
| long newGS, long expectedBlockLen) throws IOException { |
| // If the block was successfully finalized because all packets |
| // were successfully processed at the Datanode but the ack for |
| // some of the packets were not received by the client. The client |
| // re-opens the connection and retries sending those packets. |
| // The other reason is that an "append" is occurring to this block. |
| |
| // check the validity of the parameter |
| if (newGS < b.getGenerationStamp()) { |
| throw new IOException("The new generation stamp " + newGS + |
| " should be greater than the replica " + b + "'s generation stamp"); |
| } |
| ReplicaInfo replicaInfo = volumeMap.get(b); |
| if (replicaInfo == null) { |
| throw new ReplicaNotFoundException( |
| ReplicaNotFoundException.NON_EXISTENT_REPLICA + b); |
| } |
| DataNode.LOG.info("Appending to replica " + replicaInfo); |
| if (replicaInfo.getState() != ReplicaState.FINALIZED) { |
| throw new ReplicaNotFoundException( |
| ReplicaNotFoundException.UNFINALIZED_REPLICA + b); |
| } |
| if (replicaInfo.getNumBytes() != expectedBlockLen) { |
| throw new IOException("Corrupted replica " + replicaInfo + |
| " with a length of " + replicaInfo.getNumBytes() + |
| " expected length is " + expectedBlockLen); |
| } |
| |
| return append((FinalizedReplica)replicaInfo, newGS, b.getNumBytes()); |
| } |
| |
| /** Append to a finalized replica |
| * Change a finalized replica to be a RBW replica and |
| * bump its generation stamp to be the newGS |
| * |
| * @param replicaInfo a finalized replica |
| * @param newGS new generation stamp |
| * @param estimateBlockLen estimate generation stamp |
| * @return a RBW replica |
| * @throws IOException if moving the replica from finalized directory |
| * to rbw directory fails |
| */ |
| private synchronized ReplicaBeingWritten append(FinalizedReplica replicaInfo, |
| long newGS, long estimateBlockLen) throws IOException { |
| // unlink the finalized replica |
| replicaInfo.unlinkBlock(1); |
| |
| // construct a RBW replica with the new GS |
| File blkfile = replicaInfo.getBlockFile(); |
| FSVolume v = volumes.getNextVolume(estimateBlockLen); |
| File newBlkFile = new File(v.rbwDir, replicaInfo.getBlockName()); |
| File oldmeta = replicaInfo.getMetaFile(); |
| ReplicaBeingWritten newReplicaInfo = new ReplicaBeingWritten( |
| replicaInfo.getBlockId(), replicaInfo.getNumBytes(), newGS, |
| v, newBlkFile.getParentFile(), Thread.currentThread()); |
| File newmeta = newReplicaInfo.getMetaFile(); |
| |
| // rename meta file to rbw directory |
| if (DataNode.LOG.isDebugEnabled()) { |
| DataNode.LOG.debug("Renaming " + oldmeta + " to " + newmeta); |
| } |
| if (!oldmeta.renameTo(newmeta)) { |
| throw new IOException("Block " + replicaInfo + " reopen failed. " + |
| " Unable to move meta file " + oldmeta + |
| " to rbw dir " + newmeta); |
| } |
| |
| // rename block file to rbw directory |
| if (DataNode.LOG.isDebugEnabled()) { |
| DataNode.LOG.debug("Renaming " + blkfile + " to " + newBlkFile); |
| DataNode.LOG.debug("Old block file length is " + blkfile.length()); |
| } |
| if (!blkfile.renameTo(newBlkFile)) { |
| if (!newmeta.renameTo(oldmeta)) { // restore the meta file |
| DataNode.LOG.warn("Cannot move meta file " + newmeta + |
| "back to the finalized directory " + oldmeta); |
| } |
| throw new IOException("Block " + replicaInfo + " reopen failed. " + |
| " Unable to move block file " + blkfile + |
| " to rbw dir " + newBlkFile); |
| } |
| |
| // Replace finalized replica by a RBW replica in replicas map |
| volumeMap.add(newReplicaInfo); |
| |
| return newReplicaInfo; |
| } |
| |
| private ReplicaInfo recoverCheck(Block b, long newGS, |
| long expectedBlockLen) throws IOException { |
| ReplicaInfo replicaInfo = volumeMap.get(b.getBlockId()); |
| if (replicaInfo == null) { |
| throw new ReplicaNotFoundException( |
| ReplicaNotFoundException.NON_EXISTENT_REPLICA + b); |
| } |
| |
| // check state |
| if (replicaInfo.getState() != ReplicaState.FINALIZED && |
| replicaInfo.getState() != ReplicaState.RBW) { |
| throw new ReplicaNotFoundException( |
| ReplicaNotFoundException.UNFINALIZED_AND_NONRBW_REPLICA + replicaInfo); |
| } |
| |
| // check generation stamp |
| long replicaGenerationStamp = replicaInfo.getGenerationStamp(); |
| if (replicaGenerationStamp < b.getGenerationStamp() || |
| replicaGenerationStamp > newGS) { |
| throw new ReplicaNotFoundException( |
| ReplicaNotFoundException.UNEXPECTED_GS_REPLICA + replicaGenerationStamp |
| + ". Expected GS range is [" + b.getGenerationStamp() + ", " + |
| newGS + "]."); |
| } |
| |
| // stop the previous writer before check a replica's length |
| long replicaLen = replicaInfo.getNumBytes(); |
| if (replicaInfo.getState() == ReplicaState.RBW) { |
| ReplicaBeingWritten rbw = (ReplicaBeingWritten)replicaInfo; |
| // kill the previous writer |
| rbw.stopWriter(); |
| rbw.setWriter(Thread.currentThread()); |
| // check length: bytesRcvd, bytesOnDisk, and bytesAcked should be the same |
| if (replicaLen != rbw.getBytesOnDisk() |
| || replicaLen != rbw.getBytesAcked()) { |
| throw new ReplicaAlreadyExistsException("RBW replica " + replicaInfo + |
| "bytesRcvd(" + rbw.getNumBytes() + "), bytesOnDisk(" + |
| rbw.getBytesOnDisk() + "), and bytesAcked(" + rbw.getBytesAcked() + |
| ") are not the same."); |
| } |
| } |
| |
| // check block length |
| if (replicaLen != expectedBlockLen) { |
| throw new IOException("Corrupted replica " + replicaInfo + |
| " with a length of " + replicaLen + |
| " expected length is " + expectedBlockLen); |
| } |
| |
| return replicaInfo; |
| } |
| @Override // FSDatasetInterface |
| public synchronized ReplicaInPipelineInterface recoverAppend(Block b, |
| long newGS, long expectedBlockLen) throws IOException { |
| DataNode.LOG.info("Recover failed append to " + b); |
| |
| ReplicaInfo replicaInfo = recoverCheck(b, newGS, expectedBlockLen); |
| |
| // change the replica's state/gs etc. |
| if (replicaInfo.getState() == ReplicaState.FINALIZED ) { |
| return append((FinalizedReplica)replicaInfo, newGS, b.getNumBytes()); |
| } else { //RBW |
| bumpReplicaGS(replicaInfo, newGS); |
| return (ReplicaBeingWritten)replicaInfo; |
| } |
| } |
| |
| @Override |
| public void recoverClose(Block b, long newGS, |
| long expectedBlockLen) throws IOException { |
| DataNode.LOG.info("Recover failed close " + b); |
| // check replica's state |
| ReplicaInfo replicaInfo = recoverCheck(b, newGS, expectedBlockLen); |
| // bump the replica's GS |
| bumpReplicaGS(replicaInfo, newGS); |
| // finalize the replica if RBW |
| if (replicaInfo.getState() == ReplicaState.RBW) { |
| finalizeBlock(replicaInfo); |
| } |
| } |
| |
| /** |
| * Bump a replica's generation stamp to a new one. |
| * Its on-disk meta file name is renamed to be the new one too. |
| * |
| * @param replicaInfo a replica |
| * @param newGS new generation stamp |
| * @throws IOException if rename fails |
| */ |
| private void bumpReplicaGS(ReplicaInfo replicaInfo, |
| long newGS) throws IOException { |
| long oldGS = replicaInfo.getGenerationStamp(); |
| File oldmeta = replicaInfo.getMetaFile(); |
| replicaInfo.setGenerationStamp(newGS); |
| File newmeta = replicaInfo.getMetaFile(); |
| |
| // rename meta file to new GS |
| if (DataNode.LOG.isDebugEnabled()) { |
| DataNode.LOG.debug("Renaming " + oldmeta + " to " + newmeta); |
| } |
| if (!oldmeta.renameTo(newmeta)) { |
| replicaInfo.setGenerationStamp(oldGS); // restore old GS |
| throw new IOException("Block " + (Block)replicaInfo + " reopen failed. " + |
| " Unable to move meta file " + oldmeta + |
| " to " + newmeta); |
| } |
| } |
| |
| @Override |
| public synchronized ReplicaInPipelineInterface createRbw(Block b) |
| throws IOException { |
| ReplicaInfo replicaInfo = volumeMap.get(b.getBlockId()); |
| if (replicaInfo != null) { |
| throw new ReplicaAlreadyExistsException("Block " + b + |
| " already exists in state " + replicaInfo.getState() + |
| " and thus cannot be created."); |
| } |
| // create a new block |
| FSVolume v = volumes.getNextVolume(b.getNumBytes()); |
| // create a rbw file to hold block in the designated volume |
| File f = v.createRbwFile(b); |
| ReplicaBeingWritten newReplicaInfo = new ReplicaBeingWritten(b.getBlockId(), |
| b.getGenerationStamp(), v, f.getParentFile()); |
| volumeMap.add(newReplicaInfo); |
| return newReplicaInfo; |
| } |
| |
| @Override |
| public synchronized ReplicaInPipelineInterface recoverRbw(Block b, |
| long newGS, long minBytesRcvd, long maxBytesRcvd) |
| throws IOException { |
| DataNode.LOG.info("Recover the RBW replica " + b); |
| |
| ReplicaInfo replicaInfo = volumeMap.get(b.getBlockId()); |
| if (replicaInfo == null) { |
| throw new ReplicaNotFoundException( |
| ReplicaNotFoundException.NON_EXISTENT_REPLICA + b); |
| } |
| |
| // check the replica's state |
| if (replicaInfo.getState() != ReplicaState.RBW) { |
| throw new ReplicaNotFoundException( |
| ReplicaNotFoundException.NON_RBW_REPLICA + replicaInfo); |
| } |
| ReplicaBeingWritten rbw = (ReplicaBeingWritten)replicaInfo; |
| |
| DataNode.LOG.info("Recovering replica " + rbw); |
| |
| // Stop the previous writer |
| rbw.stopWriter(); |
| rbw.setWriter(Thread.currentThread()); |
| |
| // check generation stamp |
| long replicaGenerationStamp = rbw.getGenerationStamp(); |
| if (replicaGenerationStamp < b.getGenerationStamp() || |
| replicaGenerationStamp > newGS) { |
| throw new ReplicaNotFoundException( |
| ReplicaNotFoundException.UNEXPECTED_GS_REPLICA + b + |
| ". Expected GS range is [" + b.getGenerationStamp() + ", " + |
| newGS + "]."); |
| } |
| |
| // check replica length |
| if (rbw.getBytesAcked() < minBytesRcvd || rbw.getNumBytes() > maxBytesRcvd){ |
| throw new ReplicaNotFoundException("Unmatched length replica " + |
| replicaInfo + ": BytesAcked = " + rbw.getBytesAcked() + |
| " BytesRcvd = " + rbw.getNumBytes() + " are not in the range of [" + |
| minBytesRcvd + ", " + maxBytesRcvd + "]."); |
| } |
| |
| // bump the replica's generation stamp to newGS |
| bumpReplicaGS(rbw, newGS); |
| |
| return rbw; |
| } |
| |
| @Override |
| public synchronized ReplicaInPipelineInterface createTemporary(Block b) |
| throws IOException { |
| ReplicaInfo replicaInfo = volumeMap.get(b.getBlockId()); |
| if (replicaInfo != null) { |
| throw new ReplicaAlreadyExistsException("Block " + b + |
| " already exists in state " + replicaInfo.getState() + |
| " and thus cannot be created."); |
| } |
| |
| FSVolume v = volumes.getNextVolume(b.getNumBytes()); |
| // create a temporary file to hold block in the designated volume |
| File f = v.createTmpFile(b); |
| ReplicaInPipeline newReplicaInfo = new ReplicaInPipeline(b.getBlockId(), |
| b.getGenerationStamp(), v, f.getParentFile()); |
| volumeMap.add(newReplicaInfo); |
| |
| return newReplicaInfo; |
| } |
| |
| /** |
| * Sets the offset in the meta file so that the |
| * last checksum will be overwritten. |
| */ |
| public void adjustCrcChannelPosition(Block b, BlockWriteStreams streams, |
| int checksumSize) throws IOException { |
| FileOutputStream file = (FileOutputStream) streams.checksumOut; |
| FileChannel channel = file.getChannel(); |
| long oldPos = channel.position(); |
| long newPos = oldPos - checksumSize; |
| DataNode.LOG.info("Changing meta file offset of block " + b + " from " + |
| oldPos + " to " + newPos); |
| channel.position(newPos); |
| } |
| |
| synchronized File createTmpFile( FSVolume vol, Block blk ) throws IOException { |
| if ( vol == null ) { |
| vol = getReplicaInfo( blk ).getVolume(); |
| if ( vol == null ) { |
| throw new IOException("Could not find volume for block " + blk); |
| } |
| } |
| return vol.createTmpFile(blk); |
| } |
| |
| // |
| // REMIND - mjc - eventually we should have a timeout system |
| // in place to clean up block files left by abandoned clients. |
| // We should have some timer in place, so that if a blockfile |
| // is created but non-valid, and has been idle for >48 hours, |
| // we can GC it safely. |
| // |
| |
| /** |
| * Complete the block write! |
| */ |
| public synchronized void finalizeBlock(Block b) throws IOException { |
| ReplicaInfo replicaInfo = getReplicaInfo(b); |
| if (replicaInfo.getState() == ReplicaState.FINALIZED) { |
| // this is legal, when recovery happens on a file that has |
| // been opened for append but never modified |
| return; |
| } |
| finalizeReplica(replicaInfo); |
| } |
| |
| private synchronized FinalizedReplica finalizeReplica(ReplicaInfo replicaInfo) |
| throws IOException { |
| FinalizedReplica newReplicaInfo = null; |
| if (replicaInfo.getState() == ReplicaState.RUR && |
| ((ReplicaUnderRecovery)replicaInfo).getOrignalReplicaState() == |
| ReplicaState.FINALIZED) { |
| newReplicaInfo = (FinalizedReplica) |
| ((ReplicaUnderRecovery)replicaInfo).getOriginalReplica(); |
| } else { |
| FSVolume v = replicaInfo.getVolume(); |
| File f = replicaInfo.getBlockFile(); |
| if (v == null) { |
| throw new IOException("No volume for temporary file " + f + |
| " for block " + replicaInfo); |
| } |
| |
| File dest = v.addBlock(replicaInfo, f); |
| newReplicaInfo = new FinalizedReplica(replicaInfo, v, dest.getParentFile()); |
| } |
| volumeMap.add(newReplicaInfo); |
| return newReplicaInfo; |
| } |
| |
| /** |
| * Remove the temporary block file (if any) |
| */ |
| public synchronized void unfinalizeBlock(Block b) throws IOException { |
| ReplicaInfo replicaInfo = volumeMap.get(b); |
| if (replicaInfo != null && replicaInfo.getState() == ReplicaState.TEMPORARY) { |
| // remove from volumeMap |
| volumeMap.remove(b); |
| |
| // delete the on-disk temp file |
| if (delBlockFromDisk(replicaInfo.getBlockFile(), |
| replicaInfo.getMetaFile(), b)) { |
| DataNode.LOG.warn("Block " + b + " unfinalized and removed. " ); |
| } |
| } |
| } |
| |
| /** |
| * Remove a block from disk |
| * @param blockFile block file |
| * @param metaFile block meta file |
| * @param b a block |
| * @return true if on-disk files are deleted; false otherwise |
| */ |
| private boolean delBlockFromDisk(File blockFile, File metaFile, Block b) { |
| if (blockFile == null) { |
| DataNode.LOG.warn("No file exists for block: " + b); |
| return true; |
| } |
| |
| if (!blockFile.delete()) { |
| DataNode.LOG.warn("Not able to delete the block file: " + blockFile); |
| return false; |
| } else { // remove the meta file |
| if (metaFile != null && !metaFile.delete()) { |
| DataNode.LOG.warn( |
| "Not able to delete the meta block file: " + metaFile); |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Generates a block report from the in-memory block map. |
| */ |
| public BlockListAsLongs getBlockReport() { |
| ArrayList<ReplicaInfo> finalized = |
| new ArrayList<ReplicaInfo>(volumeMap.size()); |
| ArrayList<ReplicaInfo> uc = new ArrayList<ReplicaInfo>(); |
| synchronized(this) { |
| for (ReplicaInfo b : volumeMap.replicas()) { |
| switch(b.getState()) { |
| case FINALIZED: |
| finalized.add(b); |
| break; |
| case RBW: |
| case RWR: |
| uc.add(b); |
| break; |
| case RUR: |
| ReplicaUnderRecovery rur = (ReplicaUnderRecovery)b; |
| uc.add(rur.getOriginalReplica()); |
| break; |
| case TEMPORARY: |
| break; |
| default: |
| assert false : "Illegal ReplicaInfo state."; |
| } |
| } |
| return new BlockListAsLongs(finalized, uc); |
| } |
| } |
| |
| /** |
| * Get the block list from in-memory blockmap. Note if <deepcopy> |
| * is false, reference to the block in the volumeMap is returned. This block |
| * should not be changed. Suitable synchronization using {@link FSDataset} |
| * is needed to handle concurrent modification to the block. |
| */ |
| synchronized Block[] getBlockList(boolean deepcopy) { |
| Block[] list = volumeMap.replicas().toArray(new Block[volumeMap.size()]); |
| if (deepcopy) { |
| for (int i = 0; i < list.length; i++) { |
| list[i] = new Block(list[i]); |
| } |
| } |
| return list; |
| } |
| |
| /** |
| * Get the list of finalized blocks from in-memory blockmap. |
| */ |
| synchronized List<Block> getFinalizedBlocks() { |
| ArrayList<Block> finalized = new ArrayList<Block>(volumeMap.size()); |
| for (ReplicaInfo b : volumeMap.replicas()) { |
| if(b.getState() == ReplicaState.FINALIZED) { |
| finalized.add(new Block(b)); |
| } |
| } |
| return finalized; |
| } |
| |
| /** |
| * Check whether the given block is a valid one. |
| * valid means finalized |
| */ |
| public boolean isValidBlock(Block b) { |
| ReplicaInfo replicaInfo = volumeMap.get(b); |
| if (replicaInfo == null || |
| replicaInfo.getState() != ReplicaState.FINALIZED) { |
| return false; |
| } |
| return replicaInfo.getBlockFile().exists(); |
| } |
| |
| /** |
| * Find the file corresponding to the block and return it if it exists. |
| */ |
| File validateBlockFile(Block b) throws IOException { |
| //Should we check for metadata file too? |
| File f = getFile(b); |
| |
| if(f != null ) { |
| if(f.exists()) |
| return f; |
| |
| // if file is not null, but doesn't exist - possibly disk failed |
| DataNode datanode = DataNode.getDataNode(); |
| datanode.checkDiskError(); |
| } |
| |
| if (InterDatanodeProtocol.LOG.isDebugEnabled()) { |
| InterDatanodeProtocol.LOG.debug("b=" + b + ", f=" + f); |
| } |
| return null; |
| } |
| |
| /** Check the files of a replica. */ |
| static void checkReplicaFiles(final ReplicaInfo r) throws IOException { |
| //check replica's file |
| final File f = r.getBlockFile(); |
| if (!f.exists()) { |
| throw new FileNotFoundException("File " + f + " not found, r=" + r); |
| } |
| if (r.getBytesOnDisk() != f.length()) { |
| throw new IOException("File length mismatched. The length of " |
| + f + " is " + f.length() + " but r=" + r); |
| } |
| |
| //check replica's meta file |
| final File metafile = getMetaFile(f, r); |
| if (!metafile.exists()) { |
| throw new IOException("Metafile " + metafile + " does not exist, r=" + r); |
| } |
| if (metafile.length() == 0) { |
| throw new IOException("Metafile " + metafile + " is empty, r=" + r); |
| } |
| } |
| |
| /** |
| * We're informed that a block is no longer valid. We |
| * could lazily garbage-collect the block, but why bother? |
| * just get rid of it. |
| */ |
| public void invalidate(Block invalidBlks[]) throws IOException { |
| boolean error = false; |
| for (int i = 0; i < invalidBlks.length; i++) { |
| File f = null; |
| FSVolume v; |
| synchronized (this) { |
| f = getFile(invalidBlks[i]); |
| ReplicaInfo dinfo = volumeMap.get(invalidBlks[i]); |
| if (dinfo == null || |
| dinfo.getGenerationStamp() != invalidBlks[i].getGenerationStamp()) { |
| DataNode.LOG.warn("Unexpected error trying to delete block " |
| + invalidBlks[i] + |
| ". BlockInfo not found in volumeMap."); |
| error = true; |
| continue; |
| } |
| v = dinfo.getVolume(); |
| if (f == null) { |
| DataNode.LOG.warn("Unexpected error trying to delete block " |
| + invalidBlks[i] + |
| ". Block not found in blockMap." + |
| ((v == null) ? " " : " Block found in volumeMap.")); |
| error = true; |
| continue; |
| } |
| if (v == null) { |
| DataNode.LOG.warn("Unexpected error trying to delete block " |
| + invalidBlks[i] + |
| ". No volume for this block." + |
| " Block found in blockMap. " + f + "."); |
| error = true; |
| continue; |
| } |
| File parent = f.getParentFile(); |
| if (parent == null) { |
| DataNode.LOG.warn("Unexpected error trying to delete block " |
| + invalidBlks[i] + |
| ". Parent not found for file " + f + "."); |
| error = true; |
| continue; |
| } |
| ReplicaState replicaState = dinfo.getState(); |
| if (replicaState == ReplicaState.FINALIZED || |
| (replicaState == ReplicaState.RUR && |
| ((ReplicaUnderRecovery)dinfo).getOrignalReplicaState() == |
| ReplicaState.FINALIZED)) { |
| v.clearPath(parent); |
| } |
| volumeMap.remove(invalidBlks[i]); |
| } |
| File metaFile = getMetaFile( f, invalidBlks[i] ); |
| long dfsBytes = f.length() + metaFile.length(); |
| |
| // Delete the block asynchronously to make sure we can do it fast enough |
| asyncDiskService.deleteAsync(v, f, metaFile, dfsBytes, invalidBlks[i].toString()); |
| } |
| if (error) { |
| throw new IOException("Error in deleting blocks."); |
| } |
| } |
| |
| /** |
| * Turn the block identifier into a filename; ignore generation stamp!!! |
| */ |
| public synchronized File getFile(Block b) { |
| return getFile(b.getBlockId()); |
| } |
| |
| /** |
| * Turn the block identifier into a filename |
| * @param blockId a block's id |
| * @return on disk data file path; null if the replica does not exist |
| */ |
| private File getFile(long blockId) { |
| ReplicaInfo info = volumeMap.get(blockId); |
| if (info != null) { |
| return info.getBlockFile(); |
| } |
| return null; |
| } |
| /** |
| * check if a data directory is healthy |
| * if some volumes failed - make sure to remove all the blocks that belong |
| * to these volumes |
| * @throws DiskErrorException |
| */ |
| public void checkDataDir() throws DiskErrorException { |
| long total_blocks=0, removed_blocks=0; |
| List<FSVolume> failed_vols = volumes.checkDirs(); |
| |
| //if there no failed volumes return |
| if(failed_vols == null) |
| return; |
| |
| // else |
| // remove related blocks |
| long mlsec = System.currentTimeMillis(); |
| synchronized (this) { |
| Iterator<ReplicaInfo> ib = volumeMap.replicas().iterator(); |
| while(ib.hasNext()) { |
| ReplicaInfo b = ib.next(); |
| total_blocks ++; |
| // check if the volume block belongs to still valid |
| FSVolume vol = b.getVolume(); |
| for(FSVolume fv: failed_vols) { |
| if(vol == fv) { |
| DataNode.LOG.warn("removing block " + b.getBlockId() + " from vol " |
| + vol.dataDir.dir.getAbsolutePath()); |
| ib.remove(); |
| removed_blocks++; |
| break; |
| } |
| } |
| } |
| } // end of sync |
| mlsec = System.currentTimeMillis() - mlsec; |
| DataNode.LOG.warn("Removed " + removed_blocks + " out of " + total_blocks + |
| "(took " + mlsec + " millisecs)"); |
| |
| // report the error |
| StringBuilder sb = new StringBuilder(); |
| for(FSVolume fv : failed_vols) { |
| sb.append(fv.dataDir.dir.getAbsolutePath() + ";"); |
| } |
| |
| throw new DiskErrorException("DataNode failed volumes:" + sb); |
| |
| } |
| |
| |
| public String toString() { |
| return "FSDataset{dirpath='"+volumes+"'}"; |
| } |
| |
| private ObjectName mbeanName; |
| private Random rand = new Random(); |
| |
| /** |
| * Register the FSDataset MBean using the name |
| * "hadoop:service=DataNode,name=FSDatasetState-<storageid>" |
| */ |
| void registerMBean(final String storageId) { |
| // We wrap to bypass standard mbean naming convetion. |
| // This wraping can be removed in java 6 as it is more flexible in |
| // package naming for mbeans and their impl. |
| StandardMBean bean; |
| String storageName; |
| if (storageId == null || storageId.equals("")) {// Temp fix for the uninitialized storage |
| storageName = "UndefinedStorageId" + rand.nextInt(); |
| } else { |
| storageName = storageId; |
| } |
| try { |
| bean = new StandardMBean(this,FSDatasetMBean.class); |
| mbeanName = MBeanUtil.registerMBean("DataNode", "FSDatasetState-" + storageName, bean); |
| } catch (NotCompliantMBeanException e) { |
| e.printStackTrace(); |
| } |
| |
| DataNode.LOG.info("Registered FSDatasetStatusMBean"); |
| } |
| |
| public void shutdown() { |
| if (mbeanName != null) |
| MBeanUtil.unregisterMBean(mbeanName); |
| |
| if (asyncDiskService != null) { |
| asyncDiskService.shutdown(); |
| } |
| |
| if(volumes != null) { |
| for (FSVolume volume : volumes.volumes) { |
| if(volume != null) { |
| volume.dfsUsage.shutdown(); |
| } |
| } |
| } |
| } |
| |
| public String getStorageInfo() { |
| return toString(); |
| } |
| |
| /** |
| * Reconcile the difference between blocks on the disk and blocks in |
| * volumeMap |
| * |
| * Check the given block for inconsistencies. Look at the |
| * current state of the block and reconcile the differences as follows: |
| * <ul> |
| * <li>If the block file is missing, delete the block from volumeMap</li> |
| * <li>If the block file exists and the block is missing in volumeMap, |
| * add the block to volumeMap <li> |
| * <li>If generation stamp does not match, then update the block with right |
| * generation stamp</li> |
| * <li>If the block length in memory does not match the actual block file length |
| * then mark the block as corrupt and update the block length in memory</li> |
| * <li>If the file in {@link ReplicaInfo} does not match the file on |
| * the disk, update {@link ReplicaInfo} with the correct file</li> |
| * </ul> |
| * |
| * @param blockId Block that differs |
| * @param diskFile Block file on the disk |
| * @param diskMetaFile Metadata file from on the disk |
| * @param vol Volume of the block file |
| */ |
| public void checkAndUpdate(long blockId, File diskFile, |
| File diskMetaFile, FSVolume vol) { |
| DataNode datanode = DataNode.getDataNode(); |
| Block corruptBlock = null; |
| ReplicaInfo memBlockInfo; |
| synchronized (this) { |
| memBlockInfo = volumeMap.get(blockId); |
| if (memBlockInfo != null && memBlockInfo.getState() != ReplicaState.FINALIZED) { |
| // Block is not finalized - ignore the difference |
| return; |
| } |
| |
| final long diskGS = diskMetaFile != null && diskMetaFile.exists() ? |
| Block.getGenerationStamp(diskMetaFile.getName()) : |
| GenerationStamp.GRANDFATHER_GENERATION_STAMP; |
| |
| if (diskFile == null || !diskFile.exists()) { |
| if (memBlockInfo == null) { |
| // Block file does not exist and block does not exist in memory |
| // If metadata file exists then delete it |
| if (diskMetaFile != null && diskMetaFile.exists() |
| && diskMetaFile.delete()) { |
| DataNode.LOG.warn("Deleted a metadata file without a block " |
| + diskMetaFile.getAbsolutePath()); |
| } |
| return; |
| } |
| if (!memBlockInfo.getBlockFile().exists()) { |
| // Block is in memory and not on the disk |
| // Remove the block from volumeMap |
| volumeMap.remove(blockId); |
| if (datanode.blockScanner != null) { |
| datanode.blockScanner.deleteBlock(new Block(blockId)); |
| } |
| DataNode.LOG.warn("Removed block " + blockId |
| + " from memory with missing block file on the disk"); |
| // Finally remove the metadata file |
| if (diskMetaFile != null && diskMetaFile.exists() |
| && diskMetaFile.delete()) { |
| DataNode.LOG.warn("Deleted a metadata file for the deleted block " |
| + diskMetaFile.getAbsolutePath()); |
| } |
| } |
| return; |
| } |
| /* |
| * Block file exists on the disk |
| */ |
| if (memBlockInfo == null) { |
| // Block is missing in memory - add the block to volumeMap |
| ReplicaInfo diskBlockInfo = new FinalizedReplica(blockId, |
| diskFile.length(), diskGS, vol, diskFile.getParentFile()); |
| volumeMap.add(diskBlockInfo); |
| if (datanode.blockScanner != null) { |
| datanode.blockScanner.addBlock(diskBlockInfo); |
| } |
| DataNode.LOG.warn("Added missing block to memory " + (Block)diskBlockInfo); |
| return; |
| } |
| /* |
| * Block exists in volumeMap and the block file exists on the disk |
| */ |
| // Compare block files |
| File memFile = memBlockInfo.getBlockFile(); |
| if (memFile.exists()) { |
| if (memFile.compareTo(diskFile) != 0) { |
| DataNode.LOG.warn("Block file " + memFile.getAbsolutePath() |
| + " does not match file found by scan " |
| + diskFile.getAbsolutePath()); |
| // TODO: Should the diskFile be deleted? |
| } |
| } else { |
| // Block refers to a block file that does not exist. |
| // Update the block with the file found on the disk. Since the block |
| // file and metadata file are found as a pair on the disk, update |
| // the block based on the metadata file found on the disk |
| DataNode.LOG.warn("Block file in volumeMap " |
| + memFile.getAbsolutePath() |
| + " does not exist. Updating it to the file found during scan " |
| + diskFile.getAbsolutePath()); |
| memBlockInfo.setDir(diskFile.getParentFile()); |
| memFile = diskFile; |
| |
| DataNode.LOG.warn("Updating generation stamp for block " + blockId |
| + " from " + memBlockInfo.getGenerationStamp() + " to " + diskGS); |
| memBlockInfo.setGenerationStamp(diskGS); |
| } |
| |
| // Compare generation stamp |
| if (memBlockInfo.getGenerationStamp() != diskGS) { |
| File memMetaFile = getMetaFile(diskFile, memBlockInfo); |
| if (memMetaFile.exists()) { |
| if (memMetaFile.compareTo(diskMetaFile) != 0) { |
| DataNode.LOG.warn("Metadata file in memory " |
| + memMetaFile.getAbsolutePath() |
| + " does not match file found by scan " |
| + diskMetaFile.getAbsolutePath()); |
| } |
| } else { |
| // Metadata file corresponding to block in memory is missing |
| // If metadata file found during the scan is on the same directory |
| // as the block file, then use the generation stamp from it |
| long gs = diskMetaFile != null && diskMetaFile.exists() |
| && diskMetaFile.getParent().equals(memFile.getParent()) ? diskGS |
| : GenerationStamp.GRANDFATHER_GENERATION_STAMP; |
| |
| DataNode.LOG.warn("Updating generation stamp for block " + blockId |
| + " from " + memBlockInfo.getGenerationStamp() + " to " + gs); |
| |
| memBlockInfo.setGenerationStamp(gs); |
| } |
| } |
| |
| // Compare block size |
| if (memBlockInfo.getNumBytes() != memFile.length()) { |
| // Update the length based on the block file |
| corruptBlock = new Block(memBlockInfo); |
| DataNode.LOG.warn("Updating size of block " + blockId + " from " |
| + memBlockInfo.getNumBytes() + " to " + memFile.length()); |
| memBlockInfo.setNumBytes(memFile.length()); |
| } |
| } |
| |
| // Send corrupt block report outside the lock |
| if (corruptBlock != null) { |
| DatanodeInfo[] dnArr = { new DatanodeInfo(datanode.dnRegistration) }; |
| LocatedBlock[] blocks = { new LocatedBlock(corruptBlock, dnArr) }; |
| try { |
| datanode.namenode.reportBadBlocks(blocks); |
| DataNode.LOG.warn("Reporting the block " + corruptBlock |
| + " as corrupt due to length mismatch"); |
| } catch (IOException e) { |
| DataNode.LOG.warn("Failed to repot bad block " + corruptBlock |
| + "Exception:" + StringUtils.stringifyException(e)); |
| } |
| } |
| } |
| |
| /** |
| * @deprecated use {@link #fetchReplicaInfo(long)} instead. |
| */ |
| @Override |
| @Deprecated |
| public ReplicaInfo getReplica(long blockId) { |
| assert(Thread.holdsLock(this)); |
| return volumeMap.get(blockId); |
| } |
| |
| /** |
| * Starts recovery for the replica described by {@code rBlock}. |
| * Runs under this object's monitor and hands off to the static overload, |
| * passing this dataset's volume map, the block carried by {@code rBlock} and |
| * its new generation stamp (used as the recovery ID). |
| * |
| * @param rBlock the block to recover together with its new generation stamp |
| * @return whatever the static overload returns |
| * @throws IOException propagated from the static overload |
| */ |
| @Override // FSDatasetInterface |
| public synchronized ReplicaRecoveryInfo initReplicaRecovery( |
| RecoveringBlock rBlock) throws IOException { |
| final ReplicasMap replicas = volumeMap; |
| final Block target = rBlock.getBlock(); |
| final long recoveryId = rBlock.getNewGenerationStamp(); |
| return initReplicaRecovery(replicas, target, recoveryId); |
| } |
| |
| /** |
| * Static counterpart of {@link #initReplicaRecovery(RecoveringBlock)} that |
| * works on an explicitly supplied replica map. |
| * <p> |
| * Steps, in order: look the replica up by block ID and return {@code null} |
| * when the map has none; if it is a {@link ReplicaInPipeline}, stop its |
| * writer, compare its bytes on disk with its visible length and check its |
| * files; reject the request when the replica's generation stamp is lower |
| * than the block's or is not strictly below the recovery ID; finally either |
| * raise the recovery ID of a replica that is already in the RUR state, or |
| * wrap the replica in a new {@link ReplicaUnderRecovery} and add it to the map. |
| * |
| * @param map replica map to consult and, when a new RUR is created, to update |
| * @param block block whose replica should be put under recovery |
| * @param recoveryId ID of this recovery attempt |
| * @return {@code createInfo()} of the replica under recovery, or {@code null} |
| * when the map holds no replica for the block |
| * @throws RecoveryInProgressException if the replica is already under |
| * recovery with a recovery ID that is not below {@code recoveryId} |
| * @throws IOException if one of the consistency checks fails |
| */ |
| static ReplicaRecoveryInfo initReplicaRecovery( |
| ReplicasMap map, Block block, long recoveryId) throws IOException { |
| final ReplicaInfo existing = map.get(block.getBlockId()); |
| DataNode.LOG.info("initReplicaRecovery: block=" + block |
| + ", recoveryId=" + recoveryId |
| + ", replica=" + existing); |
|
| // nothing to recover when the map does not know the block |
| if (existing == null) { |
| return null; |
| } |
|
| // a replica that is still being written: stop the writer, then verify it |
| if (existing instanceof ReplicaInPipeline) { |
| final ReplicaInPipeline inPipeline = (ReplicaInPipeline)existing; |
| inPipeline.stopWriter(); |
|
| final long onDisk = inPipeline.getBytesOnDisk(); |
| final long visible = inPipeline.getVisibleLength(); |
| if (onDisk < visible) { |
| throw new IOException("THIS IS NOT SUPPOSED TO HAPPEN:" |
| + " getBytesOnDisk() < getVisibleLength(), rip=" + inPipeline); |
| } |
|
| checkReplicaFiles(inPipeline); |
| } |
|
| final long replicaGS = existing.getGenerationStamp(); |
|
| // the replica must not be older than the block the caller asked about |
| if (replicaGS < block.getGenerationStamp()) { |
| throw new IOException( |
| "replica.getGenerationStamp() < block.getGenerationStamp(), block=" |
| + block + ", replica=" + existing); |
| } |
|
| // the recovery ID must be strictly greater than the replica's stamp |
| if (replicaGS >= recoveryId) { |
| throw new IOException("THIS IS NOT SUPPOSED TO HAPPEN:" |
| + " replica.getGenerationStamp() >= recoveryId = " + recoveryId |
| + ", block=" + block + ", replica=" + existing); |
| } |
|
| final ReplicaUnderRecovery underRecovery; |
| if (existing.getState() != ReplicaState.RUR) { |
| // first recovery attempt: replace the map entry with an RUR wrapper |
| underRecovery = new ReplicaUnderRecovery(existing, recoveryId); |
| map.add(underRecovery); |
| DataNode.LOG.info("initReplicaRecovery: changing replica state for " |
| + block + " from " + existing.getState() |
| + " to " + underRecovery.getState()); |
| } else { |
| // already under recovery: only a newer recovery attempt may take over |
| underRecovery = (ReplicaUnderRecovery)existing; |
| final long previousRecoveryId = underRecovery.getRecoveryID(); |
| if (previousRecoveryId >= recoveryId) { |
| throw new RecoveryInProgressException( |
| "rur.getRecoveryID() >= recoveryId = " + recoveryId |
| + ", block=" + block + ", rur=" + underRecovery); |
| } |
| underRecovery.setRecoveryID(recoveryId); |
| DataNode.LOG.info("initReplicaRecovery: update recovery id for " + block |
| + " from " + previousRecoveryId + " to " + recoveryId); |
| } |
| return underRecovery.createInfo(); |
| } |
| |
| /** |
| * Applies the outcome of a block recovery to the local replica. |
| * The replica is looked up by the ID of {@code oldBlock}; it must exist, be |
| * in the RUR state and have exactly {@code oldBlock.getNumBytes()} bytes on |
| * disk. Its files are checked before and after the private overload performs |
| * the actual update. |
| * |
| * @param oldBlock the block as known before the update |
| * @param recoveryId ID of the recovery being completed |
| * @param newlength length the replica should have after the update |
| * @return the finalized replica produced by the update |
| * @throws ReplicaNotFoundException if the volume map has no such replica |
| * @throws IOException if the replica is in the wrong state, its on-disk |
| * length does not match, or the update itself fails |
| */ |
| @Override // FSDatasetInterface |
| public synchronized ReplicaInfo updateReplicaUnderRecovery( |
| final Block oldBlock, |
| final long recoveryId, |
| final long newlength) throws IOException { |
| final ReplicaInfo current = volumeMap.get(oldBlock.getBlockId()); |
| DataNode.LOG.info("updateReplica: block=" + oldBlock |
| + ", recoveryId=" + recoveryId |
| + ", length=" + newlength |
| + ", replica=" + current); |
|
| // the replica has to be known to this dataset |
| if (current == null) { |
| throw new ReplicaNotFoundException(oldBlock); |
| } |
|
| // only a replica that is under recovery may be updated |
| if (ReplicaState.RUR != current.getState()) { |
| throw new IOException("replica.getState() != " + ReplicaState.RUR |
| + ", replica=" + current); |
| } |
|
| // the caller's view of the length must agree with what is on disk |
| final long bytesOnDisk = current.getBytesOnDisk(); |
| if (bytesOnDisk != oldBlock.getNumBytes()) { |
| throw new IOException("THIS IS NOT SUPPOSED TO HAPPEN:" |
| + " replica.getBytesOnDisk() != block.getNumBytes(), block=" |
| + oldBlock + ", replica=" + current); |
| } |
|
| // verify the files, perform the update, then verify the result's files |
| checkReplicaFiles(current); |
| final ReplicaUnderRecovery underRecovery = (ReplicaUnderRecovery)current; |
| final FinalizedReplica result = updateReplicaUnderRecovery( |
| underRecovery, recoveryId, newlength); |
| checkReplicaFiles(result); |
| return result; |
| } |
| |
| /** |
| * Performs the update of a replica under recovery and finalizes it. |
| * All argument checks run before any state is changed: the recovery ID must |
| * equal the one recorded in {@code rur}, and {@code newlength} must not |
| * exceed the replica's current length. Only then is the generation stamp |
| * bumped to {@code recoveryId}, the block truncated if it is longer than |
| * {@code newlength}, and the replica finalized. |
| * |
| * @param rur the replica under recovery to update |
| * @param recoveryId recovery ID the caller is completing; becomes the new |
| * generation stamp |
| * @param newlength target length; must be at most {@code rur.getNumBytes()} |
| * @return the replica returned by {@code finalizeReplica(rur)} |
| * @throws IOException if the recovery ID does not match, if |
| * {@code newlength} is larger than the current length, or if one |
| * of the update steps fails |
| */ |
| private FinalizedReplica updateReplicaUnderRecovery( |
| ReplicaUnderRecovery rur, |
| long recoveryId, |
| long newlength) throws IOException { |
| //check recovery id |
| if (rur.getRecoveryID() != recoveryId) { |
| throw new IOException("rur.getRecoveryID() != recoveryId = " + recoveryId |
| + ", rur=" + rur); |
| } |
|
| // Reject a too-large target length before the generation stamp is bumped, |
| // so that a request which is refused leaves the replica as it was. |
| // NOTE(review): assumes bumpReplicaGS does not change rur.getNumBytes() |
| // - confirm against its implementation. |
| if (rur.getNumBytes() < newlength) { |
| throw new IOException("rur.getNumBytes() < newlength = " + newlength |
| + ", rur=" + rur); |
| } |
|
| // bump rur's GS to be recovery id |
| bumpReplicaGS(rur, recoveryId); |
|
| //update length |
| final File replicafile = rur.getBlockFile(); |
| if (rur.getNumBytes() > newlength) { |
| rur.unlinkBlock(1); |
| truncateBlock(replicafile, rur.getMetaFile(), rur.getNumBytes(), newlength); |
| // update RUR with the new length |
| rur.setNumBytes(newlength); |
| } |
|
| // finalize the block |
| return finalizeReplica(rur); |
| } |
| |
| /** |
| * Reports the visible length of the replica stored for the given block. |
| * The replica is looked up in the volume map by block ID and is only |
| * accepted when its generation stamp is not lower than the block's. |
| * |
| * @param block block whose replica length is requested |
| * @return {@code getVisibleLength()} of the stored replica |
| * @throws ReplicaNotFoundException if the volume map has no replica for the |
| * block ID |
| * @throws IOException if the stored replica's generation stamp is lower than |
| * that of {@code block} |
| */ |
| @Override // FSDatasetInterface |
| public synchronized long getReplicaVisibleLength(final Block block) |
| throws IOException { |
| final Replica stored = volumeMap.get(block.getBlockId()); |
| if (stored == null) { |
| throw new ReplicaNotFoundException(block); |
| } |
|
| // a replica at least as new as the requested block can be answered directly |
| if (stored.getGenerationStamp() >= block.getGenerationStamp()) { |
| return stored.getVisibleLength(); |
| } |
|
| throw new IOException( |
| "replica.getGenerationStamp() < block.getGenerationStamp(), block=" |
| + block + ", replica=" + stored); |
| } |
| } |