I. Method flow for reading block data
Data is transferred over a socket connection between a Sender and a Receiver; both classes implement the DataTransferProtocol interface.
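For context, everything below is reached from an ordinary client read. A minimal sketch of the entry point, assuming a reachable cluster (the fs.defaultFS address and file path are hypothetical):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://namenode:8020"); // hypothetical address
    try (FileSystem fs = FileSystem.get(conf);
         FSDataInputStream in = fs.open(new Path("/tmp/example.txt"))) { // hypothetical path
      byte[] buf = new byte[4096];
      int n;
      while ((n = in.read(buf, 0, buf.length)) > 0) {
        // in.read(...) eventually delegates to DFSInputStream.read(byte[], int, int) below
        System.out.write(buf, 0, n);
      }
    }
  }
}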
II. The read methods
1. The read operation DFSInputStream.read():
/**
* Read the entire buffer.
*/
@Override
public synchronized int read(final byte buf[], int off, int len) throws IOException {
ReaderStrategy byteArrayReader = new ByteArrayStrategy(buf);
TraceScope scope =
dfsClient.getPathTraceScope("DFSInputStream#byteArrayRead", src);
try {
//Read using the ByteArrayStrategy
return readWithStrategy(byteArrayReader, off, len);
} finally {
scope.close();
}
}
2. Reading with the ByteArrayStrategy: DFSInputStream.readWithStrategy()
This method picks the best datanode and reads from it, retrying on IOException (the retries counter starts at 2, so at most two attempts are made before the exception is rethrown). Typical failure causes are that no suitable datanode is available (for example, all replicas' datanodes are down), or that the block data is missing and cannot be read.
private synchronized int readWithStrategy(ReaderStrategy strategy, int off, int len) throws IOException {
dfsClient.checkOpen();
if (closed.get()) {
throw new IOException("Stream closed");
}
Map<ExtendedBlock,Set<DatanodeInfo>> corruptedBlockMap
= new HashMap<ExtendedBlock, Set<DatanodeInfo>>();
failures = 0;
if (pos < getFileLength()) {
int retries = 2;
while (retries > 0) {
try {
// currentNode can be left as null if previous read had a checksum
// error on the same block. See HDFS-3067
if (pos > blockEnd || currentNode == null) {
// Based on the block containing the current position, choose the best datanode to read from,
// and determine whether the blockReader will do a local or a remote read
//1. Obtain the datanode to read the data from and the read mode (remote or local)
currentNode = blockSeekTo(pos);
}
int realLen = (int) Math.min(len, (blockEnd - pos + 1L));
synchronized(infoLock) {
if (locatedBlocks.isLastBlockComplete()) {
realLen = (int) Math.min(realLen,
locatedBlocks.getFileLength() - pos);
}
}
//2. Read the data from the datanode
int result = readBuffer(strategy, off, realLen, corruptedBlockMap);
if (result >= 0) {
pos += result;
} else {
// got a EOS from reader though we expect more data on it.
throw new IOException("Unexpected EOS from the reader");
}
if (dfsClient.stats != null) {
dfsClient.stats.incrementBytesRead(result);
}
return result;
} catch (ChecksumException ce) {
throw ce;
} catch (IOException e) {
if (retries == 1) {
DFSClient.LOG.warn("DFS Read", e);
}
blockEnd = -1;
if (currentNode != null) { addToDeadNodes(currentNode); }
if (--retries == 0) {
throw e;
}
} finally {
// Check if need to report block replicas corruption either read
// was successful or ChecksumException occured.
reportCheckSumFailure(corruptedBlockMap,
currentLocatedBlock.getLocations().length);
}
}
}
return -1;
}
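The realLen clamping above guarantees that a single readBuffer() call never crosses a block boundary. A small worked example of that arithmetic, using an assumed 128 MB block size and made-up offsets:

public class RealLenDemo {
  public static void main(String[] args) {
    // All numbers are assumptions chosen to illustrate the clamping in readWithStrategy().
    long blockSize = 128L * 1024 * 1024;   // 128 MB block
    long blockEnd = blockSize - 1;         // offset of the last byte of the current block
    long pos = blockSize - 100;            // only 100 bytes left in this block
    int len = 4096;                        // the caller asked for 4 KB
    int realLen = (int) Math.min(len, blockEnd - pos + 1);
    System.out.println(realLen);           // 100: the read stops at the block boundary;
                                           // the next read() call then triggers blockSeekTo() for the next block
  }
}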
III. Choosing the best datanode
1. DFSInputStream.blockSeekTo()
This method decides whether the read will be local or remote. It obtains the block (fetching it from the namenode if necessary) and then picks the best datanode from which to read that block. Building the blockReader can fail with an IOException, and the while loop may terminate when fetchBlockAt(target) throws.
When the read is remote, building the blockReader produces a RemoteBlockReader2, whose main job is to establish a socket connection from the client to the datanode. Two things come back over that connection: first, whether the datanode can serve the read at all; second, the block data itself, which the client processes in readNextPacket(). Sender and DataXceiver, which handle the socket request on the datanode side, are covered separately later.
/**
* Open a DataInputStream to a DataNode so that it can be read from.
* We get block ID and the IDs of the destinations at startup, from the namenode.
*/
private synchronized DatanodeInfo blockSeekTo(long target) throws IOException {
if (target >= getFileLength()) {
throw new IOException("Attempted to read past end of file");
}
// Will be getting a new BlockReader.
closeCurrentBlockReader();
//
// Connect to best DataNode for desired Block, with potential offset
//
DatanodeInfo chosenNode = null;
int refetchToken = 1; // only need to get a new access token once
int refetchEncryptionKey = 1; // only need to get a new encryption key once
boolean connectFailedOnce = false;
while (true) {
//
// Compute desired block
//1. Get the block from the locatedBlocks cache; if it is not cached, fetch it from the namenode via RPC
LocatedBlock targetBlock = getBlockAt(target);
// update current position
this.pos = target;
this.blockEnd = targetBlock.getStartOffset() +
targetBlock.getBlockSize() - 1;
this.currentLocatedBlock = targetBlock;
assert (target==pos) : "Wrong postion " + pos + " expect " + target;
long offsetIntoBlock = target - targetBlock.getStartOffset();
// 2. Choose the best datanode to read the block from
DNAddrPair retval = chooseDataNode(targetBlock, null);
chosenNode = retval.info;
InetSocketAddress targetAddr = retval.addr;
StorageType storageType = retval.storageType;
try {
ExtendedBlock blk = targetBlock.getBlock();
Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
CachingStrategy curCachingStrategy;
boolean shortCircuitForbidden;
synchronized(infoLock) {
curCachingStrategy = cachingStrategy;
shortCircuitForbidden = shortCircuitForbidden();
}
//Choose the read mode: local reads go straight to the block file, otherwise the read goes over TCP (BlockReaderLocal or RemoteBlockReader2)
blockReader = new BlockReaderFactory(dfsClient.getConf()).
setInetSocketAddress(targetAddr).
setRemotePeerFactory(dfsClient).
setDatanodeInfo(chosenNode). // the best datanode
setStorageType(storageType).
setFileName(src).
setBlock(blk).
setBlockToken(accessToken).
setStartOffset(offsetIntoBlock).
setVerifyChecksum(verifyChecksum).
setClientName(dfsClient.clientName).
setLength(blk.getNumBytes() - offsetIntoBlock).
setCachingStrategy(curCachingStrategy).
setAllowShortCircuitLocalReads(!shortCircuitForbidden).
setClientCacheContext(dfsClient.getClientContext()).
setUserGroupInformation(dfsClient.ugi).
setConfiguration(dfsClient.getConfiguration()).
build();
if(connectFailedOnce) {
DFSClient.LOG.info("Successfully connected to " + targetAddr +
" for " + blk);
}
return chosenNode;
} catch (IOException ex) {
if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
DFSClient.LOG.info("Will fetch a new encryption key and retry, "
+ "encryption key was invalid when connecting to " + targetAddr
+ " : " + ex);
// The encryption key used is invalid.
refetchEncryptionKey--;
dfsClient.clearDataEncryptionKey();
} else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) {
refetchToken--;
fetchBlockAt(target);
} else {
connectFailedOnce = true;
DFSClient.LOG.warn("Failed to connect to " + targetAddr + " for block"
+ ", add to deadNodes and continue. " + ex, ex);
// Put chosen node into dead list, continue
addToDeadNodes(chosenNode);
}
}
}
}
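Whether BlockReaderFactory.build() hands back a BlockReaderLocal (short-circuit local read) or a RemoteBlockReader2 depends on client configuration and on whether a replica lives on the local machine. A minimal sketch of the client-side settings involved, assuming Hadoop 2.x short-circuit reads (the domain socket path is a made-up example and must match the datanode's setting):

import org.apache.hadoop.conf.Configuration;

public class ShortCircuitConfigSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Enable short-circuit local reads on the client; the socket path is hypothetical.
    conf.setBoolean("dfs.client.read.shortcircuit", true);
    conf.set("dfs.domain.socket.path", "/var/lib/hadoop-hdfs/dn_socket");
    // With this in place, blockSeekTo() can obtain a BlockReaderLocal when the chosen
    // datanode is the local machine and shortCircuitForbidden() is false; otherwise it
    // falls back to RemoteBlockReader2 over TCP.
  }
}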
2. Getting the block to read: DFSInputStream.getBlockAt():
Gets the block from the locatedBlocks cache; if it is not in the cached list, it is fetched from the namenode via RPC.
/**
* Get block at the specified position.
* Fetch it from the namenode if not cached.
*
* @param offset block corresponding to this offset in file is returned
* @return located block
* @throws IOException
*/
private LocatedBlock getBlockAt(long offset) throws IOException {
synchronized(infoLock) {
assert (locatedBlocks != null) : "locatedBlocks is null";
final LocatedBlock blk;
//check offset
if (offset < 0 || offset >= getFileLength()) {
throw new IOException("offset < 0 || offset >= getFileLength(), offset="
+ offset
+ ", locatedBlocks=" + locatedBlocks);
}
else if (offset >= locatedBlocks.getFileLength()) {
// offset to the portion of the last block,
// which is not known to the name-node yet;
// getting the last block
blk = locatedBlocks.getLastLocatedBlock();
}
else {
// search cached blocks first
//look it up in the cached list first
int targetBlockIdx = locatedBlocks.findBlock(offset);
if (targetBlockIdx < 0) { // block is not cached
targetBlockIdx = LocatedBlocks.getInsertIndex(targetBlockIdx);
// fetch more blocks: not found in the cache, so ask the namenode
final LocatedBlocks newBlocks = dfsClient.getLocatedBlocks(src, offset);
assert (newBlocks != null) : "Could not find target position " + offset;
//insert the newly fetched blocks back into the cached block list
locatedBlocks.insertRange(targetBlockIdx, newBlocks.getLocatedBlocks());
}
blk = locatedBlocks.get(targetBlockIdx);
}
return blk;
}
}
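findBlock() performs a binary search over the cached block list, so a negative return value encodes an insertion point, and getInsertIndex() converts it back into a usable index. The convention is the same one java.util.Collections.binarySearch uses; a tiny standalone illustration with made-up block offsets (this is an analogy, not the actual LocatedBlocks code):

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class InsertIndexDemo {
  public static void main(String[] args) {
    // Made-up block start offsets (128 MB blocks).
    List<Long> blockStarts = Arrays.asList(0L, 134217728L, 268435456L);
    int idx = Collections.binarySearch(blockStarts, 200000000L);
    System.out.println(idx);           // negative: the offset is not in the cached list
    int insertIndex = -(idx + 1);      // same idea as LocatedBlocks.getInsertIndex(idx)
    System.out.println(insertIndex);   // 2: where the freshly fetched blocks would be inserted
  }
}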
3. Choosing the best datanode: DFSInputStream.chooseDataNode()
If the number of read failures exceeds the threshold (3 by default), an exception is thrown; the threshold is configured with dfs.client.max.block.acquire.failures.
private DNAddrPair chooseDataNode(LocatedBlock block,
Collection<DatanodeInfo> ignoredNodes) throws IOException {
while (true) {
try {
return getBestNodeDNAddrPair(block, ignoredNodes);
} catch (IOException ie) {
String errMsg = getBestNodeDNAddrPairErrorString(block.getLocations(),
deadNodes, ignoredNodes);
String blockInfo = block.getBlock() + " file=" + src;
if (failures >= dfsClient.getMaxBlockAcquireFailures()) {
String description = "Could not obtain block: " + blockInfo;
DFSClient.LOG.warn(description + errMsg
+ ". Throwing a BlockMissingException");
throw new BlockMissingException(src, description,
block.getStartOffset());
}
DatanodeInfo[] nodes = block.getLocations();
if (nodes == null || nodes.length == 0) {
DFSClient.LOG.info("No node available for " + blockInfo);
}
DFSClient.LOG.info("Could not obtain " + block.getBlock()
+ " from any node: " + ie + errMsg
+ ". Will get new block locations from namenode and retry...");
try {
// Introducing a random factor to the wait time before another retry.
// The wait time is dependent on # of failures and a random factor.
// At the first time of getting a BlockMissingException, the wait time
// is a random number between 0..3000 ms. If the first retry
// still fails, we will wait 3000 ms grace period before the 2nd retry.
// Also at the second retry, the waiting window is expanded to 6000 ms
// alleviating the request rate from the server. Similarly the 3rd retry
// will wait 6000ms grace period before retry and the waiting window is
// expanded to 9000ms.
final int timeWindow = dfsClient.getConf().timeWindow;
double waitTime = timeWindow * failures + // grace period for the last round of attempt
timeWindow * (failures + 1) * DFSUtil.getRandom().nextDouble(); // expanding time window for each failure
DFSClient.LOG.warn("DFS chooseDataNode: got # " + (failures + 1) + " IOException, will wait for " + waitTime + " msec.");
Thread.sleep((long)waitTime);
} catch (InterruptedException iex) {
}
deadNodes.clear(); //2nd option is to remove only nodes[blockId]
openInfo();
block = getBlockAt(block.getStartOffset());
failures++;
continue;
}
}
}
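Between retries the client sleeps for a randomized, growing interval, as the long comment in the code explains. A small sketch reproducing that formula, assuming timeWindow = 3000 ms, which matches the 0..3000 / 3000..9000 / 6000..15000 ranges quoted in the comment (in Hadoop 2.x the value comes from the client's retry window setting, dfs.client.retry.window.base):

import java.util.Random;

public class RetryBackoffDemo {
  public static void main(String[] args) {
    // Reproduces the waitTime formula from chooseDataNode(); 3000 ms is an assumption.
    int timeWindow = 3000;
    Random rand = new Random();
    for (int failures = 0; failures < 3; failures++) {
      double waitTime = timeWindow * failures
          + timeWindow * (failures + 1) * rand.nextDouble();
      System.out.printf("failures=%d -> wait %.0f ms (range %d..%d ms)%n",
          failures, waitTime,
          timeWindow * failures,
          timeWindow * failures + timeWindow * (failures + 1));
    }
  }
}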
4. Picking the best node: DFSInputStream.getBestNodeDNAddrPair():
/**
* Get the best node from which to stream the data.
* @param block LocatedBlock, containing nodes in priority order.
* @param ignoredNodes Do not choose nodes in this array (may be null)
* @return The DNAddrPair of the best node.
* @throws IOException
*/
private DNAddrPair getBestNodeDNAddrPair(LocatedBlock block,
Collection<DatanodeInfo> ignoredNodes) throws IOException {
//nodes is already sorted with the best datanode first
DatanodeInfo[] nodes = block.getLocations();
StorageType[] storageTypes = block.getStorageTypes();
DatanodeInfo chosenNode = null;
StorageType storageType = null;
if (nodes != null) {
for (int i = 0; i < nodes.length; i++) {
if (!deadNodes.containsKey(nodes[i])
&& (ignoredNodes == null || !ignoredNodes.contains(nodes[i]))) {
chosenNode = nodes[i];
// Storage types are ordered to correspond with nodes, so use the same
// index to get storage type.
if (storageTypes != null && i < storageTypes.length) {
storageType = storageTypes[i];
}
break;
}
}
}
if (chosenNode == null) {
throw new IOException("No live nodes contain block " + block.getBlock() +
" after checking nodes = " + Arrays.toString(nodes) +
", ignoredNodes = " + ignoredNodes);
}
final String dnAddr =
chosenNode.getXferAddr(dfsClient.getConf().connectToDnViaHostname);
if (DFSClient.LOG.isDebugEnabled()) {
DFSClient.LOG.debug("Connecting to datanode " + dnAddr);
}
InetSocketAddress targetAddr = NetUtils.createSocketAddr(dnAddr);
return new DNAddrPair(chosenNode, targetAddr, storageType);
}
IV. Choosing the read strategy
1. Reading data: DFSInputStream.readBuffer():
While reading a block, a checksum failure causes the method to keep picking another suitable datanode until a read succeeds or every datanode has been tried, at which point the ChecksumException is thrown and the read stops. An I/O error is handled the same way: the method keeps moving on to the remaining healthy datanodes until it succeeds or rethrows the IOException.
/* This is a used by regular read() and handles ChecksumExceptions.
* name readBuffer() is chosen to imply similarity to readBuffer() in
* ChecksumFileSystem
*/
private synchronized int readBuffer(ReaderStrategy reader, int off, int len,
Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
throws IOException {
IOException ioe;
/* we retry current node only once. So this is set to true only here.
* Intention is to handle one common case of an error that is not a
* failure on datanode or client : when DataNode closes the connection
* since client is idle. If there are other cases of "non-errors" then
* then a datanode might be retried by setting this to true again.
*/
boolean retryCurrentNode = true;
while (true) {
// retry as many times as seekToNewSource allows.
try {
//reader is the ByteArrayStrategy; start reading. blockReader is either a remote RemoteBlockReader2 or a local BlockReaderLocal
return reader.doRead(blockReader, off, len);
} catch ( ChecksumException ce ) {
DFSClient.LOG.warn("Found Checksum error for "
+ getCurrentBlock() + " from " + currentNode
+ " at " + ce.getPos());
ioe = ce;
retryCurrentNode = false;
// we want to remember which block replicas we have tried
addIntoCorruptedBlockMap(getCurrentBlock(), currentNode,
corruptedBlockMap);
} catch ( IOException e ) {
if (!retryCurrentNode) {
DFSClient.LOG.warn("Exception while reading from "
+ getCurrentBlock() + " of " + src + " from "
+ currentNode, e);
}
ioe = e;
}
boolean sourceFound = false;
if (retryCurrentNode) {
/* possibly retry the same node so that transient errors don't
* result in application level failures (e.g. Datanode could have
* closed the connection because the client is idle for too long).
*/
sourceFound = seekToBlockSource(pos);
} else {
//on failure, add the current datanode to the deadNodes map
addToDeadNodes(currentNode);
//find another healthy datanode to read from
sourceFound = seekToNewSource(pos);
}
//no datanode holding this position was found, so throw
if (!sourceFound) {
throw ioe;
}
retryCurrentNode = false;
}
}
2. Reading through the ByteArrayStrategy: ByteArrayStrategy.doRead()
@Override
public int doRead(BlockReader blockReader, int off, int len)
throws ChecksumException, IOException {
// BlockReaderLocal performs a local read,
//RemoteBlockReader2 performs a remote read
int nRead = blockReader.read(buf, off, len);
updateReadStatistics(readStatistics, nRead, blockReader);
return nRead;
}
V. The remote read path
1. Reading one packet at a time: RemoteBlockReader2.read()
@Override
public synchronized int read(byte[] buf, int off, int len)
throws IOException {
UUID randomId = null;
if (LOG.isTraceEnabled()) {
randomId = UUID.randomUUID();
LOG.trace(String.format("Starting read #%s file %s from datanode %s",
randomId.toString(), this.filename,
this.datanodeID.getHostName()));
}
// Every buffer has four attributes: capacity, limit, position, mark, with mark <= position <= limit <= capacity
//remaining(): returns limit - position, the number of bytes left between position and limit
//If curDataSlice is null or has been fully consumed, read the next packet
if (curDataSlice == null || curDataSlice.remaining() == 0 && bytesNeededToFinish > 0) {
TraceScope scope = Trace.startSpan(
"RemoteBlockReader2#readNextPacket(" + blockId + ")", Sampler.NEVER);
try {
//Read one packet; each packet is made up of several chunks
readNextPacket();
} finally {
scope.close();
}
}
if (LOG.isTraceEnabled()) {
LOG.trace(String.format("Finishing read #" + randomId));
}
if (curDataSlice.remaining() == 0) {
// we're at EOF now
return -1;
}
//read the smaller of what remains in the slice and what the caller asked for
int nRead = Math.min(curDataSlice.remaining(), len);
//relative read starting at curDataSlice's position: copy nRead bytes into buf[off..off+nRead)
curDataSlice.get(buf, off, nRead);
return nRead;
}
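curDataSlice is an ordinary java.nio.ByteBuffer, so the position/limit bookkeeping above is just the standard buffer contract; a tiny standalone illustration:

import java.nio.ByteBuffer;

public class ByteBufferDemo {
  public static void main(String[] args) {
    // Stand-in for curDataSlice: 10 bytes of packet data.
    ByteBuffer slice = ByteBuffer.wrap(new byte[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
    byte[] buf = new byte[4];
    System.out.println(slice.remaining());  // 10: limit - position
    slice.get(buf, 0, 4);                   // relative read: copies bytes 0..3 and advances position
    System.out.println(slice.remaining());  // 6: four bytes have been consumed
  }
}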
2. Reading a packet: RemoteBlockReader2.readNextPacket()
private void readNextPacket() throws IOException {
//Read packet headers.
packetReceiver.receiveNextPacket(in);
PacketHeader curHeader = packetReceiver.getHeader();
curDataSlice = packetReceiver.getDataSlice();
assert curDataSlice.capacity() == curHeader.getDataLen();
if (LOG.isTraceEnabled()) {
LOG.trace("DFSClient readNextPacket got header " + curHeader);
}
// Sanity check the lengths
if (!curHeader.sanityCheck(lastSeqNo)) {
throw new IOException("BlockReader: error in packet header " +
curHeader);
}
if (curHeader.getDataLen() > 0) {
//compute how many chunks this packet contains
int chunks = 1 + (curHeader.getDataLen() - 1) / bytesPerChecksum;
int checksumsLen = chunks * checksumSize;
assert packetReceiver.getChecksumSlice().capacity() == checksumsLen :
"checksum slice capacity=" + packetReceiver.getChecksumSlice().capacity() +
" checksumsLen=" + checksumsLen;
lastSeqNo = curHeader.getSeqno();
if (verifyChecksum && curDataSlice.remaining() > 0) {
// N.B.: the checksum error offset reported here is actually
// relative to the start of the block, not the start of the file.
// This is slightly misleading, but preserves the behavior from
// the older BlockReader.
//verify the checksums
checksum.verifyChunkedSums(curDataSlice,
packetReceiver.getChecksumSlice(),
filename, curHeader.getOffsetInBlock());
}
bytesNeededToFinish -= curHeader.getDataLen();
}
// First packet will include some data prior to the first byte
// the user requested. Skip it.
if (curHeader.getOffsetInBlock() < startOffset) {
int newPos = (int) (startOffset - curHeader.getOffsetInBlock());
curDataSlice.position(newPos);
}
// If we've now satisfied the whole client read, read one last packet
// header, which should be empty
if (bytesNeededToFinish <= 0) {
readTrailingEmptyPacket();
//report the final read status back to the datanode
if (verifyChecksum) {
sendReadResult(Status.CHECKSUM_OK);
} else {
sendReadResult(Status.SUCCESS);
}
}
}
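The chunk bookkeeping above is plain integer arithmetic; a worked example with assumed values (512-byte checksum chunks, 4-byte CRC checksums, and a 64 KB packet):

public class ChunkMathDemo {
  public static void main(String[] args) {
    // Assumed values: 512-byte checksum chunks and 4-byte CRC checksums.
    int dataLen = 64 * 1024;      // 64 KB of data in the packet
    int bytesPerChecksum = 512;
    int checksumSize = 4;
    int chunks = 1 + (dataLen - 1) / bytesPerChecksum;  // ceiling division, as in readNextPacket()
    int checksumsLen = chunks * checksumSize;
    System.out.println(chunks);        // 128 chunks
    System.out.println(checksumsLen);  // 512 bytes of checksum data accompany the packet
  }
}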
VI. The local read path
1. BlockReaderLocal.read()
@Override
public synchronized int read(ByteBuffer buf) throws IOException {
boolean canSkipChecksum = createNoChecksumContext();
try {
String traceString = null;
if (LOG.isTraceEnabled()) {
traceString = new StringBuilder().
append("read(").
append("buf.remaining=").append(buf.remaining()).
append(", block=").append(block).
append(", filename=").append(filename).
append(", canSkipChecksum=").append(canSkipChecksum).
append(")").toString();
LOG.info(traceString + ": starting");
}
int nRead;
try {
if (canSkipChecksum && zeroReadaheadRequested) {
//read without the bounce buffer (checksum verification skipped)
nRead = readWithoutBounceBuffer(buf);
} else {
//read through the bounce buffer (checksums verified when required)
nRead = readWithBounceBuffer(buf, canSkipChecksum);
}
} catch (IOException e) {
if (LOG.isTraceEnabled()) {
LOG.info(traceString + ": I/O error", e);
}
throw e;
}
if (LOG.isTraceEnabled()) {
LOG.info(traceString + ": returning " + nRead);
}
return nRead;
} finally {
if (canSkipChecksum) releaseNoChecksumContext();
}
}
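The ByteBuffer-based read() above is reachable from client code through FSDataInputStream, which implements ByteBufferReadable; a minimal sketch (the fs.defaultFS address, file path, and buffer size are hypothetical):

import java.nio.ByteBuffer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ByteBufferReadExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://namenode:8020"); // hypothetical address
    try (FileSystem fs = FileSystem.get(conf);
         FSDataInputStream in = fs.open(new Path("/tmp/example.txt"))) { // hypothetical path
      ByteBuffer buf = ByteBuffer.allocate(64 * 1024);
      int n = in.read(buf);  // delegates to the underlying stream's ByteBuffer read,
                             // e.g. BlockReaderLocal.read(ByteBuffer) for a local read
      System.out.println("read " + n + " bytes");
    }
  }
}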