blob: 424ddc04b4cc6eea700631ccb377e055f6146664 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.ignite.internal.processors.diagnostic;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Arrays;
import java.util.stream.LongStream;
import org.apache.ignite.IgniteCheckedException;
import org.apache.ignite.IgniteSystemProperties;
import org.apache.ignite.failure.FailureContext;
import org.apache.ignite.internal.GridKernalContext;
import org.apache.ignite.internal.pagemem.wal.IgniteWriteAheadLogManager;
import org.apache.ignite.internal.processors.GridProcessorAdapter;
import org.apache.ignite.internal.processors.cache.persistence.CorruptedDataStructureException;
import org.apache.ignite.internal.processors.cache.persistence.file.FileIO;
import org.apache.ignite.internal.processors.cache.persistence.file.FileIOFactory;
import org.apache.ignite.internal.processors.cache.persistence.filename.NodeFileTree;
import org.apache.ignite.internal.processors.cache.persistence.wal.FileWriteAheadLogManager;
import org.apache.ignite.internal.processors.cache.persistence.wal.SegmentRouter;
import org.apache.ignite.internal.util.typedef.F;
import org.apache.ignite.internal.util.typedef.X;
import org.apache.ignite.internal.util.typedef.internal.U;
import org.jetbrains.annotations.Nullable;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.stream.Collectors.joining;
import static org.apache.ignite.configuration.DataStorageConfiguration.DFLT_PAGE_SIZE;
import static org.apache.ignite.internal.processors.cache.GridCacheUtils.isPersistenceEnabled;
/**
* Processor which contained helper methods for different diagnostic cases.
*/
public class DiagnosticProcessor extends GridProcessorAdapter {
/** @see IgniteSystemProperties#IGNITE_DUMP_PAGE_LOCK_ON_FAILURE */
public static final boolean DFLT_DUMP_PAGE_LOCK_ON_FAILURE = true;
/** Value of the system property that enables page locks dumping on failure. */
private static final boolean IGNITE_DUMP_PAGE_LOCK_ON_FAILURE =
IgniteSystemProperties.getBoolean(IgniteSystemProperties.IGNITE_DUMP_PAGE_LOCK_ON_FAILURE,
DFLT_DUMP_PAGE_LOCK_ON_FAILURE);
/** Time formatter for dump file name. */
private static final DateTimeFormatter TIME_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd'_'HH-mm-ss_SSS");
/** Folder name for store diagnostic info. **/
public static final String DEFAULT_TARGET_FOLDER = "diagnostic";
/** Full path for store dubug info. */
private final Path diagnosticPath;
/** File I/O factory. */
@Nullable private final FileIOFactory fileIOFactory;
/**
* Constructor.
*
* @param ctx Kernal context.
*/
public DiagnosticProcessor(GridKernalContext ctx) throws IgniteCheckedException {
super(ctx);
diagnosticPath = U.resolveWorkDirectory(ctx.config().getWorkDirectory(), DEFAULT_TARGET_FOLDER, false)
.toPath();
fileIOFactory = isPersistenceEnabled(ctx.config()) ?
ctx.config().getDataStorageConfiguration().getFileIOFactory() : null;
}
/**
* Print diagnostic info about failure occurred on {@code ignite} instance.
* Failure details is contained in {@code failureCtx}.
*
* @param failureCtx Failure context.
*/
public void onFailure(FailureContext failureCtx) {
// Dump data structures page locks.
if (IGNITE_DUMP_PAGE_LOCK_ON_FAILURE)
ctx.cache().context().diagnostic().pageLockTracker().dumpLocksToLog();
CorruptedDataStructureException corruptedDataStructureEx =
X.cause(failureCtx.error(), CorruptedDataStructureException.class);
if (corruptedDataStructureEx != null && !F.isEmpty(corruptedDataStructureEx.pageIds()) && fileIOFactory != null) {
File[] walDirs = walDirs(ctx);
if (F.isEmpty(walDirs)) {
if (log.isInfoEnabled())
log.info("Skipping dump diagnostic info due to WAL not configured");
}
else {
try {
File corruptedPagesFile = corruptedPagesFile(
diagnosticPath,
fileIOFactory,
corruptedDataStructureEx.groupId(),
corruptedDataStructureEx.pageIds()
);
String walDirsStr = Arrays.stream(walDirs).map(File::getAbsolutePath)
.collect(joining(", ", "[", "]"));
String args = "walDir=" + walDirs[0].getAbsolutePath() + (walDirs.length == 1 ? "" :
" walArchiveDir=" + walDirs[1].getAbsolutePath());
if (ctx.config().getDataStorageConfiguration().getPageSize() != DFLT_PAGE_SIZE)
args += " pageSize=" + ctx.config().getDataStorageConfiguration().getPageSize();
args += " pages=" + corruptedPagesFile.getAbsolutePath();
log.warning(corruptedDataStructureEx.getClass().getSimpleName() + " has occurred. " +
"To diagnose it, make a backup of the following directories: " + walDirsStr + ". " +
"Then, run the following command: java -cp <classpath> " +
"org.apache.ignite.development.utils.IgniteWalConverter " + args);
}
catch (Throwable t) {
String pages = LongStream.of(corruptedDataStructureEx.pageIds())
.mapToObj(pageId -> corruptedDataStructureEx.groupId() + ":" + pageId)
.collect(joining("\n", "", ""));
log.error("Failed to dump diagnostic info of partition corruption. Page ids:\n" + pages, t);
}
}
}
}
/**
* Creation and filling of a file with pages that can be corrupted.
* Pages are written on each line in format "grpId:pageId".
* File name format "corruptedPages_yyyy-MM-dd'_'HH-mm-ss_SSS.txt".
*
* @param dirPath Path to the directory where the file will be created.
* @param ioFactory File I/O factory.
* @param grpId Cache group id.
* @param pageIds PageId's that can be corrupted.
* @return Created and filled file.
* @throws IOException If an I/O error occurs.
*/
public static File corruptedPagesFile(
Path dirPath,
FileIOFactory ioFactory,
int grpId,
long... pageIds
) throws IOException {
dirPath.toFile().mkdirs();
File f = dirPath.resolve("corruptedPages_" + LocalDateTime.now().format(TIME_FORMATTER) + ".txt").toFile();
assert !f.exists();
try (FileIO fileIO = ioFactory.create(f)) {
for (long pageId : pageIds) {
byte[] bytes = (grpId + ":" + pageId + U.nl()).getBytes(UTF_8);
int left = bytes.length;
while ((left - fileIO.writeFully(bytes, bytes.length - left, left)) > 0)
;
}
fileIO.force();
}
return f;
}
/**
* Getting the WAL directories.
* Note:
* Index 0: WAL working directory.
* Index 1: WAL archive directory (may be absent).
*
* @param ctx Kernal context.
* @return WAL directories.
*/
@Nullable static File[] walDirs(GridKernalContext ctx) {
IgniteWriteAheadLogManager walMgr = ctx.cache().context().wal();
NodeFileTree ft = ctx.pdsFolderResolver().fileTree();
if (walMgr instanceof FileWriteAheadLogManager) {
SegmentRouter sr = ((FileWriteAheadLogManager)walMgr).getSegmentRouter();
if (sr != null) {
return ft.walArchiveEnabled() ? F.asArray(ft.wal(), ft.walArchive()) : F.asArray(ft.wal());
}
}
return null;
}
}