| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.segment; |
| |
| import java.io.IOException; |
| import java.lang.invoke.MethodHandles; |
| |
| import org.apache.nutch.crawl.CrawlDatum; |
| import org.apache.nutch.parse.ParseText; |
| import org.apache.nutch.parse.ParseData; |
| import org.apache.nutch.util.HadoopFSUtil; |
| import org.apache.hadoop.fs.FileStatus; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.Path; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * Checks whether a segment is valid, or has a certain status (generated, |
| * fetched, parsed), or can be used safely for a certain processing step |
| * (e.g., indexing). |
| */ |
| public class SegmentChecker { |
| |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| /** |
| * Check if the segment is indexable. May add new check methods here. |
| */ |
| public static boolean isIndexable(Path segmentPath, FileSystem fs) |
| throws IOException { |
| if (segmentPath == null || fs == null) { |
| LOG.info("No segment path or filesystem set."); |
| return false; |
| } |
| |
| boolean checkResult = true; |
| checkResult &= checkSegmentDir(segmentPath, fs); |
| // Add new check methods here |
| |
| if (checkResult) { |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Check the segment to see if it is valid based on the sub directories. |
| */ |
| public static boolean checkSegmentDir(Path segmentPath, FileSystem fs) |
| throws IOException { |
| |
| if (segmentPath.getName().length() != 14) { |
| LOG.warn("The input path at {} is not a segment... skipping", segmentPath.getName()); |
| return false; |
| } |
| |
| FileStatus[] fstats_segment = fs.listStatus(segmentPath, |
| HadoopFSUtil.getPassDirectoriesFilter(fs)); |
| Path[] segment_files = HadoopFSUtil.getPaths(fstats_segment); |
| |
| boolean crawlFetchExists = false; |
| boolean crawlParseExists = false; |
| boolean parseDataExists = false; |
| boolean parseTextExists = false; |
| |
| for (Path path : segment_files) { |
| String pathName = path.getName(); |
| crawlFetchExists |= pathName.equals(CrawlDatum.FETCH_DIR_NAME); |
| crawlParseExists |= pathName.equals(CrawlDatum.PARSE_DIR_NAME); |
| parseDataExists |= pathName.equals(ParseData.DIR_NAME); |
| parseTextExists |= pathName.equals(ParseText.DIR_NAME); |
| } |
| |
| if (parseTextExists && crawlParseExists && crawlFetchExists |
| && parseDataExists) { |
| |
| // No segment dir missing |
| LOG.info("Segment dir is complete: " + segmentPath.toString() + "."); |
| |
| return true; |
| } else { |
| |
| // log the missing dir |
| StringBuilder missingDir = new StringBuilder(""); |
| if (parseDataExists == false) { |
| missingDir.append(ParseData.DIR_NAME + ", "); |
| } |
| if (parseTextExists == false) { |
| missingDir.append(ParseText.DIR_NAME + ", "); |
| } |
| if (crawlParseExists == false) { |
| missingDir.append(CrawlDatum.PARSE_DIR_NAME + ", "); |
| } |
| if (crawlFetchExists == false) { |
| missingDir.append(CrawlDatum.FETCH_DIR_NAME + ", "); |
| } |
| |
| String missingDirString = missingDir.toString(); |
| LOG.warn("Skipping segment: " + segmentPath.toString() |
| + ". Missing sub directories: " |
| + missingDirString.substring(0, missingDirString.length() - 2)); |
| |
| return false; |
| } |
| |
| } |
| |
| /** |
| * Check the segment to see if it is has been parsed before. |
| */ |
| public static boolean isParsed(Path segment, FileSystem fs) |
| throws IOException { |
| |
| if (fs.exists(new Path(segment, CrawlDatum.PARSE_DIR_NAME))){ |
| return true; |
| } |
| return false; |
| } |
| |
| } |