modules/data/src/main/java/webindex/data/TestParser.java - fluo-examples - Git at Google

 /*
  * Copyright 2015 Webindex authors (see AUTHORS)
  *
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
  * in compliance with the License. You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software distributed under the License
  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
  * or implied. See the License for the specific language governing permissions and limitations under
  * the License.
  */

 package webindex.data;

 import java.net.URL;
 import java.util.List;

 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.archive.io.ArchiveReader;
 import org.archive.io.ArchiveRecord;
 import org.archive.io.warc.WARCReaderFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import webindex.core.DataConfig;
 import webindex.data.spark.IndexEnv;
 import webindex.data.util.ArchiveUtil;

 /**
  * Command-line Spark job that reads a range of WARC files and runs every record through
  * {@link ArchiveUtil#buildPageIgnoreErrors}. The parsed results are discarded; nothing is
  * written anywhere by this class itself.
  *
  * <p>
  * Usage: {@code TestParser <pathsFile> <range>}
  */
 public class TestParser {

   private static final Logger log = LoggerFactory.getLogger(TestParser.class);

   /**
    * Entry point.
    *
    * @param args exactly two arguments: the paths file and the range, both passed unchanged to
    *        {@link IndexEnv#getPathsRange}
    * @throws Exception if resolving the paths, loading the configuration, or running the Spark job
    *         fails; failures on an individual archive are logged and do not abort the job
    */
   public static void main(String[] args) throws Exception {

     if (args.length != 2) {
       log.error("Usage: TestParser <pathsFile> <range>");
       System.exit(1);
     }
     final List<String> loadList = IndexEnv.getPathsRange(args[0], args[1]);
     if (loadList.isEmpty()) {
       log.error("No files to load given {} {}", args[0], args[1]);
       System.exit(1);
     }

     DataConfig.load();

     SparkConf sparkConf = new SparkConf().setAppName("webindex-test-parser");
     try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {

       log.info("Parsing {} files (Range {} of paths file {}) from AWS", loadList.size(), args[1],
           args[0]);

       // The slice count equals the number of paths, i.e. one partition per archive file.
       JavaRDD<String> loadRDD = ctx.parallelize(loadList, loadList.size());

       // Copied into a local so the lambda below captures a plain String.
       final String prefix = DataConfig.CC_URL_PREFIX;

       loadRDD.foreachPartition(iter -> iter.forEachRemaining(path -> {
         String urlToCopy = prefix + path;
         log.info("Parsing {}", urlToCopy);
         // try-with-resources closes the reader (and its underlying stream) even when parsing
         // fails part-way; previously the reader was never closed.
         try (ArchiveReader reader = WARCReaderFactory.get(new URL(urlToCopy), 0)) {
           for (ArchiveRecord record : reader) {
             ArchiveUtil.buildPageIgnoreErrors(record);
           }
         } catch (Exception e) {
           // Best effort: one bad archive must not fail the whole job.
           log.error("Exception while processing {}", path, e);
         }
       }));
     }
   }
 }
	/*
	* Copyright 2015 Webindex authors (see AUTHORS)
	*
	* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
	* in compliance with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software distributed under the License
	* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
	* or implied. See the License for the specific language governing permissions and limitations under
	* the License.
	*/

	package webindex.data;

	import java.net.URL;
	import java.util.List;

	import org.apache.spark.SparkConf;
	import org.apache.spark.api.java.JavaRDD;
	import org.apache.spark.api.java.JavaSparkContext;
	import org.archive.io.ArchiveReader;
	import org.archive.io.ArchiveRecord;
	import org.archive.io.warc.WARCReaderFactory;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import webindex.core.DataConfig;
	import webindex.data.spark.IndexEnv;
	import webindex.data.util.ArchiveUtil;

	/**
	 * Command-line Spark job that reads a range of WARC files and runs every record through
	 * {@link ArchiveUtil#buildPageIgnoreErrors}. The parsed results are discarded; nothing is
	 * written anywhere by this class itself.
	 *
	 * <p>
	 * Usage: {@code TestParser <pathsFile> <range>}
	 */
	public class TestParser {

	  private static final Logger log = LoggerFactory.getLogger(TestParser.class);

	  /**
	   * Entry point.
	   *
	   * @param args exactly two arguments: the paths file and the range, both passed unchanged to
	   *        {@link IndexEnv#getPathsRange}
	   * @throws Exception if resolving the paths, loading the configuration, or running the Spark
	   *         job fails; failures on an individual archive are logged and do not abort the job
	   */
	  public static void main(String[] args) throws Exception {

	    if (args.length != 2) {
	      log.error("Usage: TestParser <pathsFile> <range>");
	      System.exit(1);
	    }
	    final List<String> loadList = IndexEnv.getPathsRange(args[0], args[1]);
	    if (loadList.isEmpty()) {
	      log.error("No files to load given {} {}", args[0], args[1]);
	      System.exit(1);
	    }

	    DataConfig.load();

	    SparkConf sparkConf = new SparkConf().setAppName("webindex-test-parser");
	    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {

	      log.info("Parsing {} files (Range {} of paths file {}) from AWS", loadList.size(), args[1],
	          args[0]);

	      // The slice count equals the number of paths, i.e. one partition per archive file.
	      JavaRDD<String> loadRDD = ctx.parallelize(loadList, loadList.size());

	      // Copied into a local so the lambda below captures a plain String.
	      final String prefix = DataConfig.CC_URL_PREFIX;

	      loadRDD.foreachPartition(iter -> iter.forEachRemaining(path -> {
	        String urlToCopy = prefix + path;
	        log.info("Parsing {}", urlToCopy);
	        // try-with-resources closes the reader (and its underlying stream) even when parsing
	        // fails part-way; previously the reader was never closed.
	        try (ArchiveReader reader = WARCReaderFactory.get(new URL(urlToCopy), 0)) {
	          for (ArchiveRecord record : reader) {
	            ArchiveUtil.buildPageIgnoreErrors(record);
	          }
	        } catch (Exception e) {
	          // Best effort: one bad archive must not fail the whole job.
	          log.error("Exception while processing {}", path, e);
	        }
	      }));
	    }
	  }
	}