blob: 6af20b03dfe9b6fc9b29613daadee3b0581479e9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.selenium;
import java.lang.invoke.MethodHandles;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.util.concurrent.TimeUnit;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.Capabilities;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.TimeoutException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
//import org.openqa.selenium.firefox.FirefoxBinary;
import org.openqa.selenium.firefox.FirefoxDriver;
//import org.openqa.selenium.firefox.FirefoxProfile;
import org.openqa.selenium.firefox.FirefoxOptions;
import org.openqa.selenium.io.TemporaryFilesystem;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
//import org.openqa.selenium.safari.SafariDriver;
//import org.openqa.selenium.phantomjs.PhantomJSDriver;
//import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.openqa.selenium.opera.OperaOptions;
import org.openqa.selenium.opera.OperaDriver;
//import com.opera.core.systems.OperaDriver;
/**
 * Static helpers for rendering a page through a Selenium {@link WebDriver}.
 * <p>
 * The driver is selected from the supplied Hadoop {@link Configuration}:
 * <code>selenium.driver</code> chooses between <code>firefox</code>,
 * <code>chrome</code> and <code>remote</code>; for <code>remote</code> the
 * <code>selenium.hub.*</code> properties locate the hub and
 * <code>selenium.grid.driver</code> picks the browser requested from it.
 * <p>
 * A driver obtained from {@link #getDriverForPage(String, Configuration)} is
 * not closed by that method on success; release it with
 * {@link #cleanUpDriver(WebDriver)}.
 */
public class HttpWebClient {

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Creates the configured WebDriver and loads <code>url</code> in it.
   * <p>
   * The page load timeout is taken from <code>page.load.delay</code> (seconds,
   * default 3). If loading the page times out, the driver is still returned so
   * that the caller can capture whatever was loaded so far.
   *
   * @param url
   *          the URL to load
   * @param conf
   *          the {@link Configuration} holding the selenium settings; must not
   *          be <code>null</code>
   * @return the driver after <code>driver.get(url)</code> was attempted; never
   *         <code>null</code>
   * @throws RuntimeException
   *           wrapping any failure other than a page load timeout; the driver
   *           (if one was created) is cleaned up before the exception is thrown
   */
  public static WebDriver getDriverForPage(String url, Configuration conf) {
    WebDriver driver = null;
    long pageLoadWait = conf.getLong("page.load.delay", 3);

    try {
      String driverType = conf.get("selenium.driver", "firefox");
      boolean enableHeadlessMode = conf.getBoolean("selenium.enable.headless",
          false);

      driver = createConfiguredDriver(driverType, enableHeadlessMode, conf);
      LOG.debug("Selenium {} WebDriver selected.", driverType);

      driver.manage().timeouts().pageLoadTimeout(pageLoadWait,
          TimeUnit.SECONDS);
      driver.get(url);
    } catch (Exception e) {
      // Only hand the driver back on a timeout if it actually exists;
      // otherwise the caller would receive null and fail later with an NPE.
      if (e instanceof TimeoutException && driver != null) {
        LOG.error(
            "Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
        return driver;
      }
      // Pass the exception itself so the stack trace is not lost.
      LOG.error(e.toString(), e);
      cleanUpDriver(driver);
      throw new RuntimeException(e);
    }

    return driver;
  }

  /**
   * Instantiates the driver named by <code>selenium.driver</code>.
   * <p>
   * For <code>firefox</code> and <code>chrome</code> the driver binary path is
   * read from <code>selenium.grid.binary</code>. An unknown driver type is
   * logged and falls back to a {@link FirefoxDriver} built from default
   * {@link FirefoxOptions} (neither the headless flag nor
   * <code>selenium.grid.binary</code> is applied on that path).
   */
  private static WebDriver createConfiguredDriver(String driverType,
      boolean enableHeadlessMode, Configuration conf)
      throws java.net.MalformedURLException {
    switch (driverType) {
    case "firefox":
      String geckoDriverPath = conf.get("selenium.grid.binary",
          "/root/geckodriver");
      return createFirefoxWebDriver(geckoDriverPath, enableHeadlessMode);
    case "chrome":
      String chromeDriverPath = conf.get("selenium.grid.binary",
          "/root/chromedriver");
      return createChromeWebDriver(chromeDriverPath, enableHeadlessMode);
    // The "opera" case is currently disabled:
    // case "opera":
    //   String operaDriverPath = conf.get("selenium.grid.binary",
    //       "/root/operadriver");
    //   return createOperaWebDriver(operaDriverPath, enableHeadlessMode);
    case "remote":
      return createGridWebDriver(conf, enableHeadlessMode);
    default:
      LOG.error(
          "The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().",
          driverType);
      return new FirefoxDriver(new FirefoxOptions());
    }
  }

  /**
   * Builds the Selenium hub URL from the <code>selenium.hub.*</code> properties
   * and requests the browser named by <code>selenium.grid.driver</code>
   * (<code>firefox</code>, <code>chrome</code> or <code>random</code>) from it.
   */
  private static WebDriver createGridWebDriver(Configuration conf,
      boolean enableHeadlessMode) throws java.net.MalformedURLException {
    String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
    int seleniumHubPort = conf.getInt("selenium.hub.port", 4444);
    String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
    String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
    URL seleniumHubUrl = new URL(seleniumHubProtocol, seleniumHubHost,
        seleniumHubPort, seleniumHubPath);

    String seleniumGridDriver = conf.get("selenium.grid.driver", "firefox");
    switch (seleniumGridDriver) {
    case "firefox":
      return createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
    case "chrome":
      return createChromeRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
    case "random":
      return createRandomRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
    default:
      // Report the unsupported grid driver value, not the outer driver type
      // (which is always "remote" on this path).
      LOG.error(
          "The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().",
          seleniumGridDriver);
      return createDefaultRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
    }
  }

  /**
   * Creates a local {@link FirefoxDriver}.
   *
   * @param firefoxDriverPath
   *          path to the geckodriver binary; stored in the
   *          <code>webdriver.gecko.driver</code> system property
   * @param enableHeadlessMode
   *          if true, <code>--headless</code> is passed to the browser
   * @return the new driver
   */
  public static WebDriver createFirefoxWebDriver(String firefoxDriverPath,
      boolean enableHeadlessMode) {
    System.setProperty("webdriver.gecko.driver", firefoxDriverPath);
    FirefoxOptions firefoxOptions = new FirefoxOptions();
    if (enableHeadlessMode) {
      firefoxOptions.addArguments("--headless");
    }
    return new FirefoxDriver(firefoxOptions);
  }

  /**
   * Creates a local {@link ChromeDriver} started with
   * <code>--no-sandbox</code> and <code>--disable-extensions</code>.
   *
   * @param chromeDriverPath
   *          path to the chromedriver binary; stored in the
   *          <code>webdriver.chrome.driver</code> system property
   * @param enableHeadlessMode
   *          if true, <code>--headless</code> is passed to the browser; set
   *          <code>selenium.enable.headless</code> to true if no monitor is
   *          attached to the server
   * @return the new driver
   */
  public static WebDriver createChromeWebDriver(String chromeDriverPath,
      boolean enableHeadlessMode) {
    System.setProperty("webdriver.chrome.driver", chromeDriverPath);
    ChromeOptions chromeOptions = new ChromeOptions();
    chromeOptions.addArguments("--no-sandbox");
    chromeOptions.addArguments("--disable-extensions");
    if (enableHeadlessMode) {
      chromeOptions.addArguments("--headless");
    }
    return new ChromeDriver(chromeOptions);
  }

  /**
   * Creates a local {@link OperaDriver} started with
   * <code>--no-sandbox</code> and <code>--disable-extensions</code>.
   *
   * @param operaDriverPath
   *          path to the operadriver binary; stored in the
   *          <code>webdriver.opera.driver</code> system property
   * @param enableHeadlessMode
   *          if true, <code>--headless</code> is passed to the browser
   * @return the new driver
   */
  public static WebDriver createOperaWebDriver(String operaDriverPath,
      boolean enableHeadlessMode) {
    System.setProperty("webdriver.opera.driver", operaDriverPath);
    OperaOptions operaOptions = new OperaOptions();
    // operaOptions.setBinary("/usr/bin/opera");
    operaOptions.addArguments("--no-sandbox");
    operaOptions.addArguments("--disable-extensions");
    if (enableHeadlessMode) {
      operaOptions.addArguments("--headless");
    }
    return new OperaDriver(operaOptions);
  }

  /**
   * Requests a Firefox session from the Selenium hub.
   *
   * @param seleniumHubUrl
   *          URL of the Selenium hub
   * @param enableHeadlessMode
   *          if true, headless mode is enabled on the {@link FirefoxOptions}
   * @return the new remote driver
   */
  public static RemoteWebDriver createFirefoxRemoteWebDriver(URL seleniumHubUrl,
      boolean enableHeadlessMode) {
    FirefoxOptions firefoxOptions = new FirefoxOptions();
    if (enableHeadlessMode) {
      firefoxOptions.setHeadless(true);
    }
    return new RemoteWebDriver(seleniumHubUrl, firefoxOptions);
  }

  /**
   * Requests a Chrome session from the Selenium hub.
   *
   * @param seleniumHubUrl
   *          URL of the Selenium hub
   * @param enableHeadlessMode
   *          if true, headless mode is enabled on the {@link ChromeOptions}
   * @return the new remote driver
   */
  public static RemoteWebDriver createChromeRemoteWebDriver(URL seleniumHubUrl,
      boolean enableHeadlessMode) {
    ChromeOptions chromeOptions = new ChromeOptions();
    if (enableHeadlessMode) {
      chromeOptions.setHeadless(true);
    }
    return new RemoteWebDriver(seleniumHubUrl, chromeOptions);
  }

  /**
   * Requests either a Firefox or a Chrome session from the Selenium hub,
   * chosen at random.
   * <p>
   * The set of candidate browsers is hard-coded to these two; supporting more
   * (e.g. Edge, Opera, Safari) would require widening the random range and the
   * selection below, ideally driven by configuration.
   *
   * @param seleniumHubUrl
   *          URL of the Selenium hub
   * @param enableHeadlessMode
   *          forwarded to the chosen factory method
   * @return the new remote driver
   */
  public static RemoteWebDriver createRandomRemoteWebDriver(URL seleniumHubUrl,
      boolean enableHeadlessMode) {
    // nextInt(2) yields 0 or 1: 0 selects Firefox, 1 selects Chrome.
    int choice = new Random().nextInt(2);
    if (choice == 0) {
      return createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
    }
    return createChromeRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
  }

  /**
   * Requests the default remote browser (Firefox) from the Selenium hub.
   *
   * @param seleniumHubUrl
   *          URL of the Selenium hub
   * @param enableHeadlessMode
   *          forwarded to
   *          {@link #createFirefoxRemoteWebDriver(URL, boolean)}
   * @return the new remote driver
   */
  public static RemoteWebDriver createDefaultRemoteWebDriver(URL seleniumHubUrl,
      boolean enableHeadlessMode) {
    return createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
  }

  /**
   * Quits the driver and deletes Selenium's temporary files. Failures are
   * logged and never propagated; a <code>null</code> driver is ignored.
   *
   * @param driver
   *          the driver to release, may be <code>null</code>
   */
  public static void cleanUpDriver(WebDriver driver) {
    if (driver == null) {
      return;
    }
    try {
      driver.quit();
    } catch (Exception e) {
      LOG.error(e.toString(), e);
    }
    // Kept separate from quit() so that a failing quit() does not prevent the
    // temporary files from being removed.
    try {
      TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
    } catch (Exception e) {
      LOG.error(e.toString(), e);
    }
  }

  /**
   * Function for obtaining the HTML BODY using the selected <a href=
   * 'https://seleniumhq.github.io/selenium/docs/api/java/org/openqa/selenium/WebDriver.html'>selenium
   * webdriver</a>. If <code>take.screenshot</code> is true, a screenshot of the
   * rendered page is taken and, when <code>screenshot.location</code> is set,
   * copied to that location on the configured {@link FileSystem}.
   * <p>
   * The driver is always cleaned up before this method returns or throws.
   *
   * @param url
   *          the URL to fetch and render
   * @param conf
   *          the {@link org.apache.hadoop.conf.Configuration}
   * @return the rendered inner HTML of the page's <code>body</code> element
   * @throws RuntimeException
   *           if the driver cannot be created, the page cannot be loaded, or
   *           the screenshot or the body cannot be obtained
   */
  public static String getHtmlPage(String url, Configuration conf) {
    WebDriver driver = getDriverForPage(url, conf);
    try {
      if (conf.getBoolean("take.screenshot", false)) {
        takeScreenshot(driver, conf);
      }
      return driver.findElement(By.tagName("body")).getAttribute("innerHTML");
    } catch (Exception e) {
      LOG.error("getHtmlPage(url, conf): " + e.toString(), e);
      throw new RuntimeException(e);
    } finally {
      // Quits the driver and purges Selenium's temporary files on every path.
      cleanUpDriver(driver);
    }
  }

  /**
   * Same as {@link #getHtmlPage(String, Configuration)} using a freshly created
   * {@link Configuration}, so the defaults coded in this class apply unless the
   * properties are defined in resources that <code>new Configuration()</code>
   * loads.
   *
   * @param url
   *          the URL to fetch and render
   * @return the rendered inner HTML of the page's <code>body</code> element
   */
  public static String getHtmlPage(String url) {
    // A null Configuration would be dereferenced immediately in
    // getDriverForPage, so supply a real one.
    return getHtmlPage(url, new Configuration());
  }

  /**
   * Takes a screenshot of the page currently shown by <code>driver</code> and,
   * if <code>screenshot.location</code> is configured, copies it there under
   * the screenshot file's own name. An already existing target file is left
   * untouched.
   * <p>
   * The driver is not cleaned up here; that is left to the caller.
   *
   * @throws RuntimeException
   *           wrapping any failure while taking or storing the screenshot
   */
  private static void takeScreenshot(WebDriver driver, Configuration conf) {
    try {
      String url = driver.getCurrentUrl();
      File srcFile = ((TakesScreenshot) driver)
          .getScreenshotAs(OutputType.FILE);
      LOG.debug("In-memory screenshot taken of: {}", url);

      String screenshotLocation = conf.get("screenshot.location");
      if (screenshotLocation == null) {
        LOG.warn(
            "Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
                + "'screenshot.location' is absent from nutch-site.xml.",
            url);
        return;
      }

      FileSystem fs = FileSystem.get(conf);
      Path screenshotPath = new Path(
          screenshotLocation + "/" + srcFile.getName());
      if (fs.exists(screenshotPath)) {
        // Previously this path handed a null OutputStream to
        // IOUtils.copyBytes, which failed and aborted the whole fetch.
        LOG.warn("Screenshot for {} not saved: {} already exists.", url,
            screenshotPath);
        return;
      }

      LOG.debug(
          "No existing screenshot already exists... creating new file at {} {}.",
          screenshotPath, srcFile.getName());
      // try-with-resources closes both streams, so copyBytes must not.
      try (InputStream is = new BufferedInputStream(
          new FileInputStream(srcFile));
          OutputStream os = fs.create(screenshotPath)) {
        IOUtils.copyBytes(is, os, conf, false);
      }
      LOG.debug("Screenshot for {} successfully saved to: {} {}", url,
          screenshotPath, srcFile.getName());
    } catch (Exception e) {
      LOG.error("Error taking screenshot: ", e);
      throw new RuntimeException(e);
    }
  }
}