| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.protocol.selenium; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.io.BufferedInputStream; |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.InputStream; |
| import java.io.OutputStream; |
| import java.net.URL; |
| import java.util.concurrent.TimeUnit; |
| import java.util.Random; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.Path; |
| import org.apache.hadoop.io.IOUtils; |
| |
| import org.openqa.selenium.By; |
| import org.openqa.selenium.Capabilities; |
| import org.openqa.selenium.OutputType; |
| import org.openqa.selenium.TakesScreenshot; |
| import org.openqa.selenium.TimeoutException; |
| import org.openqa.selenium.WebDriver; |
| |
| import org.openqa.selenium.chrome.ChromeDriver; |
| import org.openqa.selenium.chrome.ChromeOptions; |
| |
| //import org.openqa.selenium.firefox.FirefoxBinary; |
| import org.openqa.selenium.firefox.FirefoxDriver; |
| //import org.openqa.selenium.firefox.FirefoxProfile; |
| import org.openqa.selenium.firefox.FirefoxOptions; |
| |
| import org.openqa.selenium.io.TemporaryFilesystem; |
| |
| import org.openqa.selenium.remote.DesiredCapabilities; |
| import org.openqa.selenium.remote.RemoteWebDriver; |
| |
| //import org.openqa.selenium.safari.SafariDriver; |
| |
| //import org.openqa.selenium.phantomjs.PhantomJSDriver; |
| //import org.openqa.selenium.phantomjs.PhantomJSDriverService; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import org.openqa.selenium.opera.OperaOptions; |
| import org.openqa.selenium.opera.OperaDriver; |
| //import com.opera.core.systems.OperaDriver; |
| |
| public class HttpWebClient { |
| |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| public static WebDriver getDriverForPage(String url, Configuration conf) { |
| WebDriver driver = null; |
| long pageLoadWait = conf.getLong("page.load.delay", 3); |
| |
| try { |
| String driverType = conf.get("selenium.driver", "firefox"); |
| boolean enableHeadlessMode = conf.getBoolean("selenium.enable.headless", |
| false); |
| |
| switch (driverType) { |
| case "firefox": |
| String geckoDriverPath = conf.get("selenium.grid.binary", |
| "/root/geckodriver"); |
| driver = createFirefoxWebDriver(geckoDriverPath, enableHeadlessMode); |
| break; |
| case "chrome": |
| String chromeDriverPath = conf.get("selenium.grid.binary", |
| "/root/chromedriver"); |
| driver = createChromeWebDriver(chromeDriverPath, enableHeadlessMode); |
| break; |
| // case "opera": |
| // // This class is provided as a convenience for easily testing the |
| // Chrome browser. |
| // String operaDriverPath = conf.get("selenium.grid.binary", |
| // "/root/operadriver"); |
| // driver = createOperaWebDriver(operaDriverPath, enableHeadlessMode); |
| // break; |
| case "remote": |
| String seleniumHubHost = conf.get("selenium.hub.host", "localhost"); |
| int seleniumHubPort = Integer |
| .parseInt(conf.get("selenium.hub.port", "4444")); |
| String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub"); |
| String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http"); |
| URL seleniumHubUrl = new URL(seleniumHubProtocol, seleniumHubHost, |
| seleniumHubPort, seleniumHubPath); |
| |
| String seleniumGridDriver = conf.get("selenium.grid.driver", "firefox"); |
| |
| switch (seleniumGridDriver) { |
| case "firefox": |
| driver = createFirefoxRemoteWebDriver(seleniumHubUrl, |
| enableHeadlessMode); |
| break; |
| case "chrome": |
| driver = createChromeRemoteWebDriver(seleniumHubUrl, |
| enableHeadlessMode); |
| break; |
| case "random": |
| driver = createRandomRemoteWebDriver(seleniumHubUrl, |
| enableHeadlessMode); |
| break; |
| default: |
| LOG.error( |
| "The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().", |
| driverType); |
| driver = createDefaultRemoteWebDriver(seleniumHubUrl, |
| enableHeadlessMode); |
| break; |
| } |
| break; |
| default: |
| LOG.error( |
| "The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", |
| driverType); |
| FirefoxOptions options = new FirefoxOptions(); |
| driver = new FirefoxDriver(options); |
| break; |
| } |
| LOG.debug("Selenium {} WebDriver selected.", driverType); |
| |
| driver.manage().timeouts().pageLoadTimeout(pageLoadWait, |
| TimeUnit.SECONDS); |
| driver.get(url); |
| } catch (Exception e) { |
| if (e instanceof TimeoutException) { |
| LOG.error( |
| "Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far..."); |
| return driver; |
| } else { |
| LOG.error(e.toString()); |
| } |
| cleanUpDriver(driver); |
| throw new RuntimeException(e); |
| } |
| |
| return driver; |
| } |
| |
| public static WebDriver createFirefoxWebDriver(String firefoxDriverPath, |
| boolean enableHeadlessMode) { |
| System.setProperty("webdriver.gecko.driver", firefoxDriverPath); |
| FirefoxOptions firefoxOptions = new FirefoxOptions(); |
| if (enableHeadlessMode) { |
| firefoxOptions.addArguments("--headless"); |
| } |
| WebDriver driver = new FirefoxDriver(firefoxOptions); |
| return driver; |
| } |
| |
| public static WebDriver createChromeWebDriver(String chromeDriverPath, |
| boolean enableHeadlessMode) { |
| // if not specified, WebDriver will search your path for chromedriver |
| System.setProperty("webdriver.chrome.driver", chromeDriverPath); |
| ChromeOptions chromeOptions = new ChromeOptions(); |
| chromeOptions.addArguments("--no-sandbox"); |
| chromeOptions.addArguments("--disable-extensions"); |
| // be sure to set selenium.enable.headless to true if no monitor attached |
| // to your server |
| if (enableHeadlessMode) { |
| chromeOptions.addArguments("--headless"); |
| } |
| WebDriver driver = new ChromeDriver(chromeOptions); |
| return driver; |
| } |
| |
| public static WebDriver createOperaWebDriver(String operaDriverPath, |
| boolean enableHeadlessMode) { |
| // if not specified, WebDriver will search your path for operadriver |
| System.setProperty("webdriver.opera.driver", operaDriverPath); |
| OperaOptions operaOptions = new OperaOptions(); |
| // operaOptions.setBinary("/usr/bin/opera"); |
| operaOptions.addArguments("--no-sandbox"); |
| operaOptions.addArguments("--disable-extensions"); |
| // be sure to set selenium.enable.headless to true if no monitor attached |
| // to your server |
| if (enableHeadlessMode) { |
| operaOptions.addArguments("--headless"); |
| } |
| WebDriver driver = new OperaDriver(operaOptions); |
| return driver; |
| } |
| |
| public static RemoteWebDriver createFirefoxRemoteWebDriver(URL seleniumHubUrl, |
| boolean enableHeadlessMode) { |
| FirefoxOptions firefoxOptions = new FirefoxOptions(); |
| if (enableHeadlessMode) { |
| firefoxOptions.setHeadless(true); |
| } |
| RemoteWebDriver driver = new RemoteWebDriver(seleniumHubUrl, |
| firefoxOptions); |
| return driver; |
| } |
| |
| public static RemoteWebDriver createChromeRemoteWebDriver(URL seleniumHubUrl, |
| boolean enableHeadlessMode) { |
| ChromeOptions chromeOptions = new ChromeOptions(); |
| if (enableHeadlessMode) { |
| chromeOptions.setHeadless(true); |
| } |
| RemoteWebDriver driver = new RemoteWebDriver(seleniumHubUrl, chromeOptions); |
| return driver; |
| } |
| |
| public static RemoteWebDriver createRandomRemoteWebDriver(URL seleniumHubUrl, |
| boolean enableHeadlessMode) { |
| // we consider a possibility of generating only 2 types of browsers: Firefox |
| // and |
| // Chrome only |
| Random r = new Random(); |
| int min = 0; |
| // we have actually hardcoded the maximum number of types of web driver that |
| // can |
| // be created |
| // but this must be later moved to the configuration file in order to be |
| // able |
| // to randomly choose between much more types(ex: Edge, Opera, Safari) |
| int max = 1; // for 3 types, change to 2 and update the if-clause |
| int num = r.nextInt((max - min) + 1) + min; |
| if (num == 0) { |
| return createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode); |
| } |
| |
| return createChromeRemoteWebDriver(seleniumHubUrl, enableHeadlessMode); |
| } |
| |
| public static RemoteWebDriver createDefaultRemoteWebDriver(URL seleniumHubUrl, |
| boolean enableHeadlessMode) { |
| return createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode); |
| } |
| |
| public static void cleanUpDriver(WebDriver driver) { |
| if (driver != null) { |
| try { |
| // driver.close(); |
| driver.quit(); |
| TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); |
| } catch (Exception e) { |
| LOG.error(e.toString()); |
| // throw new RuntimeException(e); |
| } |
| } |
| } |
| |
| /** |
| * Function for obtaining the HTML BODY using the selected <a href= |
| * 'https://seleniumhq.github.io/selenium/docs/api/java/org/openqa/selenium/WebDriver.html'>selenium |
| * webdriver</a> There are a number of configuration properties within |
| * <code>nutch-site.xml</code> which determine whether to take screenshots of |
| * the rendered pages and persist them as timestamped .png's into HDFS. |
| * |
| * @param url |
| * the URL to fetch and render |
| * @param conf |
| * the {@link org.apache.hadoop.conf.Configuration} |
| * @return the rendered inner HTML page |
| */ |
| public static String getHtmlPage(String url, Configuration conf) { |
| WebDriver driver = getDriverForPage(url, conf); |
| |
| try { |
| if (conf.getBoolean("take.screenshot", false)) { |
| takeScreenshot(driver, conf); |
| } |
| |
| String innerHtml = driver.findElement(By.tagName("body")) |
| .getAttribute("innerHTML"); |
| return innerHtml; |
| |
| // I'm sure this catch statement is a code smell ; borrowing it from |
| // lib-htmlunit |
| } catch (Exception e) { |
| TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); |
| // throw new RuntimeException(e); |
| LOG.error("getHtmlPage(url, conf): " + e.toString()); |
| throw new RuntimeException(e); |
| } finally { |
| cleanUpDriver(driver); |
| } |
| } |
| |
| public static String getHtmlPage(String url) { |
| return getHtmlPage(url, null); |
| } |
| |
| private static void takeScreenshot(WebDriver driver, Configuration conf) { |
| try { |
| String url = driver.getCurrentUrl(); |
| File srcFile = ((TakesScreenshot) driver) |
| .getScreenshotAs(OutputType.FILE); |
| LOG.debug("In-memory screenshot taken of: {}", url); |
| FileSystem fs = FileSystem.get(conf); |
| if (conf.get("screenshot.location") != null) { |
| Path screenshotPath = new Path( |
| conf.get("screenshot.location") + "/" + srcFile.getName()); |
| OutputStream os = null; |
| if (!fs.exists(screenshotPath)) { |
| LOG.debug( |
| "No existing screenshot already exists... creating new file at {} {}.", |
| screenshotPath, srcFile.getName()); |
| os = fs.create(screenshotPath); |
| } |
| InputStream is = new BufferedInputStream(new FileInputStream(srcFile)); |
| IOUtils.copyBytes(is, os, conf); |
| LOG.debug("Screenshot for {} successfully saved to: {} {}", url, |
| screenshotPath, srcFile.getName()); |
| } else { |
| LOG.warn( |
| "Screenshot for {} not saved to HDFS (subsequently disgarded) as value for " |
| + "'screenshot.location' is absent from nutch-site.xml.", |
| url); |
| } |
| } catch (Exception e) { |
| LOG.error("Error taking screenshot: ", e); |
| cleanUpDriver(driver); |
| throw new RuntimeException(e); |
| } |
| } |
| } |