<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="author" content="Apache Software Foundation">
<link rel="shortcut icon" href="../../img/favicon.ico">
<title>Google Webmaster - Apache Gobblin</title>
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
<link rel="stylesheet" href="../../css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../css/theme_extra.css" type="text/css" />
<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/github.min.css">
<link href="../../css/extra.css" rel="stylesheet">
<script>
// Current page data
var mkdocs_page_name = "Google Webmaster";
var mkdocs_page_input_path = "sources/GoogleWebmaster.md";
var mkdocs_page_url = null;
</script>
<script src="../../js/jquery-2.1.1.min.js" defer></script>
<script src="../../js/modernizr-2.8.3.min.js" defer></script>
<script src="//cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/highlight.min.js"></script>
<script>hljs.initHighlightingOnLoad();</script>
</head>
<body class="wy-body-for-nav" role="document">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
<div class="wy-side-nav-search">
<a href="../.." class="icon icon-home"> Apache Gobblin</a>
<div role="search">
<form id ="rtd-search-form" class="wy-form" action="../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" title="Type search term here" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<ul class="current">
<li class="toctree-l1">
<a class="" href="/">Home</a>
</li>
<li class="toctree-l1">
<a class="" href="../../Powered-By/">Companies Powered By Gobblin</a>
</li>
<li class="toctree-l1">
<a class="" href="../../Getting-Started/">Getting Started</a>
</li>
<li class="toctree-l1">
<a class="" href="../../Gobblin-Architecture/">Architecture</a>
</li>
<li class="toctree-l1">
<span class="caption-text">User Guide</span>
<ul class="subnav">
<li class="">
<a class="" href="../../user-guide/Working-with-Job-Configuration-Files/">Job Configuration Files</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-Deployment/">Deployment</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-as-a-Library/">Gobblin as a Library</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-CLI/">Gobblin CLI</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-Compliance/">Gobblin Compliance</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-on-Yarn/">Gobblin on Yarn</a>
</li>
<li class="">
<a class="" href="../../user-guide/Compaction/">Compaction</a>
</li>
<li class="">
<a class="" href="../../user-guide/State-Management-and-Watermarks/">State Management and Watermarks</a>
</li>
<li class="">
<a class="" href="../../user-guide/Working-with-the-ForkOperator/">Fork Operator</a>
</li>
<li class="">
<a class="" href="../../user-guide/Configuration-Properties-Glossary/">Configuration Glossary</a>
</li>
<li class="">
<a class="" href="../../user-guide/Source-schema-and-Converters/">Source schema and Converters</a>
</li>
<li class="">
<a class="" href="../../user-guide/Partitioned-Writers/">Partitioned Writers</a>
</li>
<li class="">
<a class="" href="../../user-guide/Monitoring/">Monitoring</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-template/">Template</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-Schedulers/">Schedulers</a>
</li>
<li class="">
<a class="" href="../../user-guide/Job-Execution-History-Store/">Job Execution History Store</a>
</li>
<li class="">
<a class="" href="../../user-guide/Building-Gobblin/">Building Gobblin</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-genericLoad/">Generic Configuration Loading</a>
</li>
<li class="">
<a class="" href="../../user-guide/Hive-Registration/">Hive Registration</a>
</li>
<li class="">
<a class="" href="../../user-guide/Config-Management/">Config Management</a>
</li>
<li class="">
<a class="" href="../../user-guide/Docker-Integration/">Docker Integration</a>
</li>
<li class="">
<a class="" href="../../user-guide/Troubleshooting/">Troubleshooting</a>
</li>
<li class="">
<a class="" href="../../user-guide/FAQs/">FAQs</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Sources</span>
<ul class="subnav">
<li class="">
<a class="" href="../AvroFileSource/">Avro files</a>
</li>
<li class="">
<a class="" href="../CopySource/">File copy</a>
</li>
<li class="">
<a class="" href="../QueryBasedSource/">Query based</a>
</li>
<li class="">
<a class="" href="../RestApiSource/">Rest Api</a>
</li>
<li class="">
<a class="" href="../GoogleAnalyticsSource/">Google Analytics</a>
</li>
<li class="">
<a class="" href="../GoogleDriveSource/">Google Drive</a>
</li>
<li class=" current">
<a class="current" href="./">Google Webmaster</a>
<ul class="subnav">
<li class="toctree-l3"><a href="#table-of-contents">Table of Contents</a></li>
<li class="toctree-l3"><a href="#introduction">Introduction</a></li>
<li class="toctree-l3"><a href="#implementation">Implementation</a></li>
<ul>
<li><a class="toctree-l4" href="#summary">Summary</a></li>
<li><a class="toctree-l4" href="#entities">Entities</a></li>
<li><a class="toctree-l4" href="#work-flow">Work Flow</a></li>
</ul>
<li class="toctree-l3"><a href="#configuration">Configuration</a></li>
</ul>
</li>
<li class="">
<a class="" href="../HadoopTextInputSource/">Hadoop Text Input</a>
</li>
<li class="">
<a class="" href="../HelloWorldSource/">Hello World</a>
</li>
<li class="">
<a class="" href="../HiveAvroToOrcSource/">Hive Avro-to-ORC</a>
</li>
<li class="">
<a class="" href="../HivePurgerSource/">Hive compliance purging</a>
</li>
<li class="">
<a class="" href="../SimpleJsonSource/">JSON</a>
</li>
<li class="">
<a class="" href="../KafkaSource/">Kafka</a>
</li>
<li class="">
<a class="" href="../MySQLSource/">MySQL</a>
</li>
<li class="">
<a class="" href="../OracleSource/">Oracle</a>
</li>
<li class="">
<a class="" href="../SalesforceSource/">Salesforce</a>
</li>
<li class="">
<a class="" href="../SftpSource/">SFTP</a>
</li>
<li class="">
<a class="" href="../SqlServerSource/">SQL Server</a>
</li>
<li class="">
<a class="" href="../TeradataSource/">Teradata</a>
</li>
<li class="">
<a class="" href="../WikipediaSource/">Wikipedia</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Sinks (Writers)</span>
<ul class="subnav">
<li class="">
<a class="" href="../../sinks/AvroHdfsDataWriter/">Avro HDFS</a>
</li>
<li class="">
<a class="" href="../../sinks/ParquetHdfsDataWriter/">Parquet HDFS</a>
</li>
<li class="">
<a class="" href="../../sinks/SimpleBytesWriter/">HDFS Byte array</a>
</li>
<li class="">
<a class="" href="../../sinks/ConsoleWriter/">Console</a>
</li>
<li class="">
<a class="" href="../../sinks/CouchbaseWriter/">Couchbase</a>
</li>
<li class="">
<a class="" href="../../sinks/Http/">HTTP</a>
</li>
<li class="">
<a class="" href="../../sinks/Gobblin-JDBC-Writer/">JDBC</a>
</li>
<li class="">
<a class="" href="../../sinks/Kafka/">Kafka</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Gobblin Adaptors</span>
<ul class="subnav">
<li class="">
<a class="" href="../../adaptors/Gobblin-Distcp/">Gobblin Distcp</a>
</li>
<li class="">
<a class="" href="../../adaptors/Hive-Avro-To-ORC-Converter/">Hive Avro-To-Orc Converter</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Case Studies</span>
<ul class="subnav">
<li class="">
<a class="" href="../../case-studies/Kafka-HDFS-Ingestion/">Kafka-HDFS Ingestion</a>
</li>
<li class="">
<a class="" href="../../case-studies/Publishing-Data-to-S3/">Publishing Data to S3</a>
</li>
<li class="">
<a class="" href="../../case-studies/Writing-ORC-Data/">Writing ORC Data</a>
</li>
<li class="">
<a class="" href="../../case-studies/Hive-Distcp/">Hive Distcp</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Gobblin Data Management</span>
<ul class="subnav">
<li class="">
<a class="" href="../../data-management/Gobblin-Retention/">Retention</a>
</li>
<li class="">
<a class="" href="../../data-management/DistcpNgEvents/">Distcp-NG events</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Gobblin Metrics</span>
<ul class="subnav">
<li class="">
<a class="" href="../../metrics/Gobblin-Metrics/">Quick Start</a>
</li>
<li class="">
<a class="" href="../../metrics/Existing-Reporters/">Existing Reporters</a>
</li>
<li class="">
<a class="" href="../../metrics/Metrics-for-Gobblin-ETL/">Metrics for Gobblin ETL</a>
</li>
<li class="">
<a class="" href="../../metrics/Gobblin-Metrics-Architecture/">Gobblin Metrics Architecture</a>
</li>
<li class="">
<a class="" href="../../metrics/Implementing-New-Reporters/">Implementing New Reporters</a>
</li>
<li class="">
<a class="" href="../../metrics/Gobblin-Metrics-Performance/">Gobblin Metrics Performance</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Developer Guide</span>
<ul class="subnav">
<li class="">
<a class="" href="../../developer-guide/Customization-for-New-Source/">Customization for New Source</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Customization-for-Converter-and-Operator/">Customization for Converter and Operator</a>
</li>
<li class="">
<a class="" href="../../developer-guide/CodingStyle/">Code Style Guide</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Gobblin-Compliance-Design/">Gobblin Compliance Design</a>
</li>
<li class="">
<a class="" href="../../developer-guide/IDE-setup/">IDE setup</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Monitoring-Design/">Monitoring Design</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Documentation-Architecture/">Documentation Architecture</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Contributing/">Contributing</a>
</li>
<li class="">
<a class="" href="../../developer-guide/GobblinModules/">Gobblin Modules</a>
</li>
<li class="">
<a class="" href="../../developer-guide/HighLevelConsumer/">High Level Consumer</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Project</span>
<ul class="subnav">
<li class="">
<a class="" href="../../project/Feature-List/">Feature List</a>
</li>
<li class="">
<a class="" href="/people">Contributors and Team</a>
</li>
<li class="">
<a class="" href="../../project/Talks-and-Tech-Blogs/">Talks and Tech Blog Posts</a>
</li>
<li class="">
<a class="" href="../../project/Posts/">Posts</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Miscellaneous</span>
<ul class="subnav">
<li class="">
<a class="" href="../../miscellaneous/Camus-to-Gobblin-Migration/">Camus to Gobblin Migration</a>
</li>
<li class="">
<a class="" href="../../miscellaneous/Exactly-Once-Support/">Exactly Once Support</a>
</li>
</ul>
</li>
</ul>
</div>
&nbsp;
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../..">Apache Gobblin</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="../..">Docs</a> &raquo;</li>
<li>Sources &raquo;</li>
<li>Google Webmaster</li>
<li class="wy-breadcrumbs-aside">
<a href="https://github.com/apache/incubator-gobblin/edit/master/docs/sources/GoogleWebmaster.md" rel="nofollow"> Edit on Gobblin</a>
</li>
</ul>
<hr/>
</div>
<div role="main">
<div class="section">
<h1 id="table-of-contents">Table of Contents</h1>
<div class="toc">
<ul>
<li><a href="#table-of-contents">Table of Contents</a></li>
<li><a href="#introduction">Introduction</a></li>
<li><a href="#implementation">Implementation</a><ul>
<li><a href="#summary">Summary</a></li>
<li><a href="#entities">Entities</a></li>
<li><a href="#work-flow">Work Flow</a></li>
</ul>
</li>
<li><a href="#configuration">Configuration</a></li>
</ul>
</div>
<h1 id="introduction">Introduction</h1>
<p>The Google Search Console data ingestion project downloads query and analytics data from Google Search Console so that you can run search analytics on your verified sites. Available analytics metrics are clicks, impressions, CTR, and position. Supported dimensions are dates, pages, countries, and queries. </p>
<p>Details about this Google service and API can be found at <a href="https://developers.google.com/webmaster-tools/">https://developers.google.com/webmaster-tools/</a>. This service can be run on a daily or weekly basis to download data at a daily granularity. </p>
<p>Other useful links:</p>
<ul>
<li>
<p>API Java documentation: <a href="https://developers.google.com/resources/api-libraries/documentation/webmasters/v3/java/latest/">https://developers.google.com/resources/api-libraries/documentation/webmasters/v3/java/latest/</a></p>
</li>
<li>
<p>Google API Manager: <a href="https://console.developers.google.com/apis/dashboard">https://console.developers.google.com/apis/dashboard</a></p>
</li>
</ul>
<h1 id="implementation">Implementation</h1>
<h2 id="summary">Summary</h2>
<p>This connector implements the source, the extractor, and iterators for the extractor, where each iterator is responsible for downloading the data for one market. Due to limitations of the Google API, the service has to coordinate a lot of asynchronous API calls to solve problems like:</p>
<ul>
<li>what is the total size of all unique pages</li>
<li>what is the full list of all unique pages</li>
<li>how to download queries and analytics data for each page</li>
<li>how to improve the overall performance</li>
</ul>
<p>There are two implementations of this page-level download, V1 and V2. V1 is the initial design and is very straightforward: after getting the full list of unique URL pages, we send one request per page for its queries and analytics data, with a page filter requiring an exact match on that page. However, when the number of pages is large (for example, above 100,000), the actual API request speed (less than 4 pages/second) means processing all pages can easily take more than 10 hours. Hence the faster V2, which sends requests based on tries and greatly reduces the number of requests that need to be sent. The idea is to group pages that share a common prefix and send just one request for the whole group, using a page filter that contains the common prefix. To achieve this, we first save all pages into a URL trie, then iterate through the trie with a trie iterator that returns groups of pages up to a configured group size. This implementation can easily improve performance by over 40 times. The graph below visually explains how the performance gain is achieved.
<img alt="Conversion Ratio by Group Size" src="../../img/Trie-Conversion-Ratio.png" /></p>
<p>In short, a large group size converts a large percentage of pages into groups, each of which results in a single API call.</p>
<p>The user can still choose which algorithm to use when starting the service by configuring the key <code>source.google_webmasters.request.tuning.get_queries.apply_trie</code>. The sketch below illustrates the grouping idea.</p>
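<p>To make the grouping concrete, here is a minimal, self-contained Java sketch of the idea. It is not the actual Gobblin code: the names are hypothetical, and it groups top-down rather than with the post-order iterator Gobblin uses. URLs are stored in a character-level trie; any subtree whose page count fits within the group size becomes a single "contains"-filter request on the common prefix, while lone pages fall back to "equals"-filter requests.</p>
<pre><code class="java">import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

// Hypothetical sketch of trie-based request grouping (not Gobblin's UrlTrie).
public class UrlTrieGroupingSketch {

  static class Node {
    TreeMap&lt;Character, Node&gt; children = new TreeMap&lt;&gt;();
    boolean isPage = false; // the prefix ending here is a real URL page
    int size = 0;           // number of URL pages in this subtree
  }

  static void insert(Node root, String url) {
    Node cur = root;
    cur.size++;
    for (char c : url.toCharArray()) {
      cur = cur.children.computeIfAbsent(c, k -&gt; new Node());
      cur.size++;
    }
    cur.isPage = true;
  }

  // Stop descending once a subtree fits in one group; emit one request for it.
  static void group(Node node, String prefix, int groupSize, List&lt;String&gt; out) {
    if (node.size &lt;= groupSize) {
      String filter = node.children.isEmpty() ? "EQUALS" : "CONTAINS";
      out.add(filter + " \"" + prefix + "\" covering " + node.size + " page(s)");
      return;
    }
    if (node.isPage) {
      // Subtree too large to group: this page still needs its own request.
      out.add("EQUALS \"" + prefix + "\" covering 1 page(s)");
    }
    for (Map.Entry&lt;Character, Node&gt; e : node.children.entrySet()) {
      group(e.getValue(), prefix + e.getKey(), groupSize, out);
    }
  }

  public static void main(String[] args) {
    Node root = new Node();
    for (String url : new String[] {
        "https://example.com/a/1", "https://example.com/a/2",
        "https://example.com/a/3", "https://example.com/b"}) {
      insert(root, url);
    }
    List&lt;String&gt; requests = new ArrayList&lt;&gt;();
    group(root, "", 3, requests);
    requests.forEach(System.out::println); // 2 API requests instead of 4
  }
}
</code></pre>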
<h2 id="entities">Entities</h2>
<p>Here is a table briefly explaining the responsibility of each class:</p>
<table>
<thead>
<tr>
<th>Name</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>GoogleWebmasterClient</td>
<td>GoogleWebmasterClient provides basic access to Google Search Console via the Google Webmaster API.</td>
</tr>
<tr>
<td>GoogleWebmasterDataFetcher</td>
<td>GoogleWebmasterDataFetcher implements the logic to get all pages and to download analytics data (e.g. queries, clicks, impressions, CTR, position) for a given set of constraints like dates, pages, and countries.</td>
</tr>
<tr>
<td>GoogleWebmasterFilter</td>
<td>A utility class providing enums and utility functions relevant to Google Webmaster filters.</td>
</tr>
<tr>
<td>GoogleWebMasterSource</td>
<td>This is an abstract class that extends Gobblin's standard QueryBasedSource. It provides basic checks and configuration processing for google-webmaster-pull configuration files.</td>
</tr>
<tr>
<td>GoogleWebMasterSourceDaily</td>
<td>This implementation gives you the ability to do a daily extract from Google Search Console.</td>
</tr>
<tr>
<td>GoogleWebmasterExtractor</td>
<td>An implementation of Gobblin's extractor. <p><br>It relies on a set of GoogleWebmasterExtractorIterators, one generated per market, to extract the data.</p></td>
</tr>
<tr>
<td>GoogleWebmasterExtractorIterator</td>
<td>The core piece used by GoogleWebmasterExtractor to iterate through the downloaded dataset.</td>
</tr>
<tr>
<td>GoogleWebmasterDayPartitioner</td>
<td>The output partitioner that partitions output by the date of the fetched data set.</td>
</tr>
<tr>
<td>ProducerJob</td>
<td>This is a partitionable request unit used by GoogleWebmasterExtractorIterator for sending detailed API requests to Google Search Console. It includes the filter dimensions like date range, page URL, and page URL filter type (e.g. contains, not-contains, equals). These jobs are generated in a producer thread while requesting queries and analytics data for pages. They are placed into a ConcurrentLinkedDeque and processed by a pool of worker threads. The downloaded data is put into a LinkedBlockingDeque shared with the GoogleWebmasterExtractorIterator, which then passes the data to GoogleWebmasterExtractor and on to the rest of the Gobblin framework. <p><br>It is an abstract class and currently has two implementations, SimpleProducerJob and TrieBasedProducerJob.</p> It provides the default logic for partitioning a ProducerJob: when the producer job spans a date range, the job is divided evenly into two smaller producer jobs that together cover the original range; otherwise, the job is not partitionable (see the sketch after this table).</td>
</tr>
<tr>
<td>SimpleProducerJob</td>
<td>SimpleProducerJob is a basic implementation of ProducerJob, utilizing the default partition logic.</td>
</tr>
<tr>
<td>TrieBasedProducerJob</td>
<td>TrieBasedProducerJob is a trie-based implementation of ProducerJob. <p><br>For the partition logic, it first tries to partition the pages by splitting the trie into smaller ones based on a new group size, which is half of the previous value. When it is not partitionable at the page level, the partition logic falls back to the default one provided by the base class.</p></td>
</tr>
<tr>
<td>UrlTrie</td>
<td>The trie that holds all URL pages. All fetched pages are saved into this trie in order to use TrieBasedProducerJobs.</td>
</tr>
<tr>
<td>UrlTrieNode</td>
<td>The trie node in the URL trie</td>
</tr>
<tr>
<td>UrlTriePostOrderIterator</td>
<td>This is a post-order iterator that traverses the nodes of the URL trie with a stopping rule: it will not go deeper into nodes whose size (defined as the number of descendant URLs, plus one if the node itself is a URL page) is less than or equal to the stopping size. In other words, nodes with size less than or equal to the stopping size are treated as leaf nodes.</td>
</tr>
<tr>
<td>UrlTriePrefixGrouper</td>
<td>UrlTriePrefixGrouper packages the URL pages/nodes into groups of the given group size while traversing the UrlTrie with a TrieIterator. If the current node is not a "leaf" node as defined by the TrieIterator, a "fake" group of size 1 is created containing only this node. <p><br>A group of URL pages shares the longest common prefix and is sent in one API request using the "contains" page filter. A fake group containing only one page uses the "equals" page filter.</p></td>
</tr>
</tbody>
</table>
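<p>The default date-range partition rule described for ProducerJob above can be sketched as follows. This is an illustrative, stand-alone Java example with hypothetical names, not the actual ProducerJob code: a job spanning more than one day splits evenly into two jobs covering the original range; a single-day job is atomic.</p>
<pre><code class="java">import java.time.LocalDate;
import java.time.temporal.ChronoUnit;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

// Hypothetical sketch of ProducerJob's default date-range partitioning.
class DateRangeJob {
  final String page;
  final LocalDate start;
  final LocalDate end; // inclusive

  DateRangeJob(String page, LocalDate start, LocalDate end) {
    this.page = page;
    this.start = start;
    this.end = end;
  }

  List&lt;DateRangeJob&gt; partition() {
    if (start.equals(end)) {
      return Collections.emptyList(); // single day: not partitionable
    }
    long days = ChronoUnit.DAYS.between(start, end); // &gt;= 1
    LocalDate mid = start.plusDays(days / 2);
    // Two smaller jobs that together cover the original date range.
    return Arrays.asList(
        new DateRangeJob(page, start, mid),
        new DateRangeJob(page, mid.plusDays(1), end));
  }

  @Override
  public String toString() {
    return page + " [" + start + " .. " + end + "]";
  }

  public static void main(String[] args) {
    DateRangeJob job = new DateRangeJob("https://example.com/",
        LocalDate.of(2017, 1, 1), LocalDate.of(2017, 1, 4));
    job.partition().forEach(System.out::println);
    // https://example.com/ [2017-01-01 .. 2017-01-02]
    // https://example.com/ [2017-01-03 .. 2017-01-04]
  }
}
</code></pre>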
<h2 id="work-flow">Work Flow</h2>
<p>Starting with GoogleWebMasterSource: it consumes the job or pull configuration file, performs some basic checks, and decides the date range to work on based on the type of GoogleWebMasterSource specified. It then passes the date range and the list of markets to GoogleWebmasterExtractor. The GoogleWebmasterExtractor creates a GoogleWebmasterExtractorIterator for each market and starts the downloading process, which is the same for every market. The iterator first figures out the number of unique URL pages by utilizing GoogleWebmasterDataFetcher and GoogleWebmasterClient; it then tries to get all unique pages, warning if not all pages can be found. Based on the algorithm the user chooses, GoogleWebmasterExtractorIterator downloads queries and analytics data for each page and passes the data back to GoogleWebmasterExtractor asynchronously, as sketched below.</p>
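<p>A minimal Java sketch of that asynchronous handoff, assuming a fixed worker pool and a bounded blocking deque; the class name and the stand-in "API call" are hypothetical, not the actual iterator code. Workers push downloaded rows into a shared LinkedBlockingDeque, which the iterator side drains as rows arrive.</p>
<pre><code class="java">import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;

// Hypothetical sketch of the producer/consumer handoff between the worker
// pool and the extractor iterator.
public class AsyncHandoffSketch {
  public static void main(String[] args) throws InterruptedException {
    LinkedBlockingDeque&lt;String&gt; rows = new LinkedBlockingDeque&lt;&gt;(1000);
    ExecutorService workers = Executors.newFixedThreadPool(4);

    for (int i = 0; i &lt; 8; i++) {
      final int jobId = i;
      workers.submit(() -&gt; {
        try {
          // Stand-in for one batched API call to Google Search Console.
          rows.put("row from job " + jobId);
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
        }
      });
    }
    workers.shutdown();

    // Iterator side: hand each downloaded row to the extractor as it arrives.
    int received = 0;
    while (received &lt; 8) {
      String row = rows.poll(1, TimeUnit.SECONDS);
      if (row != null) {
        System.out.println(row);
        received++;
      }
    }
    workers.awaitTermination(10, TimeUnit.SECONDS);
  }
}
</code></pre>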
<h1 id="configuration">Configuration</h1>
<table>
<thead>
<tr>
<th>Configuration Key</th>
<th>Default Value</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>source.google_webmasters.property_url</td>
<td>Must Provide</td>
<td>The site property URL whose Google search analytics data you want to download.</td>
</tr>
<tr>
<td>source.google_webmasters.request.filters</td>
<td>Optional</td>
<td>The filters that will be passed to all your API requests. <p><br>The filter format is [GoogleWebmasterFilter.Dimension].[DimensionValue].</p>Currently, the filter operator is "EQUALS" and only the COUNTRY dimension is supported. This feature will be extended as more use cases arise.</td>
</tr>
<tr>
<td>source.google_webmasters.request.dimensions</td>
<td>Must Provide</td>
<td>Allowed dimensions are DATE, PAGE, COUNTRY, QUERY, DEVICE, SEARCH_TYPE, SEARCH_APPEARANCE</td>
</tr>
<tr>
<td>source.google_webmasters.request.metrics</td>
<td>Must Provide</td>
<td>Allowed metrics are CLICKS, IMPRESSIONS, CTR, POSITION</td>
</tr>
<tr>
<td>source.google_webmasters.request.page_limit</td>
<td>5000</td>
<td>The response row limit when you ask for pages. Set it to 5000 when you want to get all pages. Defaults to 5000, which is the maximum allowed.</td>
</tr>
<tr>
<td>source.google_webmasters.request.query_limit</td>
<td>5000</td>
<td>The response row limit when you ask for queries. Defaults to 5000, which is the maximum allowed.</td>
</tr>
<tr>
<td>source.google_webmasters.request.hot_start</td>
<td>Optional</td>
<td>Hot start this service with pre-set pages. Once this is set, the service ignores source.google_webmasters.request.page_limit and does not fetch all pages, using the pre-set pages instead. <p><br>This is useful for debugging or for resuming failed work.</p></td>
</tr>
<tr>
<td>source.google_webmasters.request.tuning.get_queries.time_out</td>
<td>120</td>
<td>The timeout, in minutes, for each round.</td>
</tr>
<tr>
<td>source.google_webmasters.request.tuning.get_queries.max_retries</td>
<td>30</td>
<td>The maximum number of retry rounds allowed when API calls fail because of exceeded quota.</td>
</tr>
<tr>
<td>source.google_webmasters.request.tuning.get_queries.cool_down_time</td>
<td>250</td>
<td>The cool-down time, in milliseconds, between rounds.</td>
</tr>
<tr>
<td>source.google_webmasters.request.tuning.get_queries.batches_per_second</td>
<td>2.25</td>
<td>The rate of API requests, in batches per second.</td>
</tr>
<tr>
<td>source.google_webmasters.request.tuning.get_queries.batch_size</td>
<td>2</td>
<td>The size of a batch. Batching API calls together reduces the number of HTTP connections. <p><br>Note: a set of n requests batched together counts toward your usage limit as n requests, not as one request. The batch request is split back into its component requests before processing.</p>Read more at <a href="https://developers.google.com/webmaster-tools/v3/how-tos/batch">https://developers.google.com/webmaster-tools/v3/how-tos/batch</a>.</td>
</tr>
<tr>
<td>source.google_webmasters.request.tuning.get_queries.trie_group_size</td>
<td>500</td>
<td>The group size for the URL trie.</td>
</tr>
<tr>
<td>source.google_webmasters.request.tuning.get_queries.apply_trie</td>
<td>false</td>
<td>Set to true to use the trie-based algorithm; otherwise, leave it false.<p><br>If set to true, you also need to set page_limit to 5000, indicating that you want all pages, because the trie-based algorithm won't give the expected results if you only fetch a subset of pages.</p></td>
</tr>
<tr>
<td>source.google_webmasters.request.tuning.get_pages.requests_per_second</td>
<td>5.0</td>
<td>The rate of API requests, in requests per second, while getting all pages.</td>
</tr>
<tr>
<td>source.google_webmasters.request.tuning.get_pages.max_retries</td>
<td>120</td>
<td>The maximum number of retries while getting all pages. Consider the following factors when setting this number: <ol><li>the shared prefix path may be very long;</li><li>Quota Exceeded exceptions.</li></ol></td>
</tr>
<tr>
<td>source.google_webmasters.request.tuning.get_pages.time_out</td>
<td>2</td>
<td>The timeout, in minutes, while getting all pages.</td>
</tr>
</tbody>
</table>
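<p>For reference, here is a hypothetical job configuration snippet combining these keys in Gobblin's properties format. The values are illustrative only; in particular, the property URL, the COUNTRY filter value, and the comma-separated list syntax are assumptions, so consult the table above for the actual semantics and defaults.</p>
<pre><code># Illustrative values only -- see the table above for meanings and defaults.
source.google_webmasters.property_url=https://www.example.com/
source.google_webmasters.request.dimensions=DATE,PAGE,COUNTRY,QUERY
source.google_webmasters.request.metrics=CLICKS,IMPRESSIONS,CTR,POSITION
source.google_webmasters.request.filters=COUNTRY.USA

# Keep page_limit at 5000 (required when apply_trie=true).
source.google_webmasters.request.page_limit=5000
source.google_webmasters.request.query_limit=5000

# V2 trie-based algorithm and request pacing.
source.google_webmasters.request.tuning.get_queries.apply_trie=true
source.google_webmasters.request.tuning.get_queries.trie_group_size=500
source.google_webmasters.request.tuning.get_queries.batch_size=2
source.google_webmasters.request.tuning.get_queries.batches_per_second=2.25
source.google_webmasters.request.tuning.get_pages.requests_per_second=5.0
</code></pre>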
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="../HadoopTextInputSource/" class="btn btn-neutral float-right" title="Hadoop Text Input">Next <span class="icon icon-circle-arrow-right"></span></a>
<a href="../GoogleDriveSource/" class="btn btn-neutral" title="Google Drive"><span class="icon icon-circle-arrow-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<!-- Copyright etc -->
</div>
Built with <a href="http://www.mkdocs.org" rel="nofollow">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme" rel="nofollow">theme</a> provided by <a href="https://readthedocs.org" rel="nofollow">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<div class="rst-versions" role="note" style="cursor: pointer">
<span class="rst-current-version" data-toggle="rst-current-version">
<span><a href="../GoogleDriveSource/" style="color: #fcfcfc;">&laquo; Previous</a></span>
<span style="margin-left: 15px"><a href="../HadoopTextInputSource/" style="color: #fcfcfc">Next &raquo;</a></span>
</span>
</div>
<script>var base_url = '../..';</script>
<script src="../../js/theme.js" defer></script>
<script src="../../js/extra.js" defer></script>
<script src="../../search/main.js" defer></script>
</body>
</html>