TIKA-4252: add http request headers at fetcher config level
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java
index 7053418..bf8e614 100644
--- a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java
+++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java
@@ -130,6 +130,9 @@
//httpHeaders to capture in the metadata
private Set<String> httpHeaders = new HashSet<>();
+ //httpRequestHeaders (name: value strings) to add to all outgoing http requests
+ private Set<String> httpRequestHeaders = new HashSet<>();
+
//When making the request, what User-Agent is sent.
//By default httpclient adds e.g. "Apache-HttpClient/4.5.13 (Java/x.y.z)"
private String userAgent = null;
@@ -151,20 +154,31 @@
if (!StringUtils.isBlank(userAgent)) {
get.setHeader(USER_AGENT, userAgent);
}
- // additional http request headers can be sent in here.
+ // Add the headers from the Fetcher configuration.
+ if (httpRequestHeaders != null) {
+ for (String httpRequestHeader : httpRequestHeaders) {
+ parseHeaderAndPutOnRequest(get, httpRequestHeader);
+ }
+ }
+ // Additionally, headers can be specified per-fetch via the metadata.
String[] httpRequestHeaders = metadata.getValues("httpRequestHeaders");
if (httpRequestHeaders != null) {
for (String httpRequestHeader : httpRequestHeaders) {
- String[] parts = httpRequestHeader.trim().split(":", 2);
- if (parts.length >= 2) {
- String key = parts[0].trim();
- String value = parts[1].trim();
- get.setHeader(key, value);
- }
+ parseHeaderAndPutOnRequest(get, httpRequestHeader);
}
}
}
+ private static void parseHeaderAndPutOnRequest(HttpGet get, String httpRequestHeader) { // puts one name:value header string on the request
+ String[] parts = httpRequestHeader
+ .trim().split(":", 2); // limit 2: split at the first colon only, so the value may contain colons
+ if (parts.length >= 2) { // a string with no colon yields a single part and is skipped without error
+ String key = parts[0].trim();
+ String value = parts[1].trim();
+ get.setHeader(key, value); // NOTE(review): setHeader presumably overwrites an earlier same-named header - confirm
+ }
+ }
+
@Override
public InputStream fetch(String fetchKey, long startRange, long endRange, Metadata metadata)
throws IOException {
@@ -427,6 +441,17 @@
}
/**
+ * Sets the http request headers to add to outgoing http requests, replacing any
+ * configured earlier. Entries without a colon are skipped when the request is built.
+ * @param httpRequestHeaders header strings, each of the form {@code name: value}
+ */
+ @Field
+ public void setHttpRequestHeaders(List<String> httpRequestHeaders) {
+ this.httpRequestHeaders.clear();
+ this.httpRequestHeaders.addAll(httpRequestHeaders);
+ }
+
+ /**
* This sets an overall timeout on the request. If a server is super slow
* or the file is very long, the other timeouts might not be triggered.
*
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java
index 64eae5c..b189e7b 100644
--- a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java
+++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java
@@ -140,6 +140,8 @@
HttpGet httpGet = httpGetArgumentCaptor.getValue();
Assertions.assertEquals("val1", httpGet.getHeaders("nick1")[0].getValue());
Assertions.assertEquals("val2", httpGet.getHeaders("nick2")[0].getValue());
+ // also make sure the headers from the fetcher config level are specified - see src/test/resources/tika-config-http.xml
+ Assertions.assertEquals("headerValueFromFetcherConfig", httpGet.getHeaders("headerNameFromFetcherConfig")[0].getValue());
}
@Test
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/resources/tika-config-http.xml b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/resources/tika-config-http.xml
index bd77de4..5def8f5 100644
--- a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/resources/tika-config-http.xml
+++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/resources/tika-config-http.xml
@@ -24,6 +24,9 @@
<header>Expires</header>
<header>Content-Length</header>
</httpHeaders>
+ <httpRequestHeaders>
+ <header>headerNameFromFetcherConfig: headerValueFromFetcherConfig</header>
+ </httpRequestHeaders>
</fetcher>
</fetchers>
-</properties>
\ No newline at end of file
+</properties>