Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TIKA-4247 HttpFetcher - add ability to send request headers #1737

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ public class HttpFetcher extends AbstractFetcher implements Initializable, Range
// Names of HTTP headers to capture in the metadata.
private Set<String> httpHeaders = new HashSet<>();

// Request headers added to every outgoing HTTP request made by this fetcher.
// Each entry is one raw header line of the form "Name: value"; it is split on
// the first ':' when the request is built.
// NOTE(review): this is a HashSet, so exact duplicate entries collapse and the
// iteration order is unspecified - if two entries use the same header name,
// which one ends up on the request is not deterministic.
private Set<String> httpRequestHeaders = new HashSet<>();

// The User-Agent sent when making the request. When left null/blank no
// User-Agent header is set by this class and httpclient applies its own default,
// e.g. "Apache-HttpClient/4.5.13 (Java/x.y.z)".
private String userAgent = null;
Expand All @@ -143,10 +146,37 @@ public InputStream fetch(String fetchKey, Metadata metadata) throws IOException,
.setMaxRedirects(maxRedirects)
.setRedirectsEnabled(true).build();
get.setConfig(requestConfig);
if (! StringUtils.isBlank(userAgent)) {
setHttpRequestHeaders(metadata, get);
return execute(get, metadata, httpClient, true);
}

private void setHttpRequestHeaders(Metadata metadata, HttpGet get) {
if (!StringUtils.isBlank(userAgent)) {
get.setHeader(USER_AGENT, userAgent);
}
return execute(get, metadata, httpClient, true);
// Add the headers from the Fetcher configuration.
if (httpRequestHeaders != null) {
for (String httpRequestHeader : httpRequestHeaders) {
parseHeaderAndPutOnRequest(get, httpRequestHeader);
}
}
// Additionally, headers can be specified per-fetch via the metadata.
String[] httpRequestHeaders = metadata.getValues("httpRequestHeaders");
nddipiazza marked this conversation as resolved.
Show resolved Hide resolved
if (httpRequestHeaders != null) {
for (String httpRequestHeader : httpRequestHeaders) {
parseHeaderAndPutOnRequest(get, httpRequestHeader);
}
}
}

/**
 * Parses one raw header line of the form {@code "Name: value"} and sets it on
 * the request.
 * <p>
 * The line is split at the first {@code ':'} only, so the value itself may
 * contain colons. Name and value are trimmed. Entries that are {@code null},
 * contain no {@code ':'}, or have an empty name are ignored, because they
 * cannot form a valid header.
 *
 * @param get               the request to put the header on
 * @param httpRequestHeader the raw header line, e.g. {@code "Accept: text/html"}
 */
private static void parseHeaderAndPutOnRequest(HttpGet get, String httpRequestHeader) {
    if (httpRequestHeader == null) {
        return;
    }
    String[] parts = httpRequestHeader.trim().split(":", 2);
    if (parts.length < 2) {
        // no ':' separator - not a header line
        return;
    }
    String key = parts[0].trim();
    if (key.isEmpty()) {
        // ": value" has no header name; sending it would produce a malformed header
        return;
    }
    String value = parts[1].trim();
    get.setHeader(key, value);
}

@Override
Expand Down Expand Up @@ -410,6 +440,17 @@ public void setHttpHeaders(List<String> headers) {
this.httpHeaders.addAll(headers);
}

/**
 * Sets the http request headers that are sent on every outgoing http request.
 * <p>
 * Any previously configured request headers are replaced. A {@code null} list
 * is treated as empty, and {@code null} entries are skipped.
 *
 * @param httpRequestHeaders raw header lines, each of the form
 *                           {@code "Name: value"}; may be {@code null}
 */
@Field
public void setHttpRequestHeaders(List<String> httpRequestHeaders) {
    this.httpRequestHeaders.clear();
    if (httpRequestHeaders == null) {
        return;
    }
    for (String httpRequestHeader : httpRequestHeaders) {
        if (httpRequestHeader != null) {
            this.httpRequestHeaders.add(httpRequestHeader);
        }
    }
}

/**
* This sets an overall timeout on the request. If a server is super slow
* or the file is very long, the other timeouts might not be triggered.
Expand Down Expand Up @@ -455,4 +496,7 @@ void setHttpClientFactory(HttpClientFactory httpClientFactory) {
this.httpClientFactory = httpClientFactory;
}

/**
 * Replaces the http client this fetcher executes its requests with.
 * Package-private: HttpFetcherTest uses it to inject a mocked client.
 *
 * @param httpClient the client to use for subsequent requests
 */
void setHttpClient(HttpClient httpClient) {
this.httpClient = httpClient;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,31 +37,39 @@
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.ProtocolVersion;
import org.apache.http.StatusLine;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.entity.StringEntity;
import org.apache.http.protocol.HttpContext;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.mockito.ArgumentCaptor;
import org.mockito.Mockito;

import org.apache.tika.TikaTest;
import org.apache.tika.client.HttpClientFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.pipes.fetcher.FetcherManager;

public class HttpFetcherTest extends TikaTest {

class HttpFetcherTest extends TikaTest {
private static final String TEST_URL = "wontbecalled";
private static final String CONTENT = "request content";

private HttpFetcher httpFetcher;

@BeforeEach
public void before() throws Exception {
httpFetcher = new HttpFetcher();
final HttpResponse mockResponse = buildMockResponse(HttpStatus.SC_OK,
IOUtils.toInputStream(CONTENT, Charset.defaultCharset()));

Expand Down Expand Up @@ -98,6 +106,44 @@ public void test4xxResponse() throws Exception {
assertEquals(TEST_URL, meta.get("http-connection:target-url"));
}

/**
 * Verifies that request headers supplied per-fetch through the metadata, as well
 * as the ones from the fetcher configuration, end up on the outgoing GET request.
 */
@Test
public void testHttpRequestHeaders() throws Exception {
    // Canned 200 status line for the mocked response.
    StatusLine okStatusLine = new StatusLine() {
        @Override
        public ProtocolVersion getProtocolVersion() {
            return new HttpGet("http://localhost").getProtocolVersion();
        }

        @Override
        public int getStatusCode() {
            return 200;
        }

        @Override
        public String getReasonPhrase() {
            return null;
        }
    };

    // Mocked client that records the request it is asked to execute.
    ArgumentCaptor<HttpGet> requestCaptor = ArgumentCaptor.forClass(HttpGet.class);
    CloseableHttpResponse mockedResponse = Mockito.mock(CloseableHttpResponse.class);
    HttpClient mockedClient = Mockito.mock(HttpClient.class);
    when(mockedResponse.getStatusLine()).thenReturn(okStatusLine);
    when(mockedResponse.getEntity()).thenReturn(new StringEntity("Hi"));
    when(mockedClient.execute(requestCaptor.capture(), any(HttpContext.class)))
            .thenReturn(mockedResponse);
    httpFetcher.setHttpClient(mockedClient);

    // Per-fetch headers travel in the metadata; surrounding whitespace must be trimmed.
    String[] perFetchHeaders = new String[] {" nick1 : val1", "nick2: val2"};
    Metadata fetchMetadata = new Metadata();
    fetchMetadata.set(Property.externalText("httpRequestHeaders"), perFetchHeaders);

    httpFetcher.fetch("http://localhost", fetchMetadata);

    HttpGet sentRequest = requestCaptor.getValue();
    String nick1Value = sentRequest.getHeaders("nick1")[0].getValue();
    String nick2Value = sentRequest.getHeaders("nick2")[0].getValue();
    Assertions.assertEquals("val1", nick1Value);
    Assertions.assertEquals("val2", nick2Value);

    // The header configured at the fetcher level must be present too -
    // see src/test/resources/tika-config-http.xml
    String configuredValue = sentRequest.getHeaders("headerNameFromFetcherConfig")[0].getValue();
    Assertions.assertEquals("headerValueFromFetcherConfig", configuredValue);
}

@Test
@Disabled("requires network connectivity")
public void testRedirect() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
<header>Expires</header>
<header>Content-Length</header>
</httpHeaders>
<httpRequestHeaders>
<header>headerNameFromFetcherConfig: headerValueFromFetcherConfig</header>
</httpRequestHeaders>
</fetcher>
</fetchers>
</properties>
</properties>