Skip to content

Commit

Permalink
fix arxiv html download redirect (#11797)
Browse files Browse the repository at this point in the history
* fix arxiv html download redirect

Fixes #4913

* Fix catch indents

* Add redirect test case

* Use Optionals

* Class-global Unirest config

* Manual handling of redirect

* Fix conditions

* Simplify code

* Use Wiremock instead of real endpoint

* Improve test names

* Fix condition

* fix condition

* wiremock with head request as well

* use path instead of file

* try with static unirest config

* max retries

* fix head

* no body for head

---------

Co-authored-by: Subhramit Basu Bhowmick <subhramit.bb@live.in>
Co-authored-by: Oliver Kopp <kopp.dev@gmail.com>
  • Loading branch information
3 people authored Sep 20, 2024
1 parent 9eed666 commit 5ac788c
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 51 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv
- We fixed an exception when searching for unlinked files. [#11731](https://github.com/JabRef/jabref/issues/11731)
- We fixed an issue where two contradicting notifications were shown when cutting an entry in the main table. [#11724](https://github.com/JabRef/jabref/pull/11724)
- We fixed an issue where unescaped braces in the arXiv fetcher were not treated. [#11704](https://github.com/JabRef/jabref/issues/11704)
- We fixed an issue where an HTML page was downloaded instead of the fulltext PDF when importing arXiv entries. [#4913](https://github.com/JabRef/jabref/issues/4913)
- We fixed an issue where the keywords and crossref fields were not properly focused. [#11177](https://github.com/JabRef/jabref/issues/11177)

### Removed
Expand Down
3 changes: 3 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,9 @@ dependencies {
testImplementation "org.testfx:testfx-junit5:4.0.16-alpha"
testImplementation "org.hamcrest:hamcrest-library:3.0"

// recommended by https://github.com/wiremock/wiremock/issues/2149#issuecomment-1835775954
testImplementation 'org.wiremock:wiremock-standalone:3.3.1'

checkstyle 'com.puppycrawl.tools:checkstyle:10.18.1'
// xjc needs the runtime as well for the ant task, otherwise it fails
xjc group: 'org.glassfish.jaxb', name: 'jaxb-xjc', version: '3.0.2'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -280,14 +280,11 @@ private Optional<ExternalFileType> inferFileType(URLDownload urlDownload) {
}

private Optional<ExternalFileType> inferFileTypeFromMimeType(URLDownload urlDownload) {
String mimeType = urlDownload.getMimeType();

if (mimeType != null) {
LOGGER.debug("MIME Type suggested: {}", mimeType);
return ExternalFileTypes.getExternalFileTypeByMimeType(mimeType, externalApplicationsPreferences);
} else {
return Optional.empty();
}
return urlDownload.getMimeType()
.flatMap(mimeType -> {
LOGGER.debug("MIME Type suggested: {}", mimeType);
return ExternalFileTypes.getExternalFileTypeByMimeType(mimeType, externalApplicationsPreferences);
});
}

private Optional<ExternalFileType> inferFileTypeFromURL(String url) {
Expand Down
70 changes: 42 additions & 28 deletions src/main/java/org/jabref/logic/net/URLDownload.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
Expand All @@ -39,7 +40,9 @@
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.FetcherServerException;
import org.jabref.logic.util.io.FileUtil;
import org.jabref.model.strings.StringUtil;

import kong.unirest.core.HttpResponse;
import kong.unirest.core.Unirest;
import kong.unirest.core.UnirestException;
import org.slf4j.Logger;
Expand All @@ -64,12 +67,20 @@ public class URLDownload {
public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36";
private static final Logger LOGGER = LoggerFactory.getLogger(URLDownload.class);
private static final Duration DEFAULT_CONNECT_TIMEOUT = Duration.ofSeconds(30);
private static final int MAX_RETRIES = 3;

private final URL source;
private final Map<String, String> parameters = new HashMap<>();
private String postData = "";
private Duration connectTimeout = DEFAULT_CONNECT_TIMEOUT;

static {
Unirest.config()
.followRedirects(true)
.enableCookieManagement(true)
.setDefaultHeader("User-Agent", USER_AGENT);
}

/**
* @param source the URL to download from
* @throws MalformedURLException if no protocol is specified in the source, or an unknown protocol is found
Expand Down Expand Up @@ -103,15 +114,28 @@ public URL getSource() {
return source;
}

public String getMimeType() {
Unirest.config().setDefaultHeader("User-Agent", "Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6");

public Optional<String> getMimeType() {
String contentType;

int retries = 0;
// Try to use HEAD request to avoid downloading the whole file
try {
contentType = Unirest.head(source.toString()).asString().getHeaders().get("Content-Type").getFirst();
String urlToCheck = source.toString();
String locationHeader;
do {
retries++;
HttpResponse<String> response = Unirest.head(urlToCheck).asString();
// Check for redirects: e.g., arXiv would otherwise report the content type "html" for the original URL
// We need to follow them "manually", because ".followRedirects(true)" only works for GET, not for HEAD
locationHeader = response.getHeaders().getFirst("location");
if (!StringUtil.isNullOrEmpty(locationHeader)) {
urlToCheck = locationHeader;
}
// while loop, because there could be multiple redirects
} while (!StringUtil.isNullOrEmpty(locationHeader) && retries <= MAX_RETRIES);
contentType = Unirest.head(urlToCheck).asString().getHeaders().getFirst("Content-Type");
if ((contentType != null) && !contentType.isEmpty()) {
return contentType;
return Optional.of(contentType);
}
} catch (Exception e) {
LOGGER.debug("Error getting MIME type of URL via HEAD request", e);
Expand All @@ -120,8 +144,8 @@ public String getMimeType() {
// Use GET request as alternative if no HEAD request is available
try {
contentType = Unirest.get(source.toString()).asString().getHeaders().get("Content-Type").getFirst();
if ((contentType != null) && !contentType.isEmpty()) {
return contentType;
if (!StringUtil.isNullOrEmpty(contentType)) {
return Optional.of(contentType);
}
} catch (Exception e) {
LOGGER.debug("Error getting MIME type of URL via GET request", e);
Expand All @@ -130,16 +154,15 @@ public String getMimeType() {
// Try to resolve local URIs
try {
URLConnection connection = new URL(source.toString()).openConnection();

contentType = connection.getContentType();
if ((contentType != null) && !contentType.isEmpty()) {
return contentType;
if (!StringUtil.isNullOrEmpty(contentType)) {
return Optional.of(contentType);
}
} catch (IOException e) {
LOGGER.debug("Error trying to get MIME type of local URI", e);
}

return "";
return Optional.empty();
}

/**
Expand All @@ -149,24 +172,13 @@ public String getMimeType() {
* @return true if the HEAD request to the source URL returns a 2xx status code, false otherwise
*/
public boolean canBeReached() throws UnirestException {
// new unirest version does not support apache http client any longer
Unirest.config().reset()
.followRedirects(true)
.enableCookieManagement(true)
.setDefaultHeader("User-Agent", USER_AGENT);

int statusCode = Unirest.head(source.toString()).asString().getStatus();
return (statusCode >= 200) && (statusCode < 300);
}

public boolean isMimeType(String type) {
String mime = getMimeType();

if (mime.isEmpty()) {
return false;
}

return mime.startsWith(type);
return getMimeType().map(mimeType -> mimeType.startsWith(type)).orElse(false);
}

public boolean isPdf() {
Expand Down Expand Up @@ -333,7 +345,7 @@ private static void copy(InputStream in, Writer out, Charset encoding) throws IO
/**
* Open a connection to this object's URL (with specified settings).
* <p>
* If accessing an HTTP URL, remeber to close the resulting connection after usage.
* If accessing an HTTP URL, remember to close the resulting connection after usage.
*
* @return an open connection
*/
Expand All @@ -356,12 +368,14 @@ public URLConnection openConnection() throws FetcherException {
}

if ((status == HttpURLConnection.HTTP_MOVED_TEMP)
|| (status == HttpURLConnection.HTTP_MOVED_PERM)
|| (status == HttpURLConnection.HTTP_SEE_OTHER)) {
|| (status == HttpURLConnection.HTTP_MOVED_PERM)
|| (status == HttpURLConnection.HTTP_SEE_OTHER)) {
// get redirect url from "location" header field
String newUrl = connection.getHeaderField("location");
// open the new connection again
try {
httpURLConnection.disconnect();
// multiple redirects are implemented by this recursion
connection = new URLDownload(newUrl).openConnection();
} catch (MalformedURLException e) {
throw new FetcherException("Could not open URL Download", e);
Expand All @@ -370,9 +384,9 @@ public URLConnection openConnection() throws FetcherException {
// in case of an error, propagate the error message
SimpleHttpResponse httpResponse = new SimpleHttpResponse(httpURLConnection);
LOGGER.info("{}", httpResponse);
if ((status >= 400) && (status < 500)) {
if (status < 500) {
throw new FetcherClientException(this.source, httpResponse);
} else if (status >= 500) {
} else {
throw new FetcherServerException(this.source, httpResponse);
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.jabref.gui.linkedfile;

import java.io.File;
import java.net.CookieHandler;
import java.net.CookieManager;
import java.net.CookiePolicy;
Expand All @@ -24,22 +23,29 @@
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.LinkedFile;

import com.github.tomakehurst.wiremock.WireMockServer;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

import static com.github.tomakehurst.wiremock.client.WireMock.aResponse;
import static com.github.tomakehurst.wiremock.client.WireMock.configureFor;
import static com.github.tomakehurst.wiremock.client.WireMock.get;
import static com.github.tomakehurst.wiremock.client.WireMock.head;
import static com.github.tomakehurst.wiremock.client.WireMock.stubFor;
import static com.github.tomakehurst.wiremock.client.WireMock.urlEqualTo;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

class DownloadLinkedFileActionTest {

// Required for keepsHtmlFileLink
@TempDir
Path tempFolder;
private Path tempFolder;

private BibEntry entry;

Expand All @@ -49,6 +55,8 @@ class DownloadLinkedFileActionTest {
private final FilePreferences filePreferences = mock(FilePreferences.class);
private final GuiPreferences preferences = mock(GuiPreferences.class);

private WireMockServer wireMockServer;

@BeforeEach
void setUp(@TempDir Path tempFolder) throws Exception {
entry = new BibEntry()
Expand All @@ -70,6 +78,15 @@ void setUp(@TempDir Path tempFolder) throws Exception {
cookieManager = (CookieManager) CookieHandler.getDefault();
}
cookieManager.setCookiePolicy(CookiePolicy.ACCEPT_ALL);

wireMockServer = new WireMockServer(2331);
wireMockServer.start();
configureFor("localhost", 2331);
}

@AfterEach
void tearDown() {
wireMockServer.stop();
}

@Test
Expand Down Expand Up @@ -122,10 +139,10 @@ void doesntReplaceSourceURL(boolean keepHtml) throws Exception {

linkedFile = entry.getFiles().getFirst();

File downloadedFile = new File(linkedFile.getLink());
Path downloadedFile = Path.of(linkedFile.getLink());

// Verify that re-downloading the file after the first download doesn't modify the entry
downloadedFile.delete();
Files.delete(downloadedFile);

DownloadLinkedFileAction downloadLinkedFileAction2 = new DownloadLinkedFileAction(
databaseContext,
Expand All @@ -144,10 +161,19 @@ void doesntReplaceSourceURL(boolean keepHtml) throws Exception {
}

@Test
void keepsHtmlEntry(@TempDir Path tempFolder) throws Exception {
String url = "https://blog.fefe.de/?ts=98e04151";

LinkedFile linkedFile = new LinkedFile(new URL(url), "");
void keepsHtmlFileLink(@TempDir Path tempFolder) throws Exception {
stubFor(get(urlEqualTo("/html"))
.willReturn(aResponse()
.withStatus(200)
.withHeader("Content-Type", "text/html; charset=utf-8")
.withBody("<html><body><h1>Hi</h1></body></html>")));

stubFor(head(urlEqualTo("/html"))
.willReturn(aResponse()
.withStatus(200)
.withHeader("Content-Type", "text/html; charset=utf-8")));

LinkedFile linkedFile = new LinkedFile(new URL("http://localhost:2331/html"), "");
when(databaseContext.getFirstExistingFileDir(any())).thenReturn(Optional.of(tempFolder));
when(filePreferences.getFileNamePattern()).thenReturn("[citationkey]");
when(filePreferences.getFileDirectoryPattern()).thenReturn("");
Expand All @@ -171,10 +197,19 @@ void keepsHtmlEntry(@TempDir Path tempFolder) throws Exception {
}

@Test
void removesHtmlEntry(@TempDir Path tempFolder) throws Exception {
String url = "https://blog.fefe.de/?ts=98e04151";

LinkedFile linkedFile = new LinkedFile(new URL(url), "");
void removesHtmlFileLink(@TempDir Path tempFolder) throws Exception {
stubFor(get(urlEqualTo("/html"))
.willReturn(aResponse()
.withStatus(200)
.withHeader("Content-Type", "text/html; charset=utf-8")
.withBody("<html><body><h1>Hi</h1></body></html>")));

stubFor(head(urlEqualTo("/html"))
.willReturn(aResponse()
.withStatus(200)
.withHeader("Content-Type", "text/html; charset=utf-8")));

LinkedFile linkedFile = new LinkedFile(new URL("http://localhost:2331/html"), "");
when(databaseContext.getFirstExistingFileDir(any())).thenReturn(Optional.of(tempFolder));
when(filePreferences.getFileNamePattern()).thenReturn("[citationkey]");
when(filePreferences.getFileDirectoryPattern()).thenReturn("");
Expand Down
Loading

0 comments on commit 5ac788c

Please sign in to comment.