From 5ac788c01a253e0c0d500081660f9807a9940019 Mon Sep 17 00:00:00 2001 From: Christoph Date: Fri, 20 Sep 2024 20:15:31 +0200 Subject: [PATCH] fix arxiv html download redirect (#11797) * fix arxiv html download redirect Fixes https://github.com/JabRef/jabref/issues/4913 * Fix catch indents * Add redirect test case * Use Optionals * Class-global Unirest config * Manual handling of redirect * Fix conditions * Simplyfiy code * Use Wiremock instead of real endpoint * Improve test names * Fix condition * fix condition * wiremock with head request as well * use path insteadd of file * try with static unirest config * max retries * fix head * no body for head --------- Co-authored-by: Subhramit Basu Bhowmick Co-authored-by: Oliver Kopp --- CHANGELOG.md | 1 + build.gradle | 3 + .../linkedfile/DownloadLinkedFileAction.java | 13 ++-- .../org/jabref/logic/net/URLDownload.java | 70 +++++++++++-------- .../DownloadLinkedFileActionTest.java | 61 ++++++++++++---- .../org/jabref/logic/net/URLDownloadTest.java | 38 +++++++++- 6 files changed, 135 insertions(+), 51 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 320ed820875..26b802b3240 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -73,6 +73,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv - We fixed an exception when searching for unlinked files. [#11731](https://github.com/JabRef/jabref/issues/11731) - We fixed an issue where two contradicting notifications were shown when cutting an entry in the main table. [#11724](https://github.com/JabRef/jabref/pull/11724) - We fixed an issue where unescaped braces in the arXiv fetcher were not treated. [#11704](https://github.com/JabRef/jabref/issues/11704) +- We fixed an issue where HTML instead of the fulltext pdf was downloaded when importing arXiv entries. [#4913](https://github.com/JabRef/jabref/issues/4913) - We fixed an issue where the keywords and crossref fields were not properly focused. [#11177](https://github.com/JabRef/jabref/issues/11177) ### Removed diff --git a/build.gradle b/build.gradle index 85a0c904de9..8a253d0db9d 100644 --- a/build.gradle +++ b/build.gradle @@ -368,6 +368,9 @@ dependencies { testImplementation "org.testfx:testfx-junit5:4.0.16-alpha" testImplementation "org.hamcrest:hamcrest-library:3.0" + // recommended by https://github.com/wiremock/wiremock/issues/2149#issuecomment-1835775954 + testImplementation 'org.wiremock:wiremock-standalone:3.3.1' + checkstyle 'com.puppycrawl.tools:checkstyle:10.18.1' // xjc needs the runtime as well for the ant task, otherwise it fails xjc group: 'org.glassfish.jaxb', name: 'jaxb-xjc', version: '3.0.2' diff --git a/src/main/java/org/jabref/gui/linkedfile/DownloadLinkedFileAction.java b/src/main/java/org/jabref/gui/linkedfile/DownloadLinkedFileAction.java index 57f35ae1ad5..c81bf261ca2 100644 --- a/src/main/java/org/jabref/gui/linkedfile/DownloadLinkedFileAction.java +++ b/src/main/java/org/jabref/gui/linkedfile/DownloadLinkedFileAction.java @@ -280,14 +280,11 @@ private Optional inferFileType(URLDownload urlDownload) { } private Optional inferFileTypeFromMimeType(URLDownload urlDownload) { - String mimeType = urlDownload.getMimeType(); - - if (mimeType != null) { - LOGGER.debug("MIME Type suggested: {}", mimeType); - return ExternalFileTypes.getExternalFileTypeByMimeType(mimeType, externalApplicationsPreferences); - } else { - return Optional.empty(); - } + return urlDownload.getMimeType() + .flatMap(mimeType -> { + LOGGER.debug("MIME Type suggested: {}", mimeType); + return ExternalFileTypes.getExternalFileTypeByMimeType(mimeType, externalApplicationsPreferences); + }); } private Optional inferFileTypeFromURL(String url) { diff --git a/src/main/java/org/jabref/logic/net/URLDownload.java b/src/main/java/org/jabref/logic/net/URLDownload.java index 636448716bd..5cfca305a32 100644 --- a/src/main/java/org/jabref/logic/net/URLDownload.java +++ b/src/main/java/org/jabref/logic/net/URLDownload.java @@ -29,6 +29,7 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Optional; import javax.net.ssl.HostnameVerifier; import javax.net.ssl.HttpsURLConnection; @@ -39,7 +40,9 @@ import org.jabref.logic.importer.FetcherException; import org.jabref.logic.importer.FetcherServerException; import org.jabref.logic.util.io.FileUtil; +import org.jabref.model.strings.StringUtil; +import kong.unirest.core.HttpResponse; import kong.unirest.core.Unirest; import kong.unirest.core.UnirestException; import org.slf4j.Logger; @@ -64,12 +67,20 @@ public class URLDownload { public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"; private static final Logger LOGGER = LoggerFactory.getLogger(URLDownload.class); private static final Duration DEFAULT_CONNECT_TIMEOUT = Duration.ofSeconds(30); + private static final int MAX_RETRIES = 3; private final URL source; private final Map parameters = new HashMap<>(); private String postData = ""; private Duration connectTimeout = DEFAULT_CONNECT_TIMEOUT; + static { + Unirest.config() + .followRedirects(true) + .enableCookieManagement(true) + .setDefaultHeader("User-Agent", USER_AGENT); + } + /** * @param source the URL to download from * @throws MalformedURLException if no protocol is specified in the source, or an unknown protocol is found @@ -103,15 +114,28 @@ public URL getSource() { return source; } - public String getMimeType() { - Unirest.config().setDefaultHeader("User-Agent", "Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6"); - + public Optional getMimeType() { String contentType; + + int retries = 0; // Try to use HEAD request to avoid downloading the whole file try { - contentType = Unirest.head(source.toString()).asString().getHeaders().get("Content-Type").getFirst(); + String urlToCheck = source.toString(); + String locationHeader; + do { + retries++; + HttpResponse response = Unirest.head(urlToCheck).asString(); + // Check if we have redirects, e.g. arxiv will give otherwise content type html for the original url + // We need to do it "manually", because ".followRedirects(true)" only works for GET not for HEAD + locationHeader = response.getHeaders().getFirst("location"); + if (!StringUtil.isNullOrEmpty(locationHeader)) { + urlToCheck = locationHeader; + } + // while loop, because there could be multiple redirects + } while (!StringUtil.isNullOrEmpty(locationHeader) && retries <= MAX_RETRIES); + contentType = Unirest.head(urlToCheck).asString().getHeaders().getFirst("Content-Type"); if ((contentType != null) && !contentType.isEmpty()) { - return contentType; + return Optional.of(contentType); } } catch (Exception e) { LOGGER.debug("Error getting MIME type of URL via HEAD request", e); @@ -120,8 +144,8 @@ public String getMimeType() { // Use GET request as alternative if no HEAD request is available try { contentType = Unirest.get(source.toString()).asString().getHeaders().get("Content-Type").getFirst(); - if ((contentType != null) && !contentType.isEmpty()) { - return contentType; + if (!StringUtil.isNullOrEmpty(contentType)) { + return Optional.of(contentType); } } catch (Exception e) { LOGGER.debug("Error getting MIME type of URL via GET request", e); @@ -130,16 +154,15 @@ public String getMimeType() { // Try to resolve local URIs try { URLConnection connection = new URL(source.toString()).openConnection(); - contentType = connection.getContentType(); - if ((contentType != null) && !contentType.isEmpty()) { - return contentType; + if (!StringUtil.isNullOrEmpty(contentType)) { + return Optional.of(contentType); } } catch (IOException e) { LOGGER.debug("Error trying to get MIME type of local URI", e); } - return ""; + return Optional.empty(); } /** @@ -149,24 +172,13 @@ public String getMimeType() { * @return the status code of the response */ public boolean canBeReached() throws UnirestException { - // new unirest version does not support apache http client any longer - Unirest.config().reset() - .followRedirects(true) - .enableCookieManagement(true) - .setDefaultHeader("User-Agent", USER_AGENT); int statusCode = Unirest.head(source.toString()).asString().getStatus(); return (statusCode >= 200) && (statusCode < 300); } public boolean isMimeType(String type) { - String mime = getMimeType(); - - if (mime.isEmpty()) { - return false; - } - - return mime.startsWith(type); + return getMimeType().map(mimeType -> mimeType.startsWith(type)).orElse(false); } public boolean isPdf() { @@ -333,7 +345,7 @@ private static void copy(InputStream in, Writer out, Charset encoding) throws IO /** * Open a connection to this object's URL (with specified settings). *

- * If accessing an HTTP URL, remeber to close the resulting connection after usage. + * If accessing an HTTP URL, remember to close the resulting connection after usage. * * @return an open connection */ @@ -356,12 +368,14 @@ public URLConnection openConnection() throws FetcherException { } if ((status == HttpURLConnection.HTTP_MOVED_TEMP) - || (status == HttpURLConnection.HTTP_MOVED_PERM) - || (status == HttpURLConnection.HTTP_SEE_OTHER)) { + || (status == HttpURLConnection.HTTP_MOVED_PERM) + || (status == HttpURLConnection.HTTP_SEE_OTHER)) { // get redirect url from "location" header field String newUrl = connection.getHeaderField("location"); // open the new connection again try { + httpURLConnection.disconnect(); + // multiple redirects are implemented by this recursion connection = new URLDownload(newUrl).openConnection(); } catch (MalformedURLException e) { throw new FetcherException("Could not open URL Download", e); @@ -370,9 +384,9 @@ public URLConnection openConnection() throws FetcherException { // in case of an error, propagate the error message SimpleHttpResponse httpResponse = new SimpleHttpResponse(httpURLConnection); LOGGER.info("{}", httpResponse); - if ((status >= 400) && (status < 500)) { + if (status < 500) { throw new FetcherClientException(this.source, httpResponse); - } else if (status >= 500) { + } else { throw new FetcherServerException(this.source, httpResponse); } } diff --git a/src/test/java/org/jabref/gui/linkedfile/DownloadLinkedFileActionTest.java b/src/test/java/org/jabref/gui/linkedfile/DownloadLinkedFileActionTest.java index 02a65e54479..39f5929de3c 100644 --- a/src/test/java/org/jabref/gui/linkedfile/DownloadLinkedFileActionTest.java +++ b/src/test/java/org/jabref/gui/linkedfile/DownloadLinkedFileActionTest.java @@ -1,6 +1,5 @@ package org.jabref.gui.linkedfile; -import java.io.File; import java.net.CookieHandler; import java.net.CookieManager; import java.net.CookiePolicy; @@ -24,12 +23,20 @@ import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.LinkedFile; +import com.github.tomakehurst.wiremock.WireMockServer; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; +import static com.github.tomakehurst.wiremock.client.WireMock.aResponse; +import static com.github.tomakehurst.wiremock.client.WireMock.configureFor; +import static com.github.tomakehurst.wiremock.client.WireMock.get; +import static com.github.tomakehurst.wiremock.client.WireMock.head; +import static com.github.tomakehurst.wiremock.client.WireMock.stubFor; +import static com.github.tomakehurst.wiremock.client.WireMock.urlEqualTo; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; @@ -37,9 +44,8 @@ class DownloadLinkedFileActionTest { - // Required for keepsHtmlEntry @TempDir - Path tempFolder; + private Path tempFolder; private BibEntry entry; @@ -49,6 +55,8 @@ class DownloadLinkedFileActionTest { private final FilePreferences filePreferences = mock(FilePreferences.class); private final GuiPreferences preferences = mock(GuiPreferences.class); + private WireMockServer wireMockServer; + @BeforeEach void setUp(@TempDir Path tempFolder) throws Exception { entry = new BibEntry() @@ -70,6 +78,15 @@ void setUp(@TempDir Path tempFolder) throws Exception { cookieManager = (CookieManager) CookieHandler.getDefault(); } cookieManager.setCookiePolicy(CookiePolicy.ACCEPT_ALL); + + wireMockServer = new WireMockServer(2331); + wireMockServer.start(); + configureFor("localhost", 2331); + } + + @AfterEach + void tearDown() { + wireMockServer.stop(); } @Test @@ -122,10 +139,10 @@ void doesntReplaceSourceURL(boolean keepHtml) throws Exception { linkedFile = entry.getFiles().getFirst(); - File downloadedFile = new File(linkedFile.getLink()); + Path downloadedFile = Path.of(linkedFile.getLink()); // Verify that re-downloading the file after the first download doesn't modify the entry - downloadedFile.delete(); + Files.delete(downloadedFile); DownloadLinkedFileAction downloadLinkedFileAction2 = new DownloadLinkedFileAction( databaseContext, @@ -144,10 +161,19 @@ void doesntReplaceSourceURL(boolean keepHtml) throws Exception { } @Test - void keepsHtmlEntry(@TempDir Path tempFolder) throws Exception { - String url = "https://blog.fefe.de/?ts=98e04151"; - - LinkedFile linkedFile = new LinkedFile(new URL(url), ""); + void keepsHtmlFileLink(@TempDir Path tempFolder) throws Exception { + stubFor(get(urlEqualTo("/html")) + .willReturn(aResponse() + .withStatus(200) + .withHeader("Content-Type", "text/html; charset=utf-8") + .withBody("

Hi

"))); + + stubFor(head(urlEqualTo("/html")) + .willReturn(aResponse() + .withStatus(200) + .withHeader("Content-Type", "text/html; charset=utf-8"))); + + LinkedFile linkedFile = new LinkedFile(new URL("http://localhost:2331/html"), ""); when(databaseContext.getFirstExistingFileDir(any())).thenReturn(Optional.of(tempFolder)); when(filePreferences.getFileNamePattern()).thenReturn("[citationkey]"); when(filePreferences.getFileDirectoryPattern()).thenReturn(""); @@ -171,10 +197,19 @@ void keepsHtmlEntry(@TempDir Path tempFolder) throws Exception { } @Test - void removesHtmlEntry(@TempDir Path tempFolder) throws Exception { - String url = "https://blog.fefe.de/?ts=98e04151"; - - LinkedFile linkedFile = new LinkedFile(new URL(url), ""); + void removesHtmlFileLink(@TempDir Path tempFolder) throws Exception { + stubFor(get(urlEqualTo("/html")) + .willReturn(aResponse() + .withStatus(200) + .withHeader("Content-Type", "text/html; charset=utf-8") + .withBody("

Hi

"))); + + stubFor(head(urlEqualTo("/html")) + .willReturn(aResponse() + .withStatus(200) + .withHeader("Content-Type", "text/html; charset=utf-8"))); + + LinkedFile linkedFile = new LinkedFile(new URL("http://localhost:2331/html"), ""); when(databaseContext.getFirstExistingFileDir(any())).thenReturn(Optional.of(tempFolder)); when(filePreferences.getFileNamePattern()).thenReturn("[citationkey]"); when(filePreferences.getFileDirectoryPattern()).thenReturn(""); diff --git a/src/test/java/org/jabref/logic/net/URLDownloadTest.java b/src/test/java/org/jabref/logic/net/URLDownloadTest.java index ff0c221efc1..5920950a71c 100644 --- a/src/test/java/org/jabref/logic/net/URLDownloadTest.java +++ b/src/test/java/org/jabref/logic/net/URLDownloadTest.java @@ -4,6 +4,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.nio.file.Path; import org.jabref.logic.importer.FetcherClientException; @@ -11,11 +12,19 @@ import org.jabref.support.DisabledOnCIServer; import org.jabref.testutils.category.FetcherTest; +import com.github.tomakehurst.wiremock.WireMockServer; import kong.unirest.core.UnirestException; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static com.github.tomakehurst.wiremock.client.WireMock.aResponse; +import static com.github.tomakehurst.wiremock.client.WireMock.configureFor; +import static com.github.tomakehurst.wiremock.client.WireMock.get; +import static com.github.tomakehurst.wiremock.client.WireMock.stubFor; +import static com.github.tomakehurst.wiremock.client.WireMock.urlEqualTo; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -58,7 +67,7 @@ void fileDownload() throws Exception { void determineMimeType() throws Exception { URLDownload dl = new URLDownload(new URL("http://www.google.com")); - assertTrue(dl.getMimeType().startsWith("text/html")); + assertTrue(dl.getMimeType().get().startsWith("text/html")); } @Test @@ -134,6 +143,31 @@ void test503ErrorThrowsFetcherServerException() throws Exception { @Test void test429ErrorThrowsFetcherClientException() throws Exception { URLDownload urlDownload = new URLDownload(new URL("http://httpstat.us/429")); - Exception exception = assertThrows(FetcherClientException.class, urlDownload::asString); + assertThrows(FetcherClientException.class, urlDownload::asString); + } + + @Test + void redirectWorks(@TempDir Path tempDir) throws Exception { + WireMockServer wireMockServer = new WireMockServer(2222); + wireMockServer.start(); + configureFor("localhost", 2222); + stubFor(get("/redirect") + .willReturn(aResponse() + .withStatus(302) + .withHeader("Location", "/final"))); + byte[] pdfContent = {0x00}; + stubFor(get(urlEqualTo("/final")) + .willReturn(aResponse() + .withStatus(200) + .withHeader("Content-Type", "application/pdf") + .withBody(pdfContent))); + + URLDownload urlDownload = new URLDownload(new URL("http://localhost:2222/redirect")); + Path downloadedFile = tempDir.resolve("download.pdf"); + urlDownload.toFile(downloadedFile); + byte[] actual = Files.readAllBytes(downloadedFile); + assertArrayEquals(pdfContent, actual); + + wireMockServer.stop(); } }