Skip to content

Commit

Permalink
prevent closing of main archive stream when handling individual entri…
Browse files Browse the repository at this point in the history
…es (e.g., sources cited by the primary dataset); related to jhpoelen/cite-the-bunnies#3
  • Loading branch information
Jorrit Poelen committed Oct 15, 2024
1 parent 64b7d9c commit 0f8f037
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.rdf.api.IRI;
import org.slf4j.Logger;
Expand Down Expand Up @@ -36,7 +37,7 @@ public boolean handle(IRI version, InputStream is) throws ContentStreamException
return false;
}

private Pair<ArchiveInputStream, String> getArchiveStreamAndFormat(InputStream in) {
public static Pair<ArchiveInputStream, String> getArchiveStreamAndFormat(InputStream in) {
try {
String archiveFormat = ArchiveStreamFactory.detect(in);
// do not close this stream; it would also close the "in" stream
Expand All @@ -59,14 +60,14 @@ private void handleArchiveEntries(IRI version, ArchiveInputStream in, String arc
throw new ContentStreamException("failed to create content URI related to entry [" + entry.getName() + "] in [" + version.getIRIString() + "]", e);
}
if (shouldReadArchiveEntry(entryIri)) {
contentStreamHandler.handle(entryIri, in);
contentStreamHandler.handle(entryIri, CloseShieldInputStream.wrap(in));
}
}
}
} catch (Throwable th) {
String dataCoordinates = entry == null || entry.isDirectory()
? version.getIRIString()
: "<zip:" + version.getIRIString() + "!/" + entry.getName() + ">";
: "<" + archiveFormat + ":" + version.getIRIString() + "!/" + entry.getName() + ">";
String msg = "failed to process " + dataCoordinates;
LOG.warn(msg, th);
throw new ContentStreamException(msg, th);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ private Pair<CompressorInputStream, String> getCompressedStreamAndFormat(InputSt

protected void parseAsCompressed(IRI version, InputStream in, String compressionFormat) throws ContentStreamException {
try {


contentStreamHandler.handle(wrapIRI(compressionFormat, version), in);
} catch (URISyntaxException e) {
throw new ContentStreamException("failed to create content URI", e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,5 +86,13 @@ public void apacheVFSUrl3() {

}

/**
 * Checks that a {@code tar:gz:} prefixed IRI pointing at an archive entry
 * comes back from {@code truncateGZNotationForVFSIfNeeded} unchanged.
 */
@Test
public void tarGzVSF() {
    final String tarGzEntryIri = "tar:gz:hash://sha256/bedcc1f122d59ec002e0e6d2802c0e422eadf6208669fff141a895bd3ed15d4a!/FaEu-DWCA/eml.xml";

    final String truncated = ContentStreamUtil.truncateGZNotationForVFSIfNeeded(tarGzEntryIri);

    // input and expected output are the same string
    assertThat(truncated, Is.is(tarGzEntryIri));
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,13 @@ public boolean handle(IRI version, InputStream is) throws ContentStreamException
String prefix = StringUtils.substring(iriString, 0, iriString.length() - META_XML.length());

String metaDataIRI = prefix + metadataLocation;
InputStream emlStream = dereferencer.get(RefNodeFactory.toIRI(metaDataIRI));

if (emlStream != null) {
SAXParser p = SAX_FACTORY.newSAXParser();
p.parse(emlStream, new CitationSaxHandler(metaDataIRI, os));
try (InputStream emlStream = dereferencer.get(RefNodeFactory.toIRI(metaDataIRI))) {
if (emlStream != null) {
SAXParser p = SAX_FACTORY.newSAXParser();
p.parse(emlStream, new CitationSaxHandler(metaDataIRI, os));
}
}

}

return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
import bio.guoda.preston.cmd.CitationGenerator;
import bio.guoda.preston.cmd.Cmd;
import bio.guoda.preston.store.BlobStoreReadOnly;
import bio.guoda.preston.stream.ArchiveStreamHandler;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.rdf.api.IRI;
import org.apache.commons.rdf.api.Quad;
import org.hamcrest.core.Is;
Expand Down Expand Up @@ -107,4 +111,55 @@ public InputStream get(IRI key) {

}

/**
 * Feeds a single "has version" statement for a tar.gz archive to the
 * {@link CitationGenerator} and expects exactly one citation line that
 * refers to the eml.xml entry inside that archive.
 */
@Test
public void onEmlEmbeddedInTarGZ() {
    final String archiveHash = "hash://sha256/bedcc1f122d59ec002e0e6d2802c0e422eadf6208669fff141a895bd3ed15d4a";

    // stub blob store: serves the bundled test archive for the expected hash only
    BlobStoreReadOnly blobStore = new BlobStoreReadOnly() {
        @Override
        public InputStream get(IRI key) {
            URL archiveResource = getClass().getResource("/bio/guoda/preston/dwca.tar.gz");
            IRI archiveLocation = toIRI(archiveResource.toExternalForm());

            if (!StringUtils.equals(archiveHash, key.getIRIString())) {
                return null;
            }

            try {
                File archiveFile = new File(URI.create(archiveLocation.getIRIString()));
                return new FileInputStream(archiveFile);
            } catch (FileNotFoundException e) {
                throw new RuntimeException(e);
            }
        }
    };

    Quad statement = toStatement(toIRI("blip"), HAS_VERSION, toIRI(archiveHash));

    // capture whatever the generator writes
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    Cmd cmd = new Cmd();
    cmd.setOutputStream(outputStream);

    new CitationGenerator(cmd, blobStore).on(statement);

    String actual = new String(outputStream.toByteArray(), StandardCharsets.UTF_8);
    String[] lines = actual.split("\n");

    assertThat(lines.length, Is.is(1));
    assertThat(actual, Is.is("de Jong, Y.S.D.M. (ed.) (2010) Fauna Europaea version 2.4. Web Service available online at http://www.faunaeur.org. Accessed at <tar:gz:hash://sha256/bedcc1f122d59ec002e0e6d2802c0e422eadf6208669fff141a895bd3ed15d4a!/FaEu-DWCA/eml.xml> .\n"));
}

/**
 * Checks that the archive format reported by
 * {@code ArchiveStreamHandler.getArchiveStreamAndFormat} for the bundled
 * zip test resource is {@code "zip"}.
 */
@Test
public void checkSignatureZip() throws ArchiveException {
    InputStream zipResource = getClass().getResourceAsStream("/bio/guoda/preston/Ramírez-Chaves-et-al-2022.zip");

    // only the format half of the returned pair is inspected here
    String detectedFormat = ArchiveStreamHandler.getArchiveStreamAndFormat(zipResource).getValue();

    assertThat(detectedFormat, Is.is("zip"));
}

}
Binary file not shown.

0 comments on commit 0f8f037

Please sign in to comment.