Skip to content

Commit

Permalink
add first pass at bridge to the dataverse; related to #269
Browse files Browse the repository at this point in the history
  • Loading branch information
Jorrit Poelen committed Dec 13, 2023
1 parent a8ea442 commit c189323
Show file tree
Hide file tree
Showing 5 changed files with 391 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
package bio.guoda.preston.store;

import bio.guoda.preston.HashType;
import bio.guoda.preston.RefNodeFactory;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.rdf.api.IRI;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.stream.Stream;

public class KeyTo1LevelDataVersePath implements KeyToPath {

/**
 * Path-and-query fragment appended to a slash-terminated host prefix;
 * the MD5 hex hash of the requested content is appended directly after it.
 */
public static final String API_QUERY_FRAGMENT = "api/search?q=fileMd5:";

/**
 * Host name that acts as a wildcard: when the base URI's host equals this
 * value, lookups are fanned out to every registered host that has not
 * been marked as failed.
 */
public static final String MAGIC_HOST = "dataverse.org";

// remote endpoint this instance was configured with (may be null)
private final URI baseURI;
// used to retrieve both search results and the list of known installations
private final Dereferencer<InputStream> deref;

// static, hence shared by all instances: populated lazily on the first lookup
private static final List<String> registeredDataVerseHosts = Collections.synchronizedList(new ArrayList<>());
// static, hence shared by all instances: hosts whose query raised an IOException;
// these are skipped by wildcard lookups and are never removed in this class
private static final List<String> failedHosts = Collections.synchronizedList(new ArrayList<>());

/**
 * Creates a key-to-path mapper bound to a single remote endpoint.
 *
 * @param baseURI endpoint to query; its host decides whether lookups are
 *                attempted at all (see {@link #toPath(IRI)})
 * @param deref   dereferencer used for all remote requests
 */
public KeyTo1LevelDataVersePath(URI baseURI, Dereferencer<InputStream> deref) {
    this.baseURI = baseURI;
    this.deref = deref;
}

/**
 * Resolves a content hash key to the location of the first matching file
 * reported by a search query.
 *
 * <p>The host list is initialised first. If the configured base URI is not
 * a supported host, no query is made. When the base URI's host equals
 * {@link #MAGIC_HOST}, every registered host not yet marked as failed is
 * queried (in parallel) and the first hit is returned; otherwise only the
 * base URI itself is queried.
 *
 * @param key content hash IRI; the hash type prefix is stripped to obtain
 *            the hex hash used in the query
 * @return location of the first hit, or {@code null} if the host is not
 * supported or nothing was found
 */
@Override
public URI toPath(IRI key) {
    lazyInitHostList();
    if (!isSupportedHost()) {
        return null;
    }

    int prefixLength = HashKeyUtil.hashTypeFor(key).getPrefix().length();
    String md5HexHash = StringUtils.substring(key.getIRIString(), prefixLength);

    if (!StringUtils.equals(baseURI.getHost(), MAGIC_HOST)) {
        // a specific installation was requested: ask only that one
        return findFirst(queryForHost(md5HexHash, baseURI));
    }

    // wildcard host: ask all known installations that have not failed before
    return registeredDataVerseHosts
            .parallelStream()
            .filter(candidate -> !failedHosts.contains(candidate))
            .map(candidate -> findFirstAndDisqualifyIfNeeded(
                    queryForHost(md5HexHash, URI.create("https://" + candidate)),
                    candidate))
            .filter(hit -> hit != null)
            .findFirst()
            .orElse(null);
}

/**
 * @return {@code true} if a base URI is configured and its host is either
 * {@link #MAGIC_HOST} or one of the registered hosts
 */
private boolean isSupportedHost() {
    if (baseURI == null) {
        return false;
    }
    if (StringUtils.equals(baseURI.getHost(), MAGIC_HOST)) {
        return true;
    }
    return registeredDataVerseHosts.contains(baseURI.getHost());
}

/**
 * Builds the search query IRI for a given endpoint.
 *
 * @param md5HexHash hex hash to search for
 * @param host       endpoint prefix; a trailing slash is added if missing
 * @return IRI of the form {@code <prefix>/api/search?q=fileMd5:<hash>}
 */
private IRI queryForHost(String md5HexHash, URI host) {
    String prefix = host.toString();
    if (!StringUtils.endsWith(prefix, "/")) {
        prefix = prefix + "/";
    }
    return RefNodeFactory.toIRI(prefix + API_QUERY_FRAGMENT + md5HexHash);
}

/**
 * Runs a search query and extracts the first hit, treating any I/O
 * problem as "nothing found".
 *
 * @param query search query to dereference
 * @return location of the first hit, or {@code null} if there is none or
 * the query failed
 */
private URI findFirst(IRI query) {
    URI firstHit = null;
    try (InputStream searchResult = deref.get(query)) {
        firstHit = findFirstHit(searchResult);
    } catch (IOException ignored) {
        // opportunistic
    }
    return firstHit;
}

/**
 * Runs a search query against one host and extracts the first hit; if the
 * query fails with an I/O error the host is remembered as failed so that
 * later wildcard lookups skip it.
 *
 * @param query search query to dereference
 * @param host  host name the query is directed at, recorded on failure
 * @return location of the first hit, or {@code null} if there is none or
 * the query failed
 */
private URI findFirstAndDisqualifyIfNeeded(IRI query, String host) {
    URI path = null;
    try (InputStream inputStream = deref.get(query)) {
        path = findFirstHit(inputStream);
    } catch (IOException e) {
        // opportunistic: a failing host must not abort the overall lookup.
        // Report on stderr only (one line per host) so diagnostics never
        // mix with regular program output on stdout.
        System.err.println("failed [" + host + "]: " + e);
        // check-and-add as one step so concurrent lookups record a host once
        synchronized (failedHosts) {
            if (!failedHosts.contains(host)) {
                failedHosts.add(host);
            }
        }
    }
    return path;
}

/**
 * Populates the shared host list on first use: hosts are read from the
 * remote installations document and, if that yields nothing (or fails),
 * the built-in list from {@link #hostsHardcoded()} is used instead.
 *
 * <p>The emptiness check and the population happen under the list's own
 * monitor, so concurrent first callers neither add the hosts twice nor
 * modify the list while another thread is already streaming over it.
 */
private void lazyInitHostList() {
    synchronized (registeredDataVerseHosts) {
        if (!registeredDataVerseHosts.isEmpty()) {
            return;
        }
        try (InputStream inputStream = deref.get(RefNodeFactory.toIRI("https://iqss.github.io/dataverse-installations/data/data.json"))) {
            registeredDataVerseHosts.addAll(findHostNames(inputStream));
        } catch (IOException e) {
            // opportunistic: fall back to the built-in list below
        } finally {
            if (registeredDataVerseHosts.isEmpty()) {
                registeredDataVerseHosts.addAll(hostsHardcoded());
            }
        }
    }
}


/**
 * Parses a JSON search result and returns the {@code url} of the first
 * entry in {@code data.items} that has one.
 *
 * @param inputStream JSON search result; must not be {@code null}
 * @return location of the first item carrying a {@code url}, or
 * {@code null} if the result has no such item
 * @throws IOException if the stream is {@code null}, cannot be parsed, or
 *                     the first {@code url} is not a valid URI
 */
static URI findFirstHit(InputStream inputStream) throws IOException {
    if (inputStream == null) {
        throw new IOException("no input found");
    }
    JsonNode jsonNode = new ObjectMapper().readTree(inputStream);

    if (jsonNode != null && jsonNode.has("data")) {
        JsonNode data = jsonNode.get("data");
        if (data.has("items")) {
            for (JsonNode item : data.get("items")) {
                if (item.has("url")) {
                    String url = item.get("url").asText();
                    try {
                        return URI.create(url);
                    } catch (IllegalArgumentException e) {
                        // remote data is untrusted: report a malformed url as an
                        // I/O problem, which callers already treat as best-effort,
                        // instead of letting an unchecked exception escape
                        throw new IOException("malformed url [" + url + "] in search result", e);
                    }
                }
            }
        }
    }
    return null;
}

/**
 * Parses a JSON document listing installations and collects their host
 * names.
 *
 * <p>Each entry of the {@code installations} array contributes the text of
 * its {@code host} field or, if that is absent, its {@code hostname}
 * field. Blank values are skipped because they cannot be turned into a
 * usable endpoint.
 *
 * @param inputStream JSON installations document; must not be {@code null}
 * @return host names in document order; empty if none are listed
 * @throws IOException if the stream is {@code null} or cannot be parsed
 */
static List<String> findHostNames(InputStream inputStream) throws IOException {
    if (inputStream == null) {
        throw new IOException("no input found");
    }
    JsonNode jsonNode = new ObjectMapper().readTree(inputStream);

    List<String> hosts = new ArrayList<>();
    if (jsonNode != null && jsonNode.has("installations")) {
        for (JsonNode installation : jsonNode.get("installations")) {
            // NOTE(review): the published installations list appears to use
            // "hostname" rather than "host" - confirm against the live
            // document; both are accepted, with "host" taking precedence.
            JsonNode hostNode = installation.has("host")
                    ? installation.get("host")
                    : installation.get("hostname");
            if (hostNode != null) {
                String host = hostNode.asText();
                if (StringUtils.isNotBlank(host)) {
                    hosts.add(host);
                }
            }
        }
    }
    return hosts;
}

/**
 * Built-in list of hosts, used when no hosts could be obtained remotely.
 *
 * <p>NOTE(review): the entry {@code dataverse.acg.maine.edu/dvn} carries a
 * path segment, so it can never equal the result of {@code URI.getHost()}
 * - confirm whether that is intended.
 *
 * @return fixed-size list of hosts, one per line of the embedded text
 */
public List<String> hostsHardcoded() {
    final String newlineSeparatedHosts = "abacus.library.ubc.ca\n" +
            "dataverse.theacss.org\n" +
            "dataverse.ada.edu.au\n" +
            "dadosdepesquisa.fiocruz.br\n" +
            "dataverse.asu.edu\n" +
            "data.aussda.at\n" +
            "bonndata.uni-bonn.de\n" +
            "borealisdata.ca\n" +
            "dataverse.bhp.org.bw\n" +
            "data.brin.go.id\n" +
            "dataverse.cbpf.br\n" +
            "opendata.cesa.edu.co\n" +
            "dataverse.cidacs.org\n" +
            "data.cifor.org\n" +
            "data.cimmyt.org\n" +
            "dataverse.cirad.fr\n" +
            "science-data.hu\n" +
            "dataverse.csuc.cat\n" +
            "datasets.coronawhy.org\n" +
            "data.crossda.hr\n" +
            "researchdata.cuhk.edu.hk\n" +
            "dados.ipb.pt\n" +
            "archaeology.datastations.nl\n" +
            "ssh.datastations.nl\n" +
            "dare.uol.de\n" +
            "dataverse.dartmouth.edu\n" +
            "darus.uni-stuttgart.de\n" +
            "dataverse.ird.fr\n" +
            "data.sciencespo.fr\n" +
            "datarepositorium.sdum.uminho.pt\n" +
            "dataspace.ust.hk\n" +
            "edatos.consorciomadrono.es\n" +
            "dataverse.nl\n" +
            "dataverse.no\n" +
            "dataverse.rhi.hi.is\n" +
            "dorel.univ-lorraine.fr\n" +
            "researchdata.ntu.edu.sg\n" +
            "dunas.ua.pt\n" +
            "edmond.mpdl.mpg.de\n" +
            "dataverse.fgv.br\n" +
            "dataverse.fiu.edu\n" +
            "dvn.fudan.edu.cn\n" +
            "dataverse.orc.gmu.edu\n" +
            "data.univ-gustave-eiffel.fr\n" +
            "data.goettingen-research-online.de\n" +
            "dataverse.harvard.edu\n" +
            "heidata.uni-heidelberg.de\n" +
            "repositoriopesquisas.ibict.br\n" +
            "dataverse.icrisat.org\n" +
            "dataverse.mpi-sws.org\n" +
            "dataverse.ifdc.org\n" +
            "datasets.iisg.amsterdam\n" +
            "indata.cedia.edu.ec\n" +
            "dataverse.pushdom.ru\n" +
            "data.cipotato.org\n" +
            "dataverse.ipgp.fr\n" +
            "dataverse.iit.it\n" +
            "archive.data.jhu.edu\n" +
            "dataverse.jpl.nasa.gov\n" +
            "data.fz-juelich.de\n" +
            "keen.zih.tu-dresden.de\n" +
            "rdr.kuleuven.be\n" +
            "dataverse.lib.virginia.edu\n" +
            "lida.dataverse.lt\n" +
            "dataverse.acg.maine.edu/dvn\n" +
            "data.mel.cgiar.org\n" +
            "researchdata.nie.edu.sg\n" +
            "dataverse.nioz.nl\n" +
            "dataverse.lib.nycu.edu.tw\n" +
            "portal.odissei.nl\n" +
            "dataverse.uclouvain.be\n" +
            "dataverse.openforestdata.pl\n" +
            "osnadata.ub.uni-osnabrueck.de\n" +
            "papyrus-datos.co\n" +
            "opendata.pku.edu.cn\n" +
            "datos.pucp.edu.pe\n" +
            "data.qdr.syr.edu\n" +
            "entrepot.recherche.data.gouv.fr\n" +
            "redape.dados.embrapa.br\n" +
            "dataverse.unr.edu.ar\n" +
            "datos.uchile.cl\n" +
            "datos.unlp.edu.ar\n" +
            "research-data.urosario.edu.co\n" +
            "datav.udec.cl\n" +
            "repositoriodedados.unifesp.br\n" +
            "dataverse.ufabc.edu.br\n" +
            "dataverse.ileel.ufu.br\n" +
            "repositorio.polen.fccn.pt\n" +
            "dadosabertos.rnp.br\n" +
            "solo.mapbiomas.org\n" +
            "rodbuk.pl\n" +
            "agh.rodbuk.pl\n" +
            "pk.rodbuk.pl\n" +
            "uek.rodbuk.pl\n" +
            "uj.rodbuk.pl\n" +
            "dataverse.rsu.lv\n" +
            "data.scielo.org\n" +
            "sodha.be\n" +
            "datahub.tec.mx\n" +
            "dataverse.tdl.org\n" +
            "planetary-data-portal.org\n" +
            "dataverse.ucla.edu\n" +
            "dataverse.lib.unb.ca\n" +
            "dataverse.unc.edu\n" +
            "dataverse.lib.umanitoba.ca\n" +
            "dataverse.unimi.it\n" +
            "dataverse.vtti.vt.edu\n" +
            "data.worldagroforestry.org";
    return Arrays.asList(StringUtils.split(newlineSeparatedHosts, "\n"));
}

/**
 * @param key content hash IRI
 * @return {@code true} only if the key's hash type is MD5, the one hash
 * the search query can look up
 */
@Override
public boolean supports(IRI key) {
    HashType detected = HashKeyUtil.hashTypeFor(key);
    return HashType.md5.equals(detected);
}


}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package bio.guoda.preston.store;

import org.junit.Test;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.is;
import static org.junit.Assert.assertNotNull;

/**
 * Verifies parsing of search results by {@link KeyTo1LevelDataVersePath}.
 */
public class KeyTo1LevelDataVersePathTest {

    /**
     * The first item's {@code url} in the recorded search result must be
     * returned as the hit.
     */
    @Test
    public void parseFirstHit() throws IOException {
        // close the fixture stream when done, and fail clearly if it is missing
        try (InputStream searchResult = getClass().getResourceAsStream("dataverse-search-result.json")) {
            assertNotNull("test resource [dataverse-search-result.json] not found", searchResult);
            URI firstHit = KeyTo1LevelDataVersePath.findFirstHit(searchResult);
            assertNotNull(firstHit);
            assertThat(firstHit, is(URI.create("https://dataverse.harvard.edu/api/access/datafile/2829688")));
        }
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"status": "OK",
"data": {
"q": "fileMd5:48a76222cf5c06cb4f2d8f75cc0caa63",
"total_count": 1,
"start": 0,
"spelling_alternatives": {},
"items": [
{
"name": "Auter Fine PB Replication Code.txt",
"type": "file",
"url": "https://dataverse.harvard.edu/api/access/datafile/2829688",
"file_id": "2829688",
"published_at": "2016-05-18T17:57:24Z",
"file_type": "Plain Text",
"file_content_type": "text/plain",
"size_in_bytes": 2065,
"md5": "48a76222cf5c06cb4f2d8f75cc0caa63",
"checksum": {
"type": "MD5",
"value": "48a76222cf5c06cb4f2d8f75cc0caa63"
},
"file_persistent_id": "doi:10.7910/DVN/TGKZ2T/Y1ZZXT",
"dataset_name": "Replication Data for: \"Negative Campaigning in the Social Media Age: Attack Advertising on Facebook\"",
"dataset_id": "2829686",
"dataset_persistent_id": "doi:10.7910/DVN/TGKZ2T",
"dataset_citation": "Auter, Zachary, 2016, \"Replication Data for: \"Negative Campaigning in the Social Media Age: Attack Advertising on Facebook\"\", https://doi.org/10.7910/DVN/TGKZ2T, Harvard Dataverse, V1, UNF:6:LSx44nECMNQun46yUutUuA== [fileUNF]"
}
],
"count_in_response": 1
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import bio.guoda.preston.store.Dereferencer;
import bio.guoda.preston.store.DereferencerContentAddressedTarGZ;
import bio.guoda.preston.store.KeyTo1LevelDataOnePath;
import bio.guoda.preston.store.KeyTo1LevelDataVersePath;
import bio.guoda.preston.store.KeyTo1LevelOCIPath;
import bio.guoda.preston.store.KeyTo1LevelPath;
import bio.guoda.preston.store.KeyTo1LevelSoftwareHeritageAutoDetectPath;
Expand Down Expand Up @@ -119,7 +120,8 @@ private KeyValueStore withRemoteSupport(ValidatingKeyValueStreamFactory kvStream
Pair.of(uri, new KeyTo1LevelZenodoBucket(new KeyTo1LevelZenodoPath(uri, getDerefStream(uri, getProgressListener()), KeyTo1LevelZenodoPath.ZENODO_API_PREFIX_2023_10_13, KeyTo1LevelZenodoPath.ZENODO_API_SUFFIX_2023_10_13))),
Pair.of(uri, new KeyTo1LevelDataOnePath(uri, getDerefStream(uri, getProgressListener()))),
Pair.of(uri, new KeyTo1LevelOCIPath(uri)),
Pair.of(uri, new KeyTo1LevelWikiMediaCommonsPath(uri, getDerefStream(uri, getProgressListener())))
Pair.of(uri, new KeyTo1LevelWikiMediaCommonsPath(uri, getDerefStream(uri, getProgressListener()))),
Pair.of(uri, new KeyTo1LevelDataVersePath(uri, getDerefStream(uri, getProgressListener())))
));

List<KeyValueStoreReadOnly> keyValueStoreRemotes =
Expand Down
Loading

0 comments on commit c189323

Please sign in to comment.