Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

store size of the saved original in the database #5390

Merged
merged 3 commits into from
Dec 17, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion scripts/database/upgrades/upgrade_v4.9.4_to_v4.10.sql
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,6 @@ INSERT INTO setting(
ALTER TABLE actionlogrecord ALTER COLUMN info TYPE text;


ALTER TABLE dataverse ALTER COLUMN defaultcontributorrole_id DROP NOT NULL;
ALTER TABLE dataverse ALTER COLUMN defaultcontributorrole_id DROP NOT NULL;

ALTER TABLE datatable ADD COLUMN originalfilesize BIGINT;
10 changes: 10 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DataFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,16 @@ public String getOriginalFileFormat() {
}
return null;
}

/**
 * Returns the size of the saved original of this file, as recorded in its
 * DataTable.
 *
 * @return the original file size stored in the DataTable, or null when this
 *         file is not tabular data or has no DataTable attached.
 */
public Long getOriginalFileSize() {
    // Only tabular files carry a DataTable with original-file information.
    if (!isTabularData()) {
        return null;
    }

    DataTable table = getDataTable();
    if (table == null) {
        return null;
    }
    return table.getOriginalFileSize();
}

@Override
public boolean isAncestorOf( DvObject other ) {
Expand Down
20 changes: 16 additions & 4 deletions src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ public DataFile findCheapAndEasy(Long id) {
if (MIME_TYPE_TSV.equalsIgnoreCase(contentType)) {
Object[] dtResult;
try {
dtResult = (Object[]) em.createNativeQuery("SELECT ID, UNF, CASEQUANTITY, VARQUANTITY, ORIGINALFILEFORMAT FROM dataTable WHERE DATAFILE_ID = " + id).getSingleResult();
dtResult = (Object[]) em.createNativeQuery("SELECT ID, UNF, CASEQUANTITY, VARQUANTITY, ORIGINALFILEFORMAT, ORIGINALFILESIZE FROM dataTable WHERE DATAFILE_ID = " + id).getSingleResult();
} catch (Exception ex) {
dtResult = null;
}
Expand All @@ -509,6 +509,8 @@ public DataFile findCheapAndEasy(Long id) {

dataTable.setOriginalFileFormat((String)dtResult[4]);

dataTable.setOriginalFileSize((Long)dtResult[5]);

dataTable.setDataFile(dataFile);
dataFile.setDataTable(dataTable);

Expand Down Expand Up @@ -567,7 +569,7 @@ public void findFileMetadataOptimizedExperimental(Dataset owner, DatasetVersion

int i = 0;

List<Object[]> dataTableResults = em.createNativeQuery("SELECT t0.ID, t0.DATAFILE_ID, t0.UNF, t0.CASEQUANTITY, t0.VARQUANTITY, t0.ORIGINALFILEFORMAT FROM dataTable t0, dataFile t1, dvObject t2 WHERE ((t0.DATAFILE_ID = t1.ID) AND (t1.ID = t2.ID) AND (t2.OWNER_ID = " + owner.getId() + ")) ORDER BY t0.ID").getResultList();
List<Object[]> dataTableResults = em.createNativeQuery("SELECT t0.ID, t0.DATAFILE_ID, t0.UNF, t0.CASEQUANTITY, t0.VARQUANTITY, t0.ORIGINALFILEFORMAT, t0.ORIGINALFILESIZE FROM dataTable t0, dataFile t1, dvObject t2 WHERE ((t0.DATAFILE_ID = t1.ID) AND (t1.ID = t2.ID) AND (t2.OWNER_ID = " + owner.getId() + ")) ORDER BY t0.ID").getResultList();

for (Object[] result : dataTableResults) {
DataTable dataTable = new DataTable();
Expand All @@ -583,6 +585,8 @@ public void findFileMetadataOptimizedExperimental(Dataset owner, DatasetVersion

dataTable.setOriginalFileFormat((String)result[5]);

dataTable.setOriginalFileSize((Long)result[6]);

dataTables.add(dataTable);
datatableMap.put(fileId, i++);

Expand Down Expand Up @@ -1444,7 +1448,7 @@ public boolean isReplacementFile(DataFile df) {
} // end: isReplacementFile

public List<Long> selectFilesWithMissingOriginalTypes() {
Query query = em.createNativeQuery("SELECT f.id FROM datafile f, datatable t where t.datafile_id = f.id AND t.originalfileformat='" + MIME_TYPE_TSV + "' ORDER BY f.id");
Query query = em.createNativeQuery("SELECT f.id FROM datafile f, datatable t where t.datafile_id = f.id AND (t.originalfileformat='" + MIME_TYPE_TSV + "' OR t.originalfileformat IS NULL) ORDER BY f.id");

try {
return query.getResultList();
Expand All @@ -1453,7 +1457,15 @@ public List<Long> selectFilesWithMissingOriginalTypes() {
}
}


/**
 * Looks up the ids of the datafiles whose datatable row has no original
 * file size recorded (originalfilesize IS NULL), ordered by datafile id.
 *
 * @return the list of matching datafile ids; an empty list if executing the
 *         query throws any exception.
 */
public List<Long> selectFilesWithMissingOriginalSizes() {
    final String sql = "SELECT f.id FROM datafile f, datatable t where t.datafile_id = f.id AND t.originalfilesize IS NULL ORDER BY f.id";
    Query missingSizesQuery = em.createNativeQuery(sql);

    List<Long> fileIds;
    try {
        fileIds = missingSizesQuery.getResultList();
    } catch (Exception ex) {
        // Best effort: a failed lookup is reported as "nothing found".
        fileIds = new ArrayList<>();
    }
    return fileIds;
}

public String generateDataFileIdentifier(DataFile datafile, GlobalIdServiceBean idServiceBean) {
String doiIdentifierType = settingsService.getValueForKey(SettingsServiceBean.Key.IdentifierGenerationStyle, "randomString");
Expand Down
14 changes: 14 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DataTable.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ public DataTable() {
*/
private String originalFormatVersion;

/*
* Size of the original file:
*/

private Long originalFileSize;

/*
* Getter and Setter methods:
*/
Expand Down Expand Up @@ -172,6 +178,14 @@ public void setOriginalFileFormat(String originalFileType) {
this.originalFileFormat = originalFileType;
}

/**
 * @return the stored size of the original file; null if it has not been set.
 */
public Long getOriginalFileSize() {
    return this.originalFileSize;
}

/**
 * Records the size of the original file.
 *
 * @param originalFileSize the size value to store (may be null).
 */
public void setOriginalFileSize(Long originalFileSize) {
    this.originalFileSize = originalFileSize;
}


public String getOriginalFormatVersion() {
return originalFormatVersion;
Expand Down
34 changes: 18 additions & 16 deletions src/main/java/edu/harvard/iq/dataverse/api/Access.java
Original file line number Diff line number Diff line change
Expand Up @@ -578,23 +578,25 @@ public void write(OutputStream os) throws IOException,
//without doing a large deal of rewriting or architecture redo.
//The previous size checks for non-original download is still quick.
//-MAD 4.9.2
DataAccessRequest daReq = new DataAccessRequest();
StorageIO<DataFile> accessObject = DataAccess.getStorageIO(file, daReq);

if (accessObject != null) {
Boolean gotOriginal = false;
StoredOriginalFile sof = new StoredOriginalFile();
StorageIO<DataFile> tempAccessObject = sof.retreive(accessObject);
if(null != tempAccessObject) { //If there is an original, use it
gotOriginal = true;
accessObject = tempAccessObject;
}
if(!gotOriginal) { //if we didn't get this from sof.retreive we have to open it
accessObject.open();
}
size = accessObject.getSize();
// OK, here's the better solution: we now store the size of the original file in
// the database (in DataTable), so we get it for free.
// However, there may still be legacy datatables for which the size is not saved,
// so the "inefficient" code is kept, below, as a fallback solution.
// -- L.A., 4.10

if (file.getDataTable().getOriginalFileSize() != null) {
size = file.getDataTable().getOriginalFileSize();
} else {
DataAccessRequest daReq = new DataAccessRequest();
StorageIO<DataFile> storageIO = DataAccess.getStorageIO(file, daReq);
storageIO.open();
size = storageIO.getAuxObjectSize(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION);

// save it permanently:
file.getDataTable().setOriginalFileSize(size);
fileService.saveDataTable(file.getDataTable());
}
if(size == 0L){
if (size == 0L){
throw new IOException("Invalid file size or accessObject when checking limits of zip file");
}
} else {
Expand Down
20 changes: 20 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/api/Admin.java
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,26 @@ public Response fixMissingOriginalTypes() {

return ok(info);
}

@Path("datafiles/integrity/fixmissingoriginalsizes")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@landreev can you please document this new API endpoint in the guides?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have documented it in the Native API guide, with the other file-related API calls. Also added the release notes specific to the issue (doc/release-notes/5323-saved-original-size).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(thanks :)

@GET
public Response fixMissingOriginalSizes() {
    // Finds the tabular files that have no original file size recorded and,
    // if there are any, hands them to the ingest service for repair.
    // The response only reports what was found; it does not wait for the
    // repair to complete.
    JsonObjectBuilder info = Json.createObjectBuilder();

    List<Long> affectedFileIds = fileService.selectFilesWithMissingOriginalSizes();

    if (affectedFileIds.isEmpty()) {
        // Nothing to repair: report it and do not start a background job.
        info.add("message",
                "All the tabular files in the database already have the original sizes set correctly; exiting.");
    } else {
        info.add("message", "Found " + affectedFileIds.size()
                + " tabular files with missing original sizes. Kicking off an async job that will repair the files in the background.");

        // Only start the repair job when there is actual work to do.
        ingestService.fixMissingOriginalSizes(affectedFileIds);
    }

    return ok(info);
}

/**
* This method is used in API tests, called from UtilIt.java.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ public static StorageIO<DataFile> retreive(StorageIO<DataFile> storageIO) {
try {
storageIO.open();
Channel storedOriginalChannel = storageIO.openAuxChannel(SAVED_ORIGINAL_FILENAME_EXTENSION);
storedOriginalSize = storageIO.getAuxObjectSize(SAVED_ORIGINAL_FILENAME_EXTENSION);
storedOriginalSize = dataFile.getDataTable().getOriginalFileSize() != null ?
dataFile.getDataTable().getOriginalFileSize() :
storageIO.getAuxObjectSize(SAVED_ORIGINAL_FILENAME_EXTENSION);
inputStreamIO = new InputStreamIO(Channels.newInputStream((ReadableByteChannel) storedOriginalChannel), storedOriginalSize);
logger.fine("Opened stored original file as Aux "+SAVED_ORIGINAL_FILENAME_EXTENSION);
} catch (IOException ioEx) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -871,6 +871,7 @@ public boolean ingestAsTabular(Long datafile_id) {
} else {
tabDataIngest.getDataTable().setOriginalFileFormat(originalContentType);
}
tabDataIngest.getDataTable().setOriginalFileSize(originalFileSize);

dataFile.setDataTable(tabDataIngest.getDataTable());
tabDataIngest.getDataTable().setDataFile(dataFile);
Expand Down Expand Up @@ -1675,6 +1676,22 @@ public void fixMissingOriginalTypes(List<Long> datafileIds) {
logger.info("Finished repairing tabular data files that were missing the original file format labels.");
}

// This method takes a list of file ids and tries to fix the size of the saved
// original, if present
// Note the @Asynchronous attribute - this allows us to just kick off and run this
// (potentially large) job in the background.
// The method is called by the "fixmissingoriginalsizes" /admin api call.
@Asynchronous
public void fixMissingOriginalSizes(List<Long> datafileIds) {
    for (Long fileId : datafileIds) {
        // Repair the recorded *size* of the saved original. The type-repair
        // method must not be used here: this job is for files that are
        // missing the size, not the format type.
        fixMissingOriginalSize(fileId);
        try {
            // Pause between files.
            Thread.sleep(1000);
        } catch (InterruptedException ex) {
            // Preserve the interrupt status and stop; continuing would make
            // every following sleep() fail immediately.
            Thread.currentThread().interrupt();
            logger.warning("Interrupted while repairing tabular data files that were missing the original file sizes; stopping early.");
            return;
        }
    }
    logger.info("Finished repairing tabular data files that were missing the original file sizes.");
}

// This method fixes a datatable object that's missing the format type of
// the ingested original. It will check the saved original file to
// determine the type.
Expand Down Expand Up @@ -1745,6 +1762,8 @@ private void fixMissingOriginalType(long fileId) {
logger.warning("Caught exception trying to determine original file type (datafile id=" + fileId + ", datatable id=" + datatableId + "): " + ioex.getMessage());
}

Long savedOriginalFileSize = savedOriginalFile.length();

// If we had to create a temp file, delete it now:
if (tempFileRequired) {
savedOriginalFile.delete();
Expand All @@ -1769,6 +1788,7 @@ private void fixMissingOriginalType(long fileId) {

// save permanently in the database:
dataFile.getDataTable().setOriginalFileFormat(fileTypeDetermined);
dataFile.getDataTable().setOriginalFileSize(savedOriginalFileSize);
fileService.saveDataTable(dataFile.getDataTable());

} else {
Expand All @@ -1779,6 +1799,46 @@ private void fixMissingOriginalType(long fileId) {
}
}

// This method fixes a datatable object that's missing the size of the
// ingested original.
private void fixMissingOriginalSize(long fileId) {
    // Looks up the datafile; if it is tabular and its datatable has no
    // original file size recorded, reads the size of the saved original
    // (the auxiliary object with the saved-original extension) from storage
    // and saves it on the datatable. Failures are logged and the method
    // returns without modifying anything.
    DataFile tabularFile = fileService.find(fileId);

    if (tabularFile == null || !tabularFile.isTabularData()) {
        logger.warning("DataFile id=" + fileId + ": No such DataFile!");
        return;
    }

    Long originalSize = tabularFile.getDataTable().getOriginalFileSize();
    Long datatableId = tabularFile.getDataTable().getId();

    if (originalSize != null) {
        // Already recorded; nothing to repair.
        logger.info("DataFile id=" + fileId + "; original file size already present: " + originalSize);
        return;
    }

    try {
        StorageIO<DataFile> storageIO = tabularFile.getStorageIO();
        storageIO.open();
        originalSize = storageIO.getAuxObjectSize(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION);
    } catch (Exception ex) {
        logger.warning("Exception "+ex.getClass()+" caught trying to look up the size of the saved original; (datafile id=" + fileId + ", datatable id=" + datatableId + "): " + ex.getMessage());
        return;
    }

    if (originalSize == null) {
        logger.warning("Failed to look up the size of the saved original file! (datafile id=" + fileId + ", datatable id=" + datatableId + ")");
        return;
    }

    // save permanently in the database:
    tabularFile.getDataTable().setOriginalFileSize(originalSize);
    fileService.saveDataTable(tabularFile.getDataTable());
}

public static void main(String[] args) {

String file = args[0];
Expand Down
4 changes: 4 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,10 @@ public static String getFacetFileType(DataFile dataFile) {
}

public static String getUserFriendlyOriginalType(DataFile dataFile) {
if (!dataFile.isTabularData()) {
return null;
}

String fileType = dataFile.getOriginalFileFormat();

if (fileType != null && !fileType.equals("")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,7 @@ public static JsonObjectBuilder json(DataFile df, FileMetadata fileMetadata) {
.add("storageIdentifier", df.getStorageIdentifier())
.add("originalFileFormat", df.getOriginalFileFormat())
.add("originalFormatLabel", df.getOriginalFormatLabel())
.add ("originalFileSize", df.getOriginalFileSize())
.add("UNF", df.getUnf())
//---------------------------------------------
// For file replace: rootDataFileId, previousDataFileId
Expand All @@ -586,7 +587,7 @@ public static JsonObjectBuilder json(DataFile df, FileMetadata fileMetadata) {
//---------------------------------------------
// Checksum
// * @todo Should we deprecate "md5" now that it's under
// * "checksum" (which may also be a SHA-1 rather than an MD5)?
// * "checksum" (which may also be a SHA-1 rather than an MD5)? - YES!
//---------------------------------------------
.add("md5", getMd5IfItExists(df.getChecksumType(), df.getChecksumValue()))
.add("checksum", getChecksumTypeAndValue(df.getChecksumType(), df.getChecksumValue()))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,6 @@ public void testGetFileCategories() {
assertEquals("Data", jsonObject.getJsonArray("categories").getString(0));
assertEquals("", jsonObject.getJsonObject("dataFile").getString("filename"));
assertEquals(-1, jsonObject.getJsonObject("dataFile").getInt("filesize"));
assertEquals("UNKNOWN", jsonObject.getJsonObject("dataFile").getString("originalFormatLabel"));
assertEquals(-1, jsonObject.getJsonObject("dataFile").getInt("rootDataFileId"));
assertEquals("Survey", jsonObject.getJsonObject("dataFile").getJsonArray("tabularTags").getString(0));
}
Expand Down