Skip to content

Commit

Permalink
Merge pull request #4708 from IQSS/2301-stata
Browse files Browse the repository at this point in the history
support for Stata 14 and 15 #2301
  • Loading branch information
kcondon authored Jul 10, 2018
2 parents 2cd8ec2 + 1b222ef commit b2fe2e0
Show file tree
Hide file tree
Showing 24 changed files with 2,559 additions and 2,435 deletions.
Binary file added downloads/stata-13-test-files/Stata14TestFile.dta
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added scripts/search/data/tabular/stata13-auto.dta
Binary file not shown.
Binary file not shown.
2 changes: 2 additions & 0 deletions src/main/java/MimeTypeDisplay.properties
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ application/x-R-2=R Binary
application/x-stata=Stata Binary
application/x-stata-6=Stata Binary
application/x-stata-13=Stata 13 Binary
application/x-stata-14=Stata 14 Binary
application/x-stata-15=Stata 15 Binary
text/x-stata-syntax=Stata Syntax
application/x-spss-por=SPSS Portable
application/x-spss-sav=SPSS SAV
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/MimeTypeFacets.properties
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ application/x-R-2=data
application/x-stata=data
application/x-stata-6=data
application/x-stata-13=data
application/x-stata-14=data
application/x-stata-15=data
text/x-stata-syntax=data
application/x-spss-por=data
application/x-spss-sav=data
Expand Down
5 changes: 4 additions & 1 deletion src/main/java/edu/harvard/iq/dataverse/api/TestIngest.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import edu.harvard.iq.dataverse.DataFile;
import edu.harvard.iq.dataverse.DataTable;
import edu.harvard.iq.dataverse.Dataset;
import edu.harvard.iq.dataverse.DatasetServiceBean;
import edu.harvard.iq.dataverse.FileMetadata;
import edu.harvard.iq.dataverse.ingest.IngestServiceBean;
Expand Down Expand Up @@ -101,7 +102,7 @@ public String datafile(@QueryParam("fileName") String fileName, @QueryParam("fil
try {
tabDataIngest = ingestPlugin.read(fileInputStream, null);
} catch (IOException ingestEx) {
output = output.concat("Caught an exception trying to ingest file "+fileName+".");
output = output.concat("Caught an exception trying to ingest file " + fileName + ": " + ingestEx.getLocalizedMessage());
return output;
}

Expand All @@ -121,6 +122,8 @@ public String datafile(@QueryParam("fileName") String fileName, @QueryParam("fil

DataFile dataFile = new DataFile();
dataFile.setStorageIdentifier(tabFilename);
Dataset dataset = new Dataset();
dataFile.setOwner(dataset);

FileMetadata fileMetadata = new FileMetadata();
fileMetadata.setLabel(fileName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ private static String generateOriginalExtension(String fileType) {
return ".sav";
} else if (fileType.equalsIgnoreCase("application/x-spss-por")) {
return ".por";
} else if (fileType.equalsIgnoreCase("application/x-stata") || fileType.equalsIgnoreCase("application/x-stata-13")) {
} else if (fileType.equalsIgnoreCase("application/x-stata") || fileType.equalsIgnoreCase("application/x-stata-13") || fileType.equalsIgnoreCase("application/x-stata-14") || fileType.equalsIgnoreCase("application/x-stata-15")) {
return ".dta";
} else if (fileType.equalsIgnoreCase("application/x-dvn-csvspss-zip")) {
return ".zip";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ public void onMessage(Message message) {
//Thread.sleep(10000);
logger.fine("Finished ingest job;");
} else {
logger.warning("Error occurred during ingest job!");
logger.warning("Error occurred during ingest job for file id " + datafile_id + "!");
}
} catch (Exception ex) {
//ex.printStackTrace();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader;
import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest;
import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta.DTAFileReader;
import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta.DTA117FileReader;
import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta.NewDTAFileReader;
import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta.DTAFileReaderSpi;
import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.rdata.RDATAFileReader;
import edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.rdata.RDATAFileReaderSpi;
Expand Down Expand Up @@ -545,9 +545,6 @@ public void produceContinuousSummaryStatistics(DataFile dataFile, File generated
if (dataFile.getDataTable().getDataVariables().get(i).isIntervalContinuous()) {
logger.fine("subsetting continuous vector");

StorageIO<DataFile> storageIO = dataFile.getStorageIO();
storageIO.open();

if ("float".equals(dataFile.getDataTable().getDataVariables().get(i).getFormat())) {
Float[] variableVector = TabularSubsetGenerator.subsetFloatVector(new FileInputStream(generatedTabularFile), i, dataFile.getDataTable().getCaseQuantity().intValue());
logger.fine("Calculating summary statistics on a Float vector;");
Expand Down Expand Up @@ -579,9 +576,6 @@ public void produceDiscreteNumericSummaryStatistics(DataFile dataFile, File gene
&& dataFile.getDataTable().getDataVariables().get(i).isTypeNumeric()) {
logger.fine("subsetting discrete-numeric vector");

StorageIO<DataFile> storageIO = dataFile.getStorageIO();
storageIO.open();

Long[] variableVector = TabularSubsetGenerator.subsetLongVector(new FileInputStream(generatedTabularFile), i, dataFile.getDataTable().getCaseQuantity().intValue());
// We are discussing calculating the same summary stats for
// all numerics (the same kind of sumstats that we've been calculating
Expand Down Expand Up @@ -615,9 +609,6 @@ public void produceCharacterSummaryStatistics(DataFile dataFile, File generatedT
for (int i = 0; i < dataFile.getDataTable().getVarQuantity(); i++) {
if (dataFile.getDataTable().getDataVariables().get(i).isTypeCharacter()) {

StorageIO<DataFile> storageIO = dataFile.getStorageIO();
storageIO.open();

logger.fine("subsetting character vector");
String[] variableVector = TabularSubsetGenerator.subsetStringVector(new FileInputStream(generatedTabularFile), i, dataFile.getDataTable().getCaseQuantity().intValue());
//calculateCharacterSummaryStatistics(dataFile, i, variableVector);
Expand Down Expand Up @@ -675,6 +666,7 @@ public boolean ingestAsTabular(Long datafile_id) { //DataFile dataFile) throws I
// it up with the Ingest Service Provider Registry:
String fileName = dataFile.getFileMetadata().getLabel();
TabularDataFileReader ingestPlugin = getTabDataReaderByMimeType(dataFile.getContentType());
logger.fine("Using ingest plugin " + ingestPlugin.getClass());

if (ingestPlugin == null) {
dataFile.SetIngestProblem();
Expand Down Expand Up @@ -739,7 +731,7 @@ public boolean ingestAsTabular(Long datafile_id) { //DataFile dataFile) throws I
dataFile = fileService.save(dataFile);

dataFile = fileService.save(dataFile);
logger.fine("Ingest failure (IO Exception): "+ingestEx.getMessage()+ ".");
logger.warning("Ingest failure (IO Exception): " + ingestEx.getMessage() + ".");
return false;
} catch (Exception unknownEx) {
// this is a bit of a kludge, to make sure no unknown exceptions are
Expand Down Expand Up @@ -801,6 +793,7 @@ public boolean ingestAsTabular(Long datafile_id) { //DataFile dataFile) throws I
}

if (!postIngestTasksSuccessful) {
logger.warning("Ingest failure (!postIngestTasksSuccessful).");
return false;
}

Expand Down Expand Up @@ -847,6 +840,7 @@ public boolean ingestAsTabular(Long datafile_id) { //DataFile dataFile) throws I
}

if (!databaseSaveSuccessful) {
logger.warning("Ingest failure (!databaseSaveSuccessful).");
return false;
}

Expand Down Expand Up @@ -897,6 +891,7 @@ public boolean ingestAsTabular(Long datafile_id) { //DataFile dataFile) throws I
logger.warning("Ingest failed to produce data obect.");
}

logger.fine("Returning ingestSuccessful: " + ingestSuccessful);
return ingestSuccessful;
}

Expand Down Expand Up @@ -949,7 +944,11 @@ public static TabularDataFileReader getTabDataReaderByMimeType(String mimeType)
if (mimeType.equals(FileUtil.MIME_TYPE_STATA)) {
ingestPlugin = new DTAFileReader(new DTAFileReaderSpi());
} else if (mimeType.equals(FileUtil.MIME_TYPE_STATA13)) {
ingestPlugin = new DTA117FileReader(new DTAFileReaderSpi());
ingestPlugin = new NewDTAFileReader(new DTAFileReaderSpi(), 117);
} else if (mimeType.equals(FileUtil.MIME_TYPE_STATA14)) {
ingestPlugin = new NewDTAFileReader(new DTAFileReaderSpi(), 118);
} else if (mimeType.equals(FileUtil.MIME_TYPE_STATA15)) {
ingestPlugin = new NewDTAFileReader(new DTAFileReaderSpi(), 119);
} else if (mimeType.equals(FileUtil.MIME_TYPE_RDATA)) {
ingestPlugin = new RDATAFileReader(new RDATAFileReaderSpi());
} else if (mimeType.equals(FileUtil.MIME_TYPE_CSV) || mimeType.equals(FileUtil.MIME_TYPE_CSV_ALT)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ public class IngestableDataChecker implements java.io.Serializable {
// Map that returns a Stata Release number
private static Map<Byte, String> stataReleaseNumber = new HashMap<Byte, String>();
public static String STATA_13_HEADER = "<stata_dta><header><release>117</release>";
public static String STATA_14_HEADER = "<stata_dta><header><release>118</release>";
public static String STATA_15_HEADER = "<stata_dta><header><release>119</release>";
// Map that returns a reader-implemented mime-type
private static Set<String> readableFileTypes = new HashSet<String>();
private static Map<String, Method> testMethods = new HashMap<String, Method>();
Expand Down Expand Up @@ -91,6 +93,8 @@ public class IngestableDataChecker implements java.io.Serializable {
readableFileTypes.add("application/x-spss-por");
readableFileTypes.add("application/x-rlang-transport");
readableFileTypes.add("application/x-stata-13");
readableFileTypes.add("application/x-stata-14");
readableFileTypes.add("application/x-stata-15");

Pattern p = Pattern.compile(regex);
ptn = Pattern.compile(rdargx);
Expand Down Expand Up @@ -259,7 +263,45 @@ public String testDTAformat(MappedByteBuffer buff) {
}

}


if ((result == null) && (buff.capacity() >= STATA_14_HEADER.length())) {
// Let's see if it's a "new" STATA (v.14+) format:
buff.rewind();
byte[] headerBuffer = null;
String headerString = null;
try {
headerBuffer = new byte[STATA_14_HEADER.length()];
buff.get(headerBuffer, 0, STATA_14_HEADER.length());
headerString = new String(headerBuffer, "US-ASCII");
} catch (Exception ex) {
// probably a buffer underflow exception;
// we don't have to do anything... null will
// be returned, below.
}
if (STATA_14_HEADER.equals(headerString)) {
result = "application/x-stata-14";
}
}

if ((result == null) && (buff.capacity() >= STATA_15_HEADER.length())) {
// Otherwise, let's see if it's the STATA v.15 (release 119) format:
buff.rewind();
byte[] headerBuffer = null;
String headerString = null;
try {
headerBuffer = new byte[STATA_15_HEADER.length()];
buff.get(headerBuffer, 0, STATA_15_HEADER.length());
headerString = new String(headerBuffer, "US-ASCII");
} catch (Exception ex) {
// probably a buffer underflow exception;
// we don't have to do anything... null will
// be returned, below.
}
if (STATA_15_HEADER.equals(headerString)) {
result = "application/x-stata-15";
}
}

return result;
}

Expand Down
Loading

0 comments on commit b2fe2e0

Please sign in to comment.