Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SEDONA-497] Fix incorrect fieldNames property of SpatialRDD read from a directory containing multiple shapefiles #1243

Merged
merged 1 commit into from
Feb 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,16 @@ public static List<String> readFieldNames(JavaSparkContext sc, String inputPath)
fieldDescriptors = fieldDescriptors.reduceByKey(new Function2<String, String, String>()
{
@Override
public String call(String descripter1, String descripter2)
public String call(String descriptor1, String descriptor2)
throws Exception
{
return descripter1 + " " + descripter2;
if (!descriptor1.equals(descriptor2)) {
String message = String.format("Detected different schema in the input shapefiles:\n %s\n %s\n" +
"Please make sure all shapefiles have the same schema.",
descriptor1, descriptor2);
throw new IOException(message);
}
return descriptor1;
}
});
// if there is a result assign it to variable : fieldNames
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

package org.apache.sedona.core.formatMapper.shapefileParser.shapes;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.LocatedFileStatus;
Expand All @@ -35,6 +36,7 @@
import org.apache.sedona.core.spatialRDD.PointRDD;
import org.apache.sedona.core.spatialRDD.PolygonRDD;
import org.apache.sedona.core.spatialRDD.SpatialRDD;
import org.apache.spark.SparkException;
import org.geotools.data.DataStore;
import org.geotools.data.DataStoreFinder;
import org.geotools.data.FeatureSource;
Expand Down Expand Up @@ -62,6 +64,8 @@
import java.util.Map;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;

public class ShapefileReaderTest
extends TestBase
Expand Down Expand Up @@ -392,5 +396,27 @@ public void testReadMultipleShapeFilesByMultiPartitions()
// load shapes with our tool
SpatialRDD shapeRDD = ShapefileReader.readToGeometryRDD(sc, inputLocation);
assert (shapeRDD.rawSpatialRDD.getNumPartitions() == 2);
assertEquals("[STATEFP, COUNTYFP, COUNTYNS, AFFGEOID, GEOID, NAME, LSAD, ALAND, AWATER]", shapeRDD.fieldNames.toString());
}

/**
 * Reading a directory whose shapefiles do not share one schema must fail.
 * <p>
 * The contents of two fixture directories are copied into a single scratch directory,
 * which is then handed to {@code ShapefileReader.readToGeometryRDD}. The read is expected
 * to raise a {@link SparkException} whose message mentions "different schema". The scratch
 * directory is removed afterwards whether or not the assertions pass.
 *
 * @throws IOException if copying the fixtures or deleting the scratch directory fails
 */
@Test
public void testReadMultipleShapeFilesWithInconsistentSchema() throws IOException {
    String mergedPath = getShapeFilePath("multipleshapefiles") + "-multischema";
    File mergedDir = new File(mergedPath);
    try {
        File firstFixture = new File(getShapeFilePath("multipleshapefiles"));
        File secondFixture = new File(getShapeFilePath("gis_osm_pois_free_1"));

        // Both fixtures land in the same directory so the reader sees them as one input.
        FileUtils.copyDirectory(firstFixture, mergedDir);
        FileUtils.copyDirectory(secondFixture, mergedDir);

        SparkException failure = assertThrows(
                SparkException.class,
                () -> ShapefileReader.readToGeometryRDD(sc, mergedPath));
        String failureMessage = failure.getMessage();
        assertTrue(failureMessage.contains("different schema"));
    } finally {
        // Always clean up the scratch directory, even when an assertion above fails.
        if (mergedDir.exists()) {
            FileUtils.deleteDirectory(mergedDir);
        }
    }
}
}
Loading