From f0b25fc03d6edfd95b18082303e57e90eabbe5d1 Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 4 Apr 2018 10:22:32 -0400 Subject: [PATCH 1/8] Initial pass io.archivesunleashed.spark -> io.archivesunleashed package renaming. --- .../{spark => }/archive/io/ArchiveRecord.scala | 6 +++--- .../{spark => }/matchbox/ComputeImageSize.scala | 2 +- .../{spark => }/matchbox/ComputeMD5.scala | 2 +- .../{spark => }/matchbox/DetectLanguage.scala | 2 +- .../{spark => }/matchbox/DetectMimeTypeTika.scala | 2 +- .../{spark => }/matchbox/ExtractAtMentions.scala | 2 +- .../{spark => }/matchbox/ExtractBoilerpipeText.scala | 2 +- .../{spark => }/matchbox/ExtractDate.scala | 2 +- .../{spark => }/matchbox/ExtractDomain.scala | 2 +- .../{spark => }/matchbox/ExtractEntities.scala | 2 +- .../{spark => }/matchbox/ExtractGraph.scala | 10 +++++----- .../{spark => }/matchbox/ExtractHashtags.scala | 2 +- .../{spark => }/matchbox/ExtractImageLinks.scala | 2 +- .../{spark => }/matchbox/ExtractLinks.scala | 2 +- .../{spark => }/matchbox/ExtractPopularImages.scala | 6 +++--- .../{spark => }/matchbox/ExtractTextFromPDFs.scala | 2 +- .../{spark => }/matchbox/ExtractUrls.scala | 2 +- .../{spark => }/matchbox/NER3Classifier.scala | 2 +- .../{spark => }/matchbox/NERCombinedJson.scala | 4 ++-- .../{spark => }/matchbox/RecordLoader.scala | 6 +++--- .../{spark => }/matchbox/RemoveHTML.scala | 2 +- .../{spark => }/matchbox/RemoveHttpHeader.scala | 2 +- .../{spark => }/matchbox/StringUtils.scala | 2 +- .../{spark => }/matchbox/TupleFormatter.scala | 2 +- .../{spark => }/matchbox/TweetUtils.scala | 2 +- .../{spark => }/matchbox/WriteGEXF.scala | 2 +- .../{spark => }/matchbox/WriteGraphML.scala | 2 +- .../archivesunleashed/{spark => }/rdd/RecordRDD.scala | 10 +++++----- .../archivesunleashed/{spark => }/utils/JsonUtil.scala | 2 +- .../io/archivesunleashed/{spark => }/ArcTest.scala | 6 +++--- .../{spark => }/ArchiveRecordTest.scala | 6 ++++-- .../io/archivesunleashed/{spark => }/WarcTest.scala | 9 +++++---- .../{spark => }/matchbox/ComputeImageSizeTest.scala | 2 +- .../{spark => }/matchbox/ExtractAtMentionsTest.scala | 2 +- .../matchbox/ExtractBoilerPipeTextTest.scala | 2 +- .../{spark => }/matchbox/ExtractDateTest.scala | 4 ++-- .../{spark => }/matchbox/ExtractDomainTest.scala | 2 +- .../{spark => }/matchbox/ExtractEntitiesTest.scala | 4 ++-- .../{spark => }/matchbox/ExtractGraphTest.scala | 4 ++-- .../{spark => }/matchbox/ExtractHashtagsTest.scala | 2 +- .../{spark => }/matchbox/ExtractImageLinksTest.scala | 2 +- .../{spark => }/matchbox/ExtractLinksTest.scala | 2 +- .../matchbox/ExtractPopularImagesTest.scala | 6 +++--- .../{spark => }/matchbox/ExtractTextFromPDFsTest.scala | 4 ++-- .../{spark => }/matchbox/ExtractUrlsTest.scala | 2 +- .../{spark => }/matchbox/RecordLoaderTest.scala | 10 +++++----- .../{spark => }/matchbox/RemoveHTMLTest.scala | 2 +- .../{spark => }/matchbox/RemoveHttpHeaderTest.scala | 2 +- .../{spark => }/matchbox/StringUtilsTest.scala | 3 ++- .../{spark => }/matchbox/TupleFormatterTest.scala | 2 +- .../{spark => }/matchbox/TweetUtilsTest.scala | 2 +- .../{spark => }/matchbox/WriteGEXFTest.scala | 6 +++--- .../{spark => }/matchbox/WriteGraphMLTest.scala | 6 +++--- .../{spark => }/rdd/CountableRDDTest.scala | 6 +++--- .../{spark => }/rdd/RecordRDDTest.scala | 10 +++++----- .../{spark => }/utils/JsonUtilTest.scala | 4 ++-- 56 files changed, 102 insertions(+), 98 deletions(-) rename src/main/scala/io/archivesunleashed/{spark => }/archive/io/ArchiveRecord.scala (93%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/ComputeImageSize.scala (96%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/ComputeMD5.scala (95%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/DetectLanguage.scala (95%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/DetectMimeTypeTika.scala (96%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/ExtractAtMentions.scala (94%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/ExtractBoilerpipeText.scala (96%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/ExtractDate.scala (97%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/ExtractDomain.scala (96%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/ExtractEntities.scala (98%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/ExtractGraph.scala (92%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/ExtractHashtags.scala (94%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/ExtractImageLinks.scala (97%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/ExtractLinks.scala (97%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/ExtractPopularImages.scala (92%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/ExtractTextFromPDFs.scala (95%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/ExtractUrls.scala (94%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/NER3Classifier.scala (98%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/NERCombinedJson.scala (98%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/RecordLoader.scala (92%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/RemoveHTML.scala (95%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/RemoveHttpHeader.scala (95%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/StringUtils.scala (96%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/TupleFormatter.scala (97%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/TweetUtils.scala (97%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/WriteGEXF.scala (98%) rename src/main/scala/io/archivesunleashed/{spark => }/matchbox/WriteGraphML.scala (98%) rename src/main/scala/io/archivesunleashed/{spark => }/rdd/RecordRDD.scala (92%) rename src/main/scala/io/archivesunleashed/{spark => }/utils/JsonUtil.scala (96%) rename src/test/scala/io/archivesunleashed/{spark => }/ArcTest.scala (96%) rename src/test/scala/io/archivesunleashed/{spark => }/ArchiveRecordTest.scala (95%) rename src/test/scala/io/archivesunleashed/{spark => }/WarcTest.scala (90%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/ComputeImageSizeTest.scala (96%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/ExtractAtMentionsTest.scala (96%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/ExtractBoilerPipeTextTest.scala (96%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/ExtractDateTest.scala (93%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/ExtractDomainTest.scala (97%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/ExtractEntitiesTest.scala (96%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/ExtractGraphTest.scala (97%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/ExtractHashtagsTest.scala (96%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/ExtractImageLinksTest.scala (97%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/ExtractLinksTest.scala (98%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/ExtractPopularImagesTest.scala (93%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/ExtractTextFromPDFsTest.scala (91%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/ExtractUrlsTest.scala (96%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/RecordLoaderTest.scala (88%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/RemoveHTMLTest.scala (96%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/RemoveHttpHeaderTest.scala (96%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/StringUtilsTest.scala (97%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/TupleFormatterTest.scala (97%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/TweetUtilsTest.scala (97%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/WriteGEXFTest.scala (94%) rename src/test/scala/io/archivesunleashed/{spark => }/matchbox/WriteGraphMLTest.scala (94%) rename src/test/scala/io/archivesunleashed/{spark => }/rdd/CountableRDDTest.scala (92%) rename src/test/scala/io/archivesunleashed/{spark => }/rdd/RecordRDDTest.scala (95%) rename src/test/scala/io/archivesunleashed/{spark => }/utils/JsonUtilTest.scala (93%) diff --git a/src/main/scala/io/archivesunleashed/spark/archive/io/ArchiveRecord.scala b/src/main/scala/io/archivesunleashed/archive/io/ArchiveRecord.scala similarity index 93% rename from src/main/scala/io/archivesunleashed/spark/archive/io/ArchiveRecord.scala rename to src/main/scala/io/archivesunleashed/archive/io/ArchiveRecord.scala index eea02a99..04db0b07 100644 --- a/src/main/scala/io/archivesunleashed/spark/archive/io/ArchiveRecord.scala +++ b/src/main/scala/io/archivesunleashed/archive/io/ArchiveRecord.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.archive.io +package io.archivesunleashed.archive.io import java.text.SimpleDateFormat @@ -26,8 +26,8 @@ import org.archive.util.ArchiveUtils import io.archivesunleashed.data.{ArcRecordUtils, WarcRecordUtils} import io.archivesunleashed.io.ArchiveRecordWritable import io.archivesunleashed.io.ArchiveRecordWritable.ArchiveFormat -import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent -import io.archivesunleashed.spark.matchbox.{RemoveHttpHeader, ExtractDate, ExtractDomain} +import io.archivesunleashed.matchbox.ExtractDate.DateComponent +import io.archivesunleashed.matchbox.{RemoveHttpHeader, ExtractDate, ExtractDomain} class ArchiveRecord(r: SerializableWritable[ArchiveRecordWritable]) extends Serializable { var arcRecord: ARCRecord = null diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/ComputeImageSize.scala b/src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala similarity index 96% rename from src/main/scala/io/archivesunleashed/spark/matchbox/ComputeImageSize.scala rename to src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala index 9e3f8cb6..e470daea 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/ComputeImageSize.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import java.io.ByteArrayInputStream import javax.imageio.ImageIO diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/ComputeMD5.scala b/src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala similarity index 95% rename from src/main/scala/io/archivesunleashed/spark/matchbox/ComputeMD5.scala rename to src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala index 4e0b068a..3c02b504 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/ComputeMD5.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import java.security.MessageDigest diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/DetectLanguage.scala b/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala similarity index 95% rename from src/main/scala/io/archivesunleashed/spark/matchbox/DetectLanguage.scala rename to src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala index 891bce17..a64b4a05 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/DetectLanguage.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.apache.tika.language.LanguageIdentifier diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/DetectMimeTypeTika.scala b/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala similarity index 96% rename from src/main/scala/io/archivesunleashed/spark/matchbox/DetectMimeTypeTika.scala rename to src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala index 922993cc..26b1af27 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/DetectMimeTypeTika.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import java.io.ByteArrayInputStream diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractAtMentions.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractAtMentions.scala similarity index 94% rename from src/main/scala/io/archivesunleashed/spark/matchbox/ExtractAtMentions.scala rename to src/main/scala/io/archivesunleashed/matchbox/ExtractAtMentions.scala index 20565905..e3a69982 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractAtMentions.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractAtMentions.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox object ExtractAtMentions { val pattern = """@[A-Za-z_0-9]+""".r diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractBoilerpipeText.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala similarity index 96% rename from src/main/scala/io/archivesunleashed/spark/matchbox/ExtractBoilerpipeText.scala rename to src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala index 29438896..6a3a8b76 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractBoilerpipeText.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import java.io.IOException import de.l3s.boilerpipe.extractors.DefaultExtractor diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractDate.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala similarity index 97% rename from src/main/scala/io/archivesunleashed/spark/matchbox/ExtractDate.scala rename to src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala index 41daabef..775e4503 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractDate.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox /** * Simple wrapper for getting different parts of a date diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractDomain.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala similarity index 96% rename from src/main/scala/io/archivesunleashed/spark/matchbox/ExtractDomain.scala rename to src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala index 43362f32..6f617e1b 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractDomain.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import java.net.URL diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractEntities.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractEntities.scala similarity index 98% rename from src/main/scala/io/archivesunleashed/spark/matchbox/ExtractEntities.scala rename to src/main/scala/io/archivesunleashed/matchbox/ExtractEntities.scala index 0faf9c9d..40a06fd8 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractEntities.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractEntities.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractGraph.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractGraph.scala similarity index 92% rename from src/main/scala/io/archivesunleashed/spark/matchbox/ExtractGraph.scala rename to src/main/scala/io/archivesunleashed/matchbox/ExtractGraph.scala index a232b6bb..2d70c91b 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractGraph.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractGraph.scala @@ -14,14 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox -import io.archivesunleashed.spark.archive.io.ArchiveRecord +import io.archivesunleashed.archive.io.ArchiveRecord import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD -import io.archivesunleashed.spark.matchbox.StringUtils._ -import io.archivesunleashed.spark.rdd.RecordRDD._ -import io.archivesunleashed.spark.utils.JsonUtil +import io.archivesunleashed.matchbox.StringUtils._ +import io.archivesunleashed.rdd.RecordRDD._ +import io.archivesunleashed.utils.JsonUtil /** * diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractHashtags.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractHashtags.scala similarity index 94% rename from src/main/scala/io/archivesunleashed/spark/matchbox/ExtractHashtags.scala rename to src/main/scala/io/archivesunleashed/matchbox/ExtractHashtags.scala index 37144862..637bba8d 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractHashtags.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractHashtags.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox object ExtractHashtags { val pattern = """#[^ ]+""".r diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractImageLinks.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractImageLinks.scala similarity index 97% rename from src/main/scala/io/archivesunleashed/spark/matchbox/ExtractImageLinks.scala rename to src/main/scala/io/archivesunleashed/matchbox/ExtractImageLinks.scala index f0a2b8f2..8af62bbb 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractImageLinks.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractImageLinks.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import java.io.IOException diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractLinks.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala similarity index 97% rename from src/main/scala/io/archivesunleashed/spark/matchbox/ExtractLinks.scala rename to src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala index eda67fa4..b8b26140 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractLinks.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import java.io.IOException diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractPopularImages.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractPopularImages.scala similarity index 92% rename from src/main/scala/io/archivesunleashed/spark/matchbox/ExtractPopularImages.scala rename to src/main/scala/io/archivesunleashed/matchbox/ExtractPopularImages.scala index b24fc74f..e9a1995b 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractPopularImages.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractPopularImages.scala @@ -14,12 +14,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.apache.spark.{RangePartitioner, SparkContext} -import io.archivesunleashed.spark.rdd.RecordRDD._ +import io.archivesunleashed.rdd.RecordRDD._ import org.apache.spark.rdd.RDD -import io.archivesunleashed.spark.archive.io.ArchiveRecord +import io.archivesunleashed.archive.io.ArchiveRecord /** diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractTextFromPDFs.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFs.scala similarity index 95% rename from src/main/scala/io/archivesunleashed/spark/matchbox/ExtractTextFromPDFs.scala rename to src/main/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFs.scala index 5c347e7b..06bc110b 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractTextFromPDFs.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFs.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import java.io.ByteArrayInputStream diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractUrls.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractUrls.scala similarity index 94% rename from src/main/scala/io/archivesunleashed/spark/matchbox/ExtractUrls.scala rename to src/main/scala/io/archivesunleashed/matchbox/ExtractUrls.scala index 2c4f308c..cf1bf94d 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/ExtractUrls.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractUrls.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox object ExtractUrls { val pattern = """https?://[^ ]+""".r diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/NER3Classifier.scala b/src/main/scala/io/archivesunleashed/matchbox/NER3Classifier.scala similarity index 98% rename from src/main/scala/io/archivesunleashed/spark/matchbox/NER3Classifier.scala rename to src/main/scala/io/archivesunleashed/matchbox/NER3Classifier.scala index 6291a190..b10d009c 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/NER3Classifier.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/NER3Classifier.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import java.util diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/NERCombinedJson.scala b/src/main/scala/io/archivesunleashed/matchbox/NERCombinedJson.scala similarity index 98% rename from src/main/scala/io/archivesunleashed/spark/matchbox/NERCombinedJson.scala rename to src/main/scala/io/archivesunleashed/matchbox/NERCombinedJson.scala index 752fc631..c531a8c3 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/NERCombinedJson.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/NERCombinedJson.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import java.io.BufferedReader import java.io.BufferedWriter @@ -23,7 +23,7 @@ import java.io.OutputStreamWriter import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.spark.SparkContext -import io.archivesunleashed.spark.utils.JsonUtil +import io.archivesunleashed.utils.JsonUtil import scala.collection.mutable.MutableList import scala.util.Random diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/RecordLoader.scala b/src/main/scala/io/archivesunleashed/matchbox/RecordLoader.scala similarity index 92% rename from src/main/scala/io/archivesunleashed/spark/matchbox/RecordLoader.scala rename to src/main/scala/io/archivesunleashed/matchbox/RecordLoader.scala index b0651d25..9c35a5a5 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/RecordLoader.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/RecordLoader.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.apache.hadoop.io.LongWritable import org.apache.spark.{SerializableWritable, SparkContext} @@ -24,8 +24,8 @@ import org.json4s.jackson.JsonMethods._ import io.archivesunleashed.io.ArchiveRecordWritable.ArchiveFormat import io.archivesunleashed.io.ArchiveRecordWritable import io.archivesunleashed.mapreduce.WacInputFormat -import io.archivesunleashed.spark.archive.io.ArchiveRecord -import io.archivesunleashed.spark.rdd.RecordRDD._ +import io.archivesunleashed.archive.io.ArchiveRecord +import io.archivesunleashed.rdd.RecordRDD._ object RecordLoader { diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/RemoveHTML.scala b/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala similarity index 95% rename from src/main/scala/io/archivesunleashed/spark/matchbox/RemoveHTML.scala rename to src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala index a1dc8246..1978b385 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/RemoveHTML.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import java.io.IOException diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/RemoveHttpHeader.scala b/src/main/scala/io/archivesunleashed/matchbox/RemoveHttpHeader.scala similarity index 95% rename from src/main/scala/io/archivesunleashed/spark/matchbox/RemoveHttpHeader.scala rename to src/main/scala/io/archivesunleashed/matchbox/RemoveHttpHeader.scala index 9a79c4c1..df2468cc 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/RemoveHttpHeader.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/RemoveHttpHeader.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox /** * Created by youngbinkim on 7/9/16. diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/StringUtils.scala b/src/main/scala/io/archivesunleashed/matchbox/StringUtils.scala similarity index 96% rename from src/main/scala/io/archivesunleashed/spark/matchbox/StringUtils.scala rename to src/main/scala/io/archivesunleashed/matchbox/StringUtils.scala index f85bd8ce..4b9e04cc 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/StringUtils.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/StringUtils.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import scala.xml.Utility.escape import java.io.IOException import java.security.MessageDigest diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/TupleFormatter.scala b/src/main/scala/io/archivesunleashed/matchbox/TupleFormatter.scala similarity index 97% rename from src/main/scala/io/archivesunleashed/spark/matchbox/TupleFormatter.scala rename to src/main/scala/io/archivesunleashed/matchbox/TupleFormatter.scala index 3fead692..2884ef74 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/TupleFormatter.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/TupleFormatter.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import shapeless._ // v2.0.0, for full compatibility with Scala 2.10.4 (Spark dep) import ops.tuple.FlatMapper diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/TweetUtils.scala b/src/main/scala/io/archivesunleashed/matchbox/TweetUtils.scala similarity index 97% rename from src/main/scala/io/archivesunleashed/spark/matchbox/TweetUtils.scala rename to src/main/scala/io/archivesunleashed/matchbox/TweetUtils.scala index 1992b45c..78cbd15a 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/TweetUtils.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/TweetUtils.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.json4s.JsonAST._ diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/WriteGEXF.scala b/src/main/scala/io/archivesunleashed/matchbox/WriteGEXF.scala similarity index 98% rename from src/main/scala/io/archivesunleashed/spark/matchbox/WriteGEXF.scala rename to src/main/scala/io/archivesunleashed/matchbox/WriteGEXF.scala index c98c1894..f716f9f0 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/WriteGEXF.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/WriteGEXF.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import java.io.BufferedWriter import java.io.OutputStreamWriter diff --git a/src/main/scala/io/archivesunleashed/spark/matchbox/WriteGraphML.scala b/src/main/scala/io/archivesunleashed/matchbox/WriteGraphML.scala similarity index 98% rename from src/main/scala/io/archivesunleashed/spark/matchbox/WriteGraphML.scala rename to src/main/scala/io/archivesunleashed/matchbox/WriteGraphML.scala index 9f1bac8c..20748545 100644 --- a/src/main/scala/io/archivesunleashed/spark/matchbox/WriteGraphML.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/WriteGraphML.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import java.io.BufferedWriter import java.io.OutputStreamWriter diff --git a/src/main/scala/io/archivesunleashed/spark/rdd/RecordRDD.scala b/src/main/scala/io/archivesunleashed/rdd/RecordRDD.scala similarity index 92% rename from src/main/scala/io/archivesunleashed/spark/rdd/RecordRDD.scala rename to src/main/scala/io/archivesunleashed/rdd/RecordRDD.scala index 20e27857..e4f6ab3b 100644 --- a/src/main/scala/io/archivesunleashed/spark/rdd/RecordRDD.scala +++ b/src/main/scala/io/archivesunleashed/rdd/RecordRDD.scala @@ -14,13 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.rdd +package io.archivesunleashed.rdd -import io.archivesunleashed.spark.archive.io.ArchiveRecord +import io.archivesunleashed.archive.io.ArchiveRecord import org.apache.spark.rdd.RDD -import io.archivesunleashed.spark.matchbox.{DetectLanguage, ExtractDate, ExtractDomain, RemoveHTML} -import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent -import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent.DateComponent +import io.archivesunleashed.matchbox.{DetectLanguage, ExtractDate, ExtractDomain, RemoveHTML} +import io.archivesunleashed.matchbox.ExtractDate.DateComponent +import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent import scala.reflect.ClassTag import scala.util.matching.Regex diff --git a/src/main/scala/io/archivesunleashed/spark/utils/JsonUtil.scala b/src/main/scala/io/archivesunleashed/utils/JsonUtil.scala similarity index 96% rename from src/main/scala/io/archivesunleashed/spark/utils/JsonUtil.scala rename to src/main/scala/io/archivesunleashed/utils/JsonUtil.scala index 66c767a0..786eaa59 100644 --- a/src/main/scala/io/archivesunleashed/spark/utils/JsonUtil.scala +++ b/src/main/scala/io/archivesunleashed/utils/JsonUtil.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.utils +package io.archivesunleashed.utils import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} import com.fasterxml.jackson.module.scala.DefaultScalaModule diff --git a/src/test/scala/io/archivesunleashed/spark/ArcTest.scala b/src/test/scala/io/archivesunleashed/ArcTest.scala similarity index 96% rename from src/test/scala/io/archivesunleashed/spark/ArcTest.scala rename to src/test/scala/io/archivesunleashed/ArcTest.scala index 1046d32c..3032354b 100644 --- a/src/test/scala/io/archivesunleashed/spark/ArcTest.scala +++ b/src/test/scala/io/archivesunleashed/ArcTest.scala @@ -21,9 +21,9 @@ import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} -import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent -import io.archivesunleashed.spark.matchbox._ -import io.archivesunleashed.spark.rdd.RecordRDD._ +import io.archivesunleashed.matchbox.ExtractDate.DateComponent +import io.archivesunleashed.matchbox._ +import io.archivesunleashed.rdd.RecordRDD._ @RunWith(classOf[JUnitRunner]) class ArcTest extends FunSuite with BeforeAndAfter { diff --git a/src/test/scala/io/archivesunleashed/spark/ArchiveRecordTest.scala b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala similarity index 95% rename from src/test/scala/io/archivesunleashed/spark/ArchiveRecordTest.scala rename to src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala index 058ad434..e4514fc5 100644 --- a/src/test/scala/io/archivesunleashed/spark/ArchiveRecordTest.scala +++ b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala @@ -14,14 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark +package io.archivesunleashed + +import matchbox._ import com.google.common.io.Resources import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} -import io.archivesunleashed.spark.matchbox._ + @RunWith(classOf[JUnitRunner]) class ArchiveRecordTest extends FunSuite with BeforeAndAfter { diff --git a/src/test/scala/io/archivesunleashed/spark/WarcTest.scala b/src/test/scala/io/archivesunleashed/WarcTest.scala similarity index 90% rename from src/test/scala/io/archivesunleashed/spark/WarcTest.scala rename to src/test/scala/io/archivesunleashed/WarcTest.scala index 0f354e9e..e5902dda 100644 --- a/src/test/scala/io/archivesunleashed/spark/WarcTest.scala +++ b/src/test/scala/io/archivesunleashed/WarcTest.scala @@ -14,17 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark +package io.archivesunleashed + +import archive.io.ArchiveRecord +import matchbox.RecordLoader +import rdd.RecordRDD._ import com.google.common.io.Resources -import io.archivesunleashed.spark.archive.io.ArchiveRecord import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} -import io.archivesunleashed.spark.matchbox.RecordLoader -import io.archivesunleashed.spark.rdd.RecordRDD._ @RunWith(classOf[JUnitRunner]) class WarcTest extends FunSuite with BeforeAndAfter { diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/ComputeImageSizeTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ComputeImageSizeTest.scala similarity index 96% rename from src/test/scala/io/archivesunleashed/spark/matchbox/ComputeImageSizeTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/ComputeImageSizeTest.scala index 7e35e7a5..503fda31 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/ComputeImageSizeTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ComputeImageSizeTest.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractAtMentionsTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractAtMentionsTest.scala similarity index 96% rename from src/test/scala/io/archivesunleashed/spark/matchbox/ExtractAtMentionsTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/ExtractAtMentionsTest.scala index 593c4111..6b665831 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractAtMentionsTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractAtMentionsTest.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractBoilerPipeTextTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala similarity index 96% rename from src/test/scala/io/archivesunleashed/spark/matchbox/ExtractBoilerPipeTextTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala index 036e40f3..2a495c1c 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractBoilerPipeTextTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala @@ -15,7 +15,7 @@ * limitations under the License. */ - package io.archivesunleashed.spark.matchbox + package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractDateTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala similarity index 93% rename from src/test/scala/io/archivesunleashed/spark/matchbox/ExtractDateTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala index f0fd23e2..8bf2325e 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractDateTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala @@ -14,12 +14,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite import org.scalatest.junit.JUnitRunner -import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent._ +import io.archivesunleashed.matchbox.ExtractDate.DateComponent._ @RunWith(classOf[JUnitRunner]) class ExtractDateTest extends FunSuite { diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractDomainTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala similarity index 97% rename from src/test/scala/io/archivesunleashed/spark/matchbox/ExtractDomainTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala index 622b572a..b1f80eff 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractDomainTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractEntitiesTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractEntitiesTest.scala similarity index 96% rename from src/test/scala/io/archivesunleashed/spark/matchbox/ExtractEntitiesTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/ExtractEntitiesTest.scala index 264a41bd..d1752078 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractEntitiesTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractEntitiesTest.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import java.io.File @@ -27,7 +27,7 @@ import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfter, FunSuite} import org.scalatest.junit.JUnitRunner import org.junit.runner.RunWith -import io.archivesunleashed.spark.matchbox.NER3Classifier.NERClassType +import io.archivesunleashed.matchbox.NER3Classifier.NERClassType import scala.collection.mutable diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractGraphTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala similarity index 97% rename from src/test/scala/io/archivesunleashed/spark/matchbox/ExtractGraphTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala index f85d9cdd..51e34521 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractGraphTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala @@ -15,12 +15,12 @@ * limitations under the License. */ - package io.archivesunleashed.spark.matchbox + package io.archivesunleashed.matchbox import com.google.common.io.Resources import org.apache.spark.{ SparkConf, SparkContext } import org.apache.commons.io.FileUtils - import io.archivesunleashed.spark.rdd.RecordRDD._ + import io.archivesunleashed.rdd.RecordRDD._ import org.apache.spark.graphx._ import org.junit.runner.RunWith import org.scalatest.{ BeforeAndAfter, FunSuite } diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractHashtagsTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractHashtagsTest.scala similarity index 96% rename from src/test/scala/io/archivesunleashed/spark/matchbox/ExtractHashtagsTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/ExtractHashtagsTest.scala index eff2f28e..03736738 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractHashtagsTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractHashtagsTest.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractImageLinksTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractImageLinksTest.scala similarity index 97% rename from src/test/scala/io/archivesunleashed/spark/matchbox/ExtractImageLinksTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/ExtractImageLinksTest.scala index 28272f3b..5d395d79 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractImageLinksTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractImageLinksTest.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractLinksTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala similarity index 98% rename from src/test/scala/io/archivesunleashed/spark/matchbox/ExtractLinksTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala index d3bc5da5..79bcb26f 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractLinksTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractPopularImagesTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractPopularImagesTest.scala similarity index 93% rename from src/test/scala/io/archivesunleashed/spark/matchbox/ExtractPopularImagesTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/ExtractPopularImagesTest.scala index 1e23255d..bcd84e6c 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractPopularImagesTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractPopularImagesTest.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import com.google.common.io.Resources import org.apache.spark.{ SparkConf, SparkContext } @@ -23,8 +23,8 @@ import org.scalatest.junit.JUnitRunner import org.scalatest.{ BeforeAndAfter, FunSuite } import java.io.File import java.nio.file.{Paths, Files} -import io.archivesunleashed.spark.matchbox._ -import io.archivesunleashed.spark.rdd.RecordRDD._ +import io.archivesunleashed.matchbox._ +import io.archivesunleashed.rdd.RecordRDD._ import scala.io.Source @RunWith(classOf[JUnitRunner]) diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractTextFromPDFsTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFsTest.scala similarity index 91% rename from src/test/scala/io/archivesunleashed/spark/matchbox/ExtractTextFromPDFsTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFsTest.scala index 8804e9af..1777ccc5 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractTextFromPDFsTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFsTest.scala @@ -15,12 +15,12 @@ * limitations under the License. */ - package io.archivesunleashed.spark.matchbox + package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite import org.scalatest.junit.JUnitRunner - import io.archivesunleashed.spark.matchbox._ + import io.archivesunleashed.matchbox._ import org.scalatest.Matchers._ import org.apache.tika.parser.pdf.PDFParser; diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractUrlsTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractUrlsTest.scala similarity index 96% rename from src/test/scala/io/archivesunleashed/spark/matchbox/ExtractUrlsTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/ExtractUrlsTest.scala index a3b7a249..77cc37fc 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/ExtractUrlsTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractUrlsTest.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/RecordLoaderTest.scala b/src/test/scala/io/archivesunleashed/matchbox/RecordLoaderTest.scala similarity index 88% rename from src/test/scala/io/archivesunleashed/spark/matchbox/RecordLoaderTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/RecordLoaderTest.scala index c0f60be2..3c63ac96 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/RecordLoaderTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/RecordLoaderTest.scala @@ -15,7 +15,7 @@ * limitations under the License. */ - package io.archivesunleashed.spark.rdd + package io.archivesunleashed.rdd import com.google.common.io.Resources import org.apache.spark.{SparkConf, SparkContext} @@ -23,10 +23,10 @@ import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} import scala.util.matching.Regex - import io.archivesunleashed.spark.matchbox._ - import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent - import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent.DateComponent - import io.archivesunleashed.spark.rdd.RecordRDD._ + import io.archivesunleashed.matchbox._ + import io.archivesunleashed.matchbox.ExtractDate.DateComponent + import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent + import io.archivesunleashed.rdd.RecordRDD._ import java.io._ import java.nio.file.{Paths, Files} import org.json4s._ diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/RemoveHTMLTest.scala b/src/test/scala/io/archivesunleashed/matchbox/RemoveHTMLTest.scala similarity index 96% rename from src/test/scala/io/archivesunleashed/spark/matchbox/RemoveHTMLTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/RemoveHTMLTest.scala index 09499a3b..ad712653 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/RemoveHTMLTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/RemoveHTMLTest.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/RemoveHttpHeaderTest.scala b/src/test/scala/io/archivesunleashed/matchbox/RemoveHttpHeaderTest.scala similarity index 96% rename from src/test/scala/io/archivesunleashed/spark/matchbox/RemoveHttpHeaderTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/RemoveHttpHeaderTest.scala index 3bbc7876..a90eeed6 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/RemoveHttpHeaderTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/RemoveHttpHeaderTest.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/StringUtilsTest.scala b/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala similarity index 97% rename from src/test/scala/io/archivesunleashed/spark/matchbox/StringUtilsTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala index 15bd496c..06e20d8a 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/StringUtilsTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala @@ -14,7 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox + import java.io.IOException import org.junit.runner.RunWith import org.scalatest.FunSuite diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/TupleFormatterTest.scala b/src/test/scala/io/archivesunleashed/matchbox/TupleFormatterTest.scala similarity index 97% rename from src/test/scala/io/archivesunleashed/spark/matchbox/TupleFormatterTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/TupleFormatterTest.scala index ac74adfa..6b7271c3 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/TupleFormatterTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/TupleFormatterTest.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/TweetUtilsTest.scala b/src/test/scala/io/archivesunleashed/matchbox/TweetUtilsTest.scala similarity index 97% rename from src/test/scala/io/archivesunleashed/spark/matchbox/TweetUtilsTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/TweetUtilsTest.scala index 3d0a8668..ccd5b378 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/TweetUtilsTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/TweetUtilsTest.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/WriteGEXFTest.scala b/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala similarity index 94% rename from src/test/scala/io/archivesunleashed/spark/matchbox/WriteGEXFTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala index 63721547..00ecac78 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/WriteGEXFTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.apache.spark.{ SparkConf, SparkContext } @@ -24,8 +24,8 @@ import org.scalatest.junit.JUnitRunner import org.scalatest.{ BeforeAndAfter, FunSuite } import java.io.File import java.nio.file.{Paths, Files} -import io.archivesunleashed.spark.matchbox._ -import io.archivesunleashed.spark.rdd.RecordRDD._ +import io.archivesunleashed.matchbox._ +import io.archivesunleashed.rdd.RecordRDD._ import scala.io.Source @RunWith(classOf[JUnitRunner]) diff --git a/src/test/scala/io/archivesunleashed/spark/matchbox/WriteGraphMLTest.scala b/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala similarity index 94% rename from src/test/scala/io/archivesunleashed/spark/matchbox/WriteGraphMLTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala index 59f1d846..bbd8c21c 100644 --- a/src/test/scala/io/archivesunleashed/spark/matchbox/WriteGraphMLTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.matchbox +package io.archivesunleashed.matchbox import org.apache.spark.{ SparkConf, SparkContext } @@ -24,8 +24,8 @@ import org.scalatest.junit.JUnitRunner import org.scalatest.{ BeforeAndAfter, FunSuite } import java.io.File import java.nio.file.{Paths, Files} -import io.archivesunleashed.spark.matchbox._ -import io.archivesunleashed.spark.rdd.RecordRDD._ +import io.archivesunleashed.matchbox._ +import io.archivesunleashed.rdd.RecordRDD._ import scala.io.Source @RunWith(classOf[JUnitRunner]) diff --git a/src/test/scala/io/archivesunleashed/spark/rdd/CountableRDDTest.scala b/src/test/scala/io/archivesunleashed/rdd/CountableRDDTest.scala similarity index 92% rename from src/test/scala/io/archivesunleashed/spark/rdd/CountableRDDTest.scala rename to src/test/scala/io/archivesunleashed/rdd/CountableRDDTest.scala index 9ba38a1a..60d234bc 100644 --- a/src/test/scala/io/archivesunleashed/spark/rdd/CountableRDDTest.scala +++ b/src/test/scala/io/archivesunleashed/rdd/CountableRDDTest.scala @@ -14,15 +14,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.rdd +package io.archivesunleashed.rdd import com.google.common.io.Resources import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} -import io.archivesunleashed.spark.matchbox._ -import io.archivesunleashed.spark.rdd.RecordRDD._ +import io.archivesunleashed.matchbox._ +import io.archivesunleashed.rdd.RecordRDD._ @RunWith(classOf[JUnitRunner]) class CountableRDDTest extends FunSuite with BeforeAndAfter { diff --git a/src/test/scala/io/archivesunleashed/spark/rdd/RecordRDDTest.scala b/src/test/scala/io/archivesunleashed/rdd/RecordRDDTest.scala similarity index 95% rename from src/test/scala/io/archivesunleashed/spark/rdd/RecordRDDTest.scala rename to src/test/scala/io/archivesunleashed/rdd/RecordRDDTest.scala index 462e5bcd..7471e7f9 100644 --- a/src/test/scala/io/archivesunleashed/spark/rdd/RecordRDDTest.scala +++ b/src/test/scala/io/archivesunleashed/rdd/RecordRDDTest.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark.rdd +package io.archivesunleashed.rdd import com.google.common.io.Resources import org.apache.spark.{SparkConf, SparkContext} @@ -22,10 +22,10 @@ import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} import scala.util.matching.Regex -import io.archivesunleashed.spark.matchbox._ -import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent -import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent.DateComponent -import io.archivesunleashed.spark.rdd.RecordRDD._ +import io.archivesunleashed.matchbox._ +import io.archivesunleashed.matchbox.ExtractDate.DateComponent +import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent +import io.archivesunleashed.rdd.RecordRDD._ @RunWith(classOf[JUnitRunner]) diff --git a/src/test/scala/io/archivesunleashed/spark/utils/JsonUtilTest.scala b/src/test/scala/io/archivesunleashed/utils/JsonUtilTest.scala similarity index 93% rename from src/test/scala/io/archivesunleashed/spark/utils/JsonUtilTest.scala rename to src/test/scala/io/archivesunleashed/utils/JsonUtilTest.scala index 22801c1a..0b2e9409 100644 --- a/src/test/scala/io/archivesunleashed/spark/utils/JsonUtilTest.scala +++ b/src/test/scala/io/archivesunleashed/utils/JsonUtilTest.scala @@ -15,12 +15,12 @@ * limitations under the License. */ - package io.archivesunleashed.spark.matchbox + package io.archivesunleashed.matchbox import org.junit.runner.RunWith import org.scalatest.FunSuite import org.scalatest.junit.JUnitRunner - import io.archivesunleashed.spark.utils._ + import io.archivesunleashed.utils._ @RunWith(classOf[JUnitRunner]) class JsonUtilTest extends FunSuite { From 5c3ae8873fb5b0494ff9e99717c881ad9fbc0ca0 Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 4 Apr 2018 12:07:19 -0400 Subject: [PATCH 2/8] More package refactoring. --- .../{archive/io => }/ArchiveRecord.scala | 14 ++++--- .../{matchbox => }/RecordLoader.scala | 26 +++++------- .../RecordRDD.scala => Transformations.scala} | 12 +++--- .../matchbox/ExtractEntities.scala | 2 + .../matchbox/ExtractGraph.scala | 9 ++-- .../matchbox/ExtractPopularImages.scala | 6 +-- .../scala/io/archivesunleashed/ArcTest.scala | 19 +++++---- .../archivesunleashed/ArchiveRecordTest.scala | 6 +-- .../{rdd => }/CountableRDDTest.scala | 9 ++-- .../{matchbox => }/RecordLoaderTest.scala | 13 +++--- .../{rdd => }/RecordRDDTest.scala | 41 ++++++++++--------- .../scala/io/archivesunleashed/WarcTest.scala | 8 ++-- .../matchbox/ExtractGraphTest.scala | 8 ++-- .../matchbox/ExtractPopularImagesTest.scala | 5 +-- .../matchbox/WriteGEXFTest.scala | 4 +- .../matchbox/WriteGraphMLTest.scala | 3 +- 16 files changed, 89 insertions(+), 96 deletions(-) rename src/main/scala/io/archivesunleashed/{archive/io => }/ArchiveRecord.scala (88%) rename src/main/scala/io/archivesunleashed/{matchbox => }/RecordLoader.scala (56%) rename src/main/scala/io/archivesunleashed/{rdd/RecordRDD.scala => Transformations.scala} (91%) rename src/test/scala/io/archivesunleashed/{rdd => }/CountableRDDTest.scala (89%) rename src/test/scala/io/archivesunleashed/{matchbox => }/RecordLoaderTest.scala (86%) rename src/test/scala/io/archivesunleashed/{rdd => }/RecordRDDTest.scala (77%) diff --git a/src/main/scala/io/archivesunleashed/archive/io/ArchiveRecord.scala b/src/main/scala/io/archivesunleashed/ArchiveRecord.scala similarity index 88% rename from src/main/scala/io/archivesunleashed/archive/io/ArchiveRecord.scala rename to src/main/scala/io/archivesunleashed/ArchiveRecord.scala index 04db0b07..e6f4e81e 100644 --- a/src/main/scala/io/archivesunleashed/archive/io/ArchiveRecord.scala +++ b/src/main/scala/io/archivesunleashed/ArchiveRecord.scala @@ -14,7 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.archive.io + +package io.archivesunleashed + +import io.ArchiveRecordWritable +import io.ArchiveRecordWritable._ +import data.{ArcRecordUtils, WarcRecordUtils} +import matchbox.{RemoveHttpHeader, ExtractDate, ExtractDomain} +import ExtractDate.DateComponent import java.text.SimpleDateFormat @@ -23,11 +30,6 @@ import org.archive.io.arc.ARCRecord import org.archive.io.warc.WARCRecord import org.archive.util.ArchiveUtils -import io.archivesunleashed.data.{ArcRecordUtils, WarcRecordUtils} -import io.archivesunleashed.io.ArchiveRecordWritable -import io.archivesunleashed.io.ArchiveRecordWritable.ArchiveFormat -import io.archivesunleashed.matchbox.ExtractDate.DateComponent -import io.archivesunleashed.matchbox.{RemoveHttpHeader, ExtractDate, ExtractDomain} class ArchiveRecord(r: SerializableWritable[ArchiveRecordWritable]) extends Serializable { var arcRecord: ARCRecord = null diff --git a/src/main/scala/io/archivesunleashed/matchbox/RecordLoader.scala b/src/main/scala/io/archivesunleashed/RecordLoader.scala similarity index 56% rename from src/main/scala/io/archivesunleashed/matchbox/RecordLoader.scala rename to src/main/scala/io/archivesunleashed/RecordLoader.scala index 9c35a5a5..8c2053b3 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/RecordLoader.scala +++ b/src/main/scala/io/archivesunleashed/RecordLoader.scala @@ -14,30 +14,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed + +import io.ArchiveRecordWritable +import io.ArchiveRecordWritable._ +import mapreduce.WacInputFormat import org.apache.hadoop.io.LongWritable import org.apache.spark.{SerializableWritable, SparkContext} import org.apache.spark.rdd.RDD import org.json4s._ import org.json4s.jackson.JsonMethods._ -import io.archivesunleashed.io.ArchiveRecordWritable.ArchiveFormat -import io.archivesunleashed.io.ArchiveRecordWritable -import io.archivesunleashed.mapreduce.WacInputFormat -import io.archivesunleashed.archive.io.ArchiveRecord -import io.archivesunleashed.rdd.RecordRDD._ object RecordLoader { - - def loadArchives(path: String, sc: SparkContext, keepValidPages: Boolean = true): RDD[ArchiveRecord] = { - val rdd: RDD[ArchiveRecord] = - sc.newAPIHadoopFile(path, classOf[WacInputFormat], classOf[LongWritable], classOf[ArchiveRecordWritable]) - .filter(r => (r._2.getFormat == ArchiveFormat.ARC) || - ((r._2.getFormat == ArchiveFormat.WARC) && r._2.getRecord.getHeader.getHeaderValue("WARC-Type").equals("response"))) - .map(r => new ArchiveRecord(new SerializableWritable(r._2))) - - if (keepValidPages) rdd.keepValidPages() else rdd - } + def loadArchives(path: String, sc: SparkContext): RDD[ArchiveRecord] = + sc.newAPIHadoopFile(path, classOf[WacInputFormat], classOf[LongWritable], classOf[ArchiveRecordWritable]) + .filter(r => (r._2.getFormat == ArchiveFormat.ARC) || + ((r._2.getFormat == ArchiveFormat.WARC) && r._2.getRecord.getHeader.getHeaderValue("WARC-Type").equals("response"))) + .map(r => new ArchiveRecord(new SerializableWritable(r._2))) def loadTweets(path: String, sc: SparkContext): RDD[JValue] = sc.textFile(path).filter(line => !line.startsWith("{\"delete\":")) diff --git a/src/main/scala/io/archivesunleashed/rdd/RecordRDD.scala b/src/main/scala/io/archivesunleashed/Transformations.scala similarity index 91% rename from src/main/scala/io/archivesunleashed/rdd/RecordRDD.scala rename to src/main/scala/io/archivesunleashed/Transformations.scala index e4f6ab3b..2cfc558f 100644 --- a/src/main/scala/io/archivesunleashed/rdd/RecordRDD.scala +++ b/src/main/scala/io/archivesunleashed/Transformations.scala @@ -14,13 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.rdd +package io.archivesunleashed + +import matchbox.{DetectLanguage, ExtractDate, ExtractDomain, RemoveHTML} +import matchbox.ExtractDate.DateComponent +import matchbox.ExtractDate.DateComponent.DateComponent -import io.archivesunleashed.archive.io.ArchiveRecord import org.apache.spark.rdd.RDD -import io.archivesunleashed.matchbox.{DetectLanguage, ExtractDate, ExtractDomain, RemoveHTML} -import io.archivesunleashed.matchbox.ExtractDate.DateComponent -import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent import scala.reflect.ClassTag import scala.util.matching.Regex @@ -28,7 +28,7 @@ import scala.util.matching.Regex /** * RDD wrappers for working with Records */ -object RecordRDD extends java.io.Serializable { +object Transformations extends java.io.Serializable { /** * A Wrapper class around RDD to simplify counting diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractEntities.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractEntities.scala index 40a06fd8..f15529e2 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractEntities.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractEntities.scala @@ -16,6 +16,8 @@ */ package io.archivesunleashed.matchbox +import io.archivesunleashed._ + import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractGraph.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractGraph.scala index 2d70c91b..23c1cb2b 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractGraph.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractGraph.scala @@ -16,13 +16,14 @@ */ package io.archivesunleashed.matchbox -import io.archivesunleashed.archive.io.ArchiveRecord -import org.apache.spark.graphx._ -import org.apache.spark.rdd.RDD +import io.archivesunleashed.ArchiveRecord +import io.archivesunleashed.Transformations._ import io.archivesunleashed.matchbox.StringUtils._ -import io.archivesunleashed.rdd.RecordRDD._ import io.archivesunleashed.utils.JsonUtil +import org.apache.spark.graphx._ +import org.apache.spark.rdd.RDD + /** * * e.g. when done: diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractPopularImages.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractPopularImages.scala index e9a1995b..a882880d 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractPopularImages.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractPopularImages.scala @@ -16,11 +16,11 @@ */ package io.archivesunleashed.matchbox +import io.archivesunleashed.ArchiveRecord +import io.archivesunleashed.Transformations._ + import org.apache.spark.{RangePartitioner, SparkContext} -import io.archivesunleashed.rdd.RecordRDD._ import org.apache.spark.rdd.RDD -import io.archivesunleashed.archive.io.ArchiveRecord - /** * Extract most popular images diff --git a/src/test/scala/io/archivesunleashed/ArcTest.scala b/src/test/scala/io/archivesunleashed/ArcTest.scala index 3032354b..0e1286de 100644 --- a/src/test/scala/io/archivesunleashed/ArcTest.scala +++ b/src/test/scala/io/archivesunleashed/ArcTest.scala @@ -23,7 +23,8 @@ import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} import io.archivesunleashed.matchbox.ExtractDate.DateComponent import io.archivesunleashed.matchbox._ -import io.archivesunleashed.rdd.RecordRDD._ +import io.archivesunleashed.Transformations._ +import io.archivesunleashed.RecordLoader @RunWith(classOf[JUnitRunner]) class ArcTest extends FunSuite with BeforeAndAfter { @@ -40,16 +41,16 @@ class ArcTest extends FunSuite with BeforeAndAfter { } test("count records") { - assert(RecordLoader.loadArchives(arcPath, sc, keepValidPages = false).count == 300L) + assert(RecordLoader.loadArchives(arcPath, sc).count == 300L) } test("filter date") { - val four = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val four = RecordLoader.loadArchives(arcPath, sc) .keepDate(List("200804","200805"), DateComponent.YYYYMM) .map(r => r.getCrawlDate) .collect() - val five = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val five = RecordLoader.loadArchives(arcPath, sc) .keepDate(List("200805","200807"), DateComponent.YYYYMM) .map(r => r.getCrawlDate) .collect() @@ -59,23 +60,23 @@ class ArcTest extends FunSuite with BeforeAndAfter { } test("filter url pattern") { - val keepMatches = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val keepMatches = RecordLoader.loadArchives(arcPath, sc) .keepUrlPatterns(Set("http://www.archive.org/about/.*".r)) - val discardMatches = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val discardMatches = RecordLoader.loadArchives(arcPath, sc) .discardUrlPatterns(Set("http://www.archive.org/about/.*".r)) assert(keepMatches.count == 16L) assert(discardMatches.count == 284L) } test("count links") { - val links = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val links = RecordLoader.loadArchives(arcPath, sc) .map(r => ExtractLinks(r.getUrl, r.getContentString)) .reduce((a, b) => a ++ b) assert(links.size == 664) } test("detect language") { - val languageCounts = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val languageCounts = RecordLoader.loadArchives(arcPath, sc) .keepMimeTypes(Set("text/html")) .map(r => RemoveHTML(r.getContentString)) .groupBy(content => DetectLanguage(content)) @@ -96,7 +97,7 @@ class ArcTest extends FunSuite with BeforeAndAfter { } test("detect mime type tika") { - val mimeTypeCounts = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val mimeTypeCounts = RecordLoader.loadArchives(arcPath, sc) .map(r => RemoveHTML(r.getContentString)) .groupBy(content => DetectMimeTypeTika(content)) .map(f => { diff --git a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala index e4514fc5..1d86a562 100644 --- a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala +++ b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala @@ -16,8 +16,6 @@ */ package io.archivesunleashed -import matchbox._ - import com.google.common.io.Resources import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith @@ -41,8 +39,8 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { } test("count records") { - assert(RecordLoader.loadArchives(arcPath, sc, keepValidPages = false).count == 300L) - assert(RecordLoader.loadArchives(warcPath, sc, keepValidPages = false).count == 299L) + assert(RecordLoader.loadArchives(arcPath, sc).count == 300L) + assert(RecordLoader.loadArchives(warcPath, sc).count == 299L) } after { diff --git a/src/test/scala/io/archivesunleashed/rdd/CountableRDDTest.scala b/src/test/scala/io/archivesunleashed/CountableRDDTest.scala similarity index 89% rename from src/test/scala/io/archivesunleashed/rdd/CountableRDDTest.scala rename to src/test/scala/io/archivesunleashed/CountableRDDTest.scala index 60d234bc..d53bf50f 100644 --- a/src/test/scala/io/archivesunleashed/rdd/CountableRDDTest.scala +++ b/src/test/scala/io/archivesunleashed/CountableRDDTest.scala @@ -14,15 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.rdd +package io.archivesunleashed + +import matchbox._ +import _root_.io.archivesunleashed.Transformations._ import com.google.common.io.Resources import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} -import io.archivesunleashed.matchbox._ -import io.archivesunleashed.rdd.RecordRDD._ @RunWith(classOf[JUnitRunner]) class CountableRDDTest extends FunSuite with BeforeAndAfter { @@ -39,7 +40,7 @@ class CountableRDDTest extends FunSuite with BeforeAndAfter { } test("count records") { - val base = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val base = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() .map(r => ExtractDomain(r.getUrl)) val r = base diff --git a/src/test/scala/io/archivesunleashed/matchbox/RecordLoaderTest.scala b/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala similarity index 86% rename from src/test/scala/io/archivesunleashed/matchbox/RecordLoaderTest.scala rename to src/test/scala/io/archivesunleashed/RecordLoaderTest.scala index 3c63ac96..ca7b9c58 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/RecordLoaderTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala @@ -15,23 +15,19 @@ * limitations under the License. */ - package io.archivesunleashed.rdd + package io.archivesunleashed + + import Transformations._ import com.google.common.io.Resources import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} - import scala.util.matching.Regex - import io.archivesunleashed.matchbox._ - import io.archivesunleashed.matchbox.ExtractDate.DateComponent - import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent - import io.archivesunleashed.rdd.RecordRDD._ - import java.io._ import java.nio.file.{Paths, Files} import org.json4s._ import org.json4s.jackson.JsonMethods._ - import TweetUtils._ + import matchbox.TweetUtils._ @RunWith(classOf[JUnitRunner]) @@ -52,6 +48,7 @@ test("loads Warc") { val base = RecordLoader.loadArchives(warcPath, sc) + .keepValidPages() .map(x => x.getUrl) .take(1) assert (base(0) == "http://www.archive.org/") diff --git a/src/test/scala/io/archivesunleashed/rdd/RecordRDDTest.scala b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala similarity index 77% rename from src/test/scala/io/archivesunleashed/rdd/RecordRDDTest.scala rename to src/test/scala/io/archivesunleashed/RecordRDDTest.scala index 7471e7f9..a700b15e 100644 --- a/src/test/scala/io/archivesunleashed/rdd/RecordRDDTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala @@ -14,7 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.rdd +package io.archivesunleashed + +import matchbox._ +import matchbox.ExtractDate.DateComponent +import matchbox.ExtractDate.DateComponent.DateComponent + +import _root_.io.archivesunleashed.Transformations._ import com.google.common.io.Resources import org.apache.spark.{SparkConf, SparkContext} @@ -22,11 +28,6 @@ import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} import scala.util.matching.Regex -import io.archivesunleashed.matchbox._ -import io.archivesunleashed.matchbox.ExtractDate.DateComponent -import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent -import io.archivesunleashed.rdd.RecordRDD._ - @RunWith(classOf[JUnitRunner]) class RecordRDDTest extends FunSuite with BeforeAndAfter { @@ -45,19 +46,19 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test("no valid pages") { - val base = RecordLoader.loadArchives(badPath, sc, keepValidPages = false) + val base = RecordLoader.loadArchives(badPath, sc) .keepValidPages().take(2) assert (base.length == 0) } test ("no images") { - val base = RecordLoader.loadArchives(badPath, sc, keepValidPages = false) + val base = RecordLoader.loadArchives(badPath, sc) .keepValidPages().take(2) assert (base.length == 0) } test("keep date") { - val base = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val base = RecordLoader.loadArchives(arcPath, sc) val component = DateComponent.YYYY val r = base .filter (x => ExtractDate(x.getCrawlDate, component) == "2008") @@ -67,7 +68,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { assert (r2.sameElements(r)) } test ("keepUrls") { - val base = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val base = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val urls: Set[String] = Set ("http://www.archive.org/", "http://www.sloan.org") val r2 = base.keepUrls(urls).count @@ -75,7 +76,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("keepUrlPatterns") { - val base = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val base = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val urls = Set ("http://www.archive.org/".r, "http://www.sloan.org".r, "".r) val r2 = base.keepUrlPatterns(urls).count @@ -83,7 +84,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("check for domains") { - val base2 = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val base2 = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val urls: Set[String] = Set("www.archive.org", "www.sloan.org") val x2 = base2.keepDomains(urls).count() @@ -91,7 +92,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("keep languages") { - val base2 = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val base2 = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val langs: Set[String] = Set("en", "fr") val r = Array("http://www.archive.org/index.php", @@ -102,7 +103,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("check for keep content"){ - val base = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val base = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val regex = Set(raw"Please visit our website at".r) val regno = Set(raw"Please visit our website at".r, raw"UNINTELLIBLEDFSJKLS".r) @@ -113,7 +114,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("discard mime") { - val base = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val base = RecordLoader.loadArchives(arcPath, sc) val mime = Set ("text/plain", "image/jpeg") val r2 = base.discardMimeTypes(mime) .map (mp => mp.getUrl).take(3) @@ -121,7 +122,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("discard date") { - val base = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val base = RecordLoader.loadArchives(arcPath, sc) val date = "20080430" val r = base.filter( x=> x.getCrawlDate != date).collect() val r2 = base.discardDate(date).take(3) @@ -129,7 +130,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("discard urls") { - val base = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val base = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val urls: Set[String] = Set ("http://www.sloan.org") val r2 = base.discardUrls(urls).count() @@ -137,7 +138,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("discard UrlPatterns") { - val base = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val base = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val urls = Set ("http://www.archive.org/".r, "http://www.sloan.org".r, "".r) val r2 = base.discardUrlPatterns(urls).count @@ -145,7 +146,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("discard domains") { - val base = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val base = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val urls: Set[String] = Set ("www.sloan.org") val r2 = base.discardDomains(urls).count() @@ -153,7 +154,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test ("discard content") { - val base = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val base = RecordLoader.loadArchives(arcPath, sc) .keepValidPages() val regex = Set(raw"Please visit our website at".r) val regno = Set(raw"Please visit our website at".r, raw"UNINTELLIBLEDFSJKLS".r) diff --git a/src/test/scala/io/archivesunleashed/WarcTest.scala b/src/test/scala/io/archivesunleashed/WarcTest.scala index e5902dda..e5b3241a 100644 --- a/src/test/scala/io/archivesunleashed/WarcTest.scala +++ b/src/test/scala/io/archivesunleashed/WarcTest.scala @@ -16,9 +16,7 @@ */ package io.archivesunleashed -import archive.io.ArchiveRecord -import matchbox.RecordLoader -import rdd.RecordRDD._ +import Transformations._ import com.google.common.io.Resources import org.apache.spark.rdd.RDD @@ -41,7 +39,7 @@ class WarcTest extends FunSuite with BeforeAndAfter { .setMaster(master) .setAppName(appName) sc = new SparkContext(conf) - records = RecordLoader.loadArchives(warcPath, sc, keepValidPages = false) + records = RecordLoader.loadArchives(warcPath, sc) } test("count records") { @@ -59,7 +57,7 @@ class WarcTest extends FunSuite with BeforeAndAfter { } test("warc get content") { - val a = RecordLoader.loadArchives(warcPath, sc, keepValidPages = false) + val a = RecordLoader.loadArchives(warcPath, sc) .map(r => r.getContentString) .take(1) assert(a.head.nonEmpty) diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala index 51e34521..0b32ceec 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala @@ -15,12 +15,14 @@ * limitations under the License. */ - package io.archivesunleashed.matchbox +package io.archivesunleashed.matchbox - import com.google.common.io.Resources +import io.archivesunleashed.Transformations._ +import io.archivesunleashed.RecordLoader + +import com.google.common.io.Resources import org.apache.spark.{ SparkConf, SparkContext } import org.apache.commons.io.FileUtils - import io.archivesunleashed.rdd.RecordRDD._ import org.apache.spark.graphx._ import org.junit.runner.RunWith import org.scalatest.{ BeforeAndAfter, FunSuite } diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractPopularImagesTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractPopularImagesTest.scala index bcd84e6c..240bfd40 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractPopularImagesTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractPopularImagesTest.scala @@ -23,8 +23,7 @@ import org.scalatest.junit.JUnitRunner import org.scalatest.{ BeforeAndAfter, FunSuite } import java.io.File import java.nio.file.{Paths, Files} -import io.archivesunleashed.matchbox._ -import io.archivesunleashed.rdd.RecordRDD._ +import io.archivesunleashed.RecordLoader import scala.io.Source @RunWith(classOf[JUnitRunner]) @@ -45,7 +44,7 @@ class ExtractPopularImagesTest extends FunSuite with BeforeAndAfter { test("extracts popular images") { - val examplerdd = RecordLoader.loadArchives(arcPath, sc, keepValidPages = false) + val examplerdd = RecordLoader.loadArchives(arcPath, sc) val imagesLowLimit = ExtractPopularImages(examplerdd, 3, sc) val imagesHighLimit = ExtractPopularImages(examplerdd, 507, sc) val response = Array("1 http://creativecommons.org/images/public/somerights20.gif", diff --git a/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala b/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala index 00ecac78..fcd9320b 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala @@ -19,13 +19,11 @@ package io.archivesunleashed.matchbox import org.apache.spark.{ SparkConf, SparkContext } import org.junit.runner.RunWith -import org.scalatest.FunSuite import org.scalatest.junit.JUnitRunner import org.scalatest.{ BeforeAndAfter, FunSuite } import java.io.File import java.nio.file.{Paths, Files} -import io.archivesunleashed.matchbox._ -import io.archivesunleashed.rdd.RecordRDD._ +import io.archivesunleashed.Transformations._ import scala.io.Source @RunWith(classOf[JUnitRunner]) diff --git a/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala b/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala index bbd8c21c..5446de39 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala @@ -24,8 +24,7 @@ import org.scalatest.junit.JUnitRunner import org.scalatest.{ BeforeAndAfter, FunSuite } import java.io.File import java.nio.file.{Paths, Files} -import io.archivesunleashed.matchbox._ -import io.archivesunleashed.rdd.RecordRDD._ +import io.archivesunleashed.Transformations._ import scala.io.Source @RunWith(classOf[JUnitRunner]) From 70d394030c7257a41de152f6cb6d01546e46d3a1 Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 4 Apr 2018 13:06:38 -0400 Subject: [PATCH 3/8] More moving classes around. --- .../io/archivesunleashed/RecordLoader.scala | 39 ------------- .../matchbox/ExtractGraph.scala | 7 +-- .../matchbox/ExtractPopularImages.scala | 3 +- .../{Transformations.scala => package.scala} | 57 +++++++++---------- .../scala/io/archivesunleashed/ArcTest.scala | 8 +-- .../archivesunleashed/CountableRDDTest.scala | 1 - .../archivesunleashed/RecordLoaderTest.scala | 2 - .../io/archivesunleashed/RecordRDDTest.scala | 2 - .../scala/io/archivesunleashed/WarcTest.scala | 2 - .../matchbox/ExtractGraphTest.scala | 3 +- .../matchbox/WriteGEXFTest.scala | 1 - .../matchbox/WriteGraphMLTest.scala | 1 - 12 files changed, 35 insertions(+), 91 deletions(-) delete mode 100644 src/main/scala/io/archivesunleashed/RecordLoader.scala rename src/main/scala/io/archivesunleashed/{Transformations.scala => package.scala} (69%) diff --git a/src/main/scala/io/archivesunleashed/RecordLoader.scala b/src/main/scala/io/archivesunleashed/RecordLoader.scala deleted file mode 100644 index 8c2053b3..00000000 --- a/src/main/scala/io/archivesunleashed/RecordLoader.scala +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Archives Unleashed Toolkit (AUT): - * An open-source platform for analyzing web archives. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.archivesunleashed - -import io.ArchiveRecordWritable -import io.ArchiveRecordWritable._ -import mapreduce.WacInputFormat - -import org.apache.hadoop.io.LongWritable -import org.apache.spark.{SerializableWritable, SparkContext} -import org.apache.spark.rdd.RDD -import org.json4s._ -import org.json4s.jackson.JsonMethods._ - -object RecordLoader { - def loadArchives(path: String, sc: SparkContext): RDD[ArchiveRecord] = - sc.newAPIHadoopFile(path, classOf[WacInputFormat], classOf[LongWritable], classOf[ArchiveRecordWritable]) - .filter(r => (r._2.getFormat == ArchiveFormat.ARC) || - ((r._2.getFormat == ArchiveFormat.WARC) && r._2.getRecord.getHeader.getHeaderValue("WARC-Type").equals("response"))) - .map(r => new ArchiveRecord(new SerializableWritable(r._2))) - - def loadTweets(path: String, sc: SparkContext): RDD[JValue] = - sc.textFile(path).filter(line => !line.startsWith("{\"delete\":")) - .map(line => try { parse(line) } catch { case e: Exception => null }).filter(x => x != null) -} diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractGraph.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractGraph.scala index 23c1cb2b..6b5d7a4c 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractGraph.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractGraph.scala @@ -16,10 +16,9 @@ */ package io.archivesunleashed.matchbox -import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.Transformations._ -import io.archivesunleashed.matchbox.StringUtils._ -import io.archivesunleashed.utils.JsonUtil +import io.archivesunleashed._ +import matchbox.StringUtils._ +import utils.JsonUtil import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractPopularImages.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractPopularImages.scala index a882880d..f642d4cd 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractPopularImages.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractPopularImages.scala @@ -16,8 +16,7 @@ */ package io.archivesunleashed.matchbox -import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.Transformations._ +import io.archivesunleashed._ import org.apache.spark.{RangePartitioner, SparkContext} import org.apache.spark.rdd.RDD diff --git a/src/main/scala/io/archivesunleashed/Transformations.scala b/src/main/scala/io/archivesunleashed/package.scala similarity index 69% rename from src/main/scala/io/archivesunleashed/Transformations.scala rename to src/main/scala/io/archivesunleashed/package.scala index 2cfc558f..d61913b1 100644 --- a/src/main/scala/io/archivesunleashed/Transformations.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -1,34 +1,32 @@ -/* - * Archives Unleashed Toolkit (AUT): - * An open-source platform for analyzing web archives. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.archivesunleashed - -import matchbox.{DetectLanguage, ExtractDate, ExtractDomain, RemoveHTML} -import matchbox.ExtractDate.DateComponent -import matchbox.ExtractDate.DateComponent.DateComponent - +package io + +import io.archivesunleashed.io.ArchiveRecordWritable +import io.archivesunleashed.io.ArchiveRecordWritable.ArchiveFormat +import io.archivesunleashed.mapreduce.WacInputFormat +import io.archivesunleashed.matchbox.{DetectLanguage, ExtractDate, ExtractDomain, RemoveHTML} +import io.archivesunleashed.matchbox.ExtractDate.DateComponent +import io.archivesunleashed.matchbox.ExtractDate.DateComponent._ +import org.apache.hadoop.io.LongWritable +import org.apache.spark.{SerializableWritable, SparkContext} import org.apache.spark.rdd.RDD +import org.json4s._ +import org.json4s.jackson.JsonMethods._ import scala.reflect.ClassTag import scala.util.matching.Regex -/** - * RDD wrappers for working with Records - */ -object Transformations extends java.io.Serializable { +package object archivesunleashed { + object RecordLoader { + def loadArchives(path: String, sc: SparkContext): RDD[ArchiveRecord] = + sc.newAPIHadoopFile(path, classOf[WacInputFormat], classOf[LongWritable], classOf[ArchiveRecordWritable]) + .filter(r => (r._2.getFormat == ArchiveFormat.ARC) || + ((r._2.getFormat == ArchiveFormat.WARC) && r._2.getRecord.getHeader.getHeaderValue("WARC-Type").equals("response"))) + .map(r => new ArchiveRecord(new SerializableWritable(r._2))) + + def loadTweets(path: String, sc: SparkContext): RDD[JValue] = + sc.textFile(path).filter(line => !line.startsWith("{\"delete\":")) + .map(line => try { parse(line) } catch { case e: Exception => null }).filter(x => x != null) + } /** * A Wrapper class around RDD to simplify counting @@ -63,9 +61,9 @@ object Transformations extends java.io.Serializable { r.getCrawlDate != null && ( (r.getMimeType != null && r.getMimeType.contains("image/")) - || r.getUrl.endsWith("jpg") - || r.getUrl.endsWith("jpeg") - || r.getUrl.endsWith("png")) + || r.getUrl.endsWith("jpg") + || r.getUrl.endsWith("jpeg") + || r.getUrl.endsWith("png")) && !r.getUrl.endsWith("robots.txt")) } @@ -141,5 +139,4 @@ object Transformations extends java.io.Serializable { }).exists(identity)) } } - } diff --git a/src/test/scala/io/archivesunleashed/ArcTest.scala b/src/test/scala/io/archivesunleashed/ArcTest.scala index 0e1286de..b36a4cee 100644 --- a/src/test/scala/io/archivesunleashed/ArcTest.scala +++ b/src/test/scala/io/archivesunleashed/ArcTest.scala @@ -14,17 +14,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.spark +package io.archivesunleashed import com.google.common.io.Resources import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} -import io.archivesunleashed.matchbox.ExtractDate.DateComponent -import io.archivesunleashed.matchbox._ -import io.archivesunleashed.Transformations._ -import io.archivesunleashed.RecordLoader +import matchbox.ExtractDate.DateComponent +import matchbox._ @RunWith(classOf[JUnitRunner]) class ArcTest extends FunSuite with BeforeAndAfter { diff --git a/src/test/scala/io/archivesunleashed/CountableRDDTest.scala b/src/test/scala/io/archivesunleashed/CountableRDDTest.scala index d53bf50f..d39f76cb 100644 --- a/src/test/scala/io/archivesunleashed/CountableRDDTest.scala +++ b/src/test/scala/io/archivesunleashed/CountableRDDTest.scala @@ -17,7 +17,6 @@ package io.archivesunleashed import matchbox._ -import _root_.io.archivesunleashed.Transformations._ import com.google.common.io.Resources import org.apache.spark.{SparkConf, SparkContext} diff --git a/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala b/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala index ca7b9c58..fba36e12 100644 --- a/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala @@ -17,8 +17,6 @@ package io.archivesunleashed - import Transformations._ - import com.google.common.io.Resources import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith diff --git a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala index a700b15e..c6e03092 100644 --- a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala @@ -20,8 +20,6 @@ import matchbox._ import matchbox.ExtractDate.DateComponent import matchbox.ExtractDate.DateComponent.DateComponent -import _root_.io.archivesunleashed.Transformations._ - import com.google.common.io.Resources import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith diff --git a/src/test/scala/io/archivesunleashed/WarcTest.scala b/src/test/scala/io/archivesunleashed/WarcTest.scala index e5b3241a..330f4ef3 100644 --- a/src/test/scala/io/archivesunleashed/WarcTest.scala +++ b/src/test/scala/io/archivesunleashed/WarcTest.scala @@ -16,8 +16,6 @@ */ package io.archivesunleashed -import Transformations._ - import com.google.common.io.Resources import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala index 0b32ceec..540f5c3e 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala @@ -17,8 +17,7 @@ package io.archivesunleashed.matchbox -import io.archivesunleashed.Transformations._ -import io.archivesunleashed.RecordLoader +import io.archivesunleashed._ import com.google.common.io.Resources import org.apache.spark.{ SparkConf, SparkContext } diff --git a/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala b/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala index fcd9320b..6b487c68 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala @@ -23,7 +23,6 @@ import org.scalatest.junit.JUnitRunner import org.scalatest.{ BeforeAndAfter, FunSuite } import java.io.File import java.nio.file.{Paths, Files} -import io.archivesunleashed.Transformations._ import scala.io.Source @RunWith(classOf[JUnitRunner]) diff --git a/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala b/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala index 5446de39..ee4cc2ac 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala @@ -24,7 +24,6 @@ import org.scalatest.junit.JUnitRunner import org.scalatest.{ BeforeAndAfter, FunSuite } import java.io.File import java.nio.file.{Paths, Files} -import io.archivesunleashed.Transformations._ import scala.io.Source @RunWith(classOf[JUnitRunner]) From 66bd5cdcc3b5d65278036cec94e45c55c5c431cd Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 4 Apr 2018 13:29:47 -0400 Subject: [PATCH 4/8] Cleaned up package on Java end. --- .../ArchiveRecordInputFormat.java} | 14 ++++++------- .../{io => data}/ArchiveRecordWritable.java | 11 +++++----- .../io/archivesunleashed/io/package-info.java | 20 ------------------ .../mapreduce/package-info.java | 20 ------------------ .../io/archivesunleashed/ArchiveRecord.scala | 8 +++---- .../scala/io/archivesunleashed/package.scala | 7 +++---- .../ArcLoaderTest.java} | 21 ++++++++++--------- .../ArchiveRecordInputFormatTest.java} | 13 ++++++------ .../ArchiveRecordWritableTest.java | 21 ++++++++++--------- .../WarcLoaderTest.java} | 8 +++---- .../ingest/package-info.java | 20 ------------------ .../io/archivesunleashed/io/package-info.java | 20 ------------------ .../mapreduce/package-info.java | 20 ------------------ 13 files changed, 51 insertions(+), 152 deletions(-) rename src/main/java/io/archivesunleashed/{mapreduce/WacInputFormat.java => data/ArchiveRecordInputFormat.java} (96%) rename src/main/java/io/archivesunleashed/{io => data}/ArchiveRecordWritable.java (96%) delete mode 100644 src/main/java/io/archivesunleashed/io/package-info.java delete mode 100644 src/main/java/io/archivesunleashed/mapreduce/package-info.java rename src/test/java/io/archivesunleashed/{ingest/WacArcLoaderTest.java => data/ArcLoaderTest.java} (93%) rename src/test/java/io/archivesunleashed/{mapreduce/WacInputFormatTest.java => data/ArchiveRecordInputFormatTest.java} (95%) rename src/test/java/io/archivesunleashed/{io => data}/ArchiveRecordWritableTest.java (95%) rename src/test/java/io/archivesunleashed/{ingest/WacWarcLoaderTest.java => data/WarcLoaderTest.java} (96%) delete mode 100644 src/test/java/io/archivesunleashed/ingest/package-info.java delete mode 100644 src/test/java/io/archivesunleashed/io/package-info.java delete mode 100644 src/test/java/io/archivesunleashed/mapreduce/package-info.java diff --git a/src/main/java/io/archivesunleashed/mapreduce/WacInputFormat.java b/src/main/java/io/archivesunleashed/data/ArchiveRecordInputFormat.java similarity index 96% rename from src/main/java/io/archivesunleashed/mapreduce/WacInputFormat.java rename to src/main/java/io/archivesunleashed/data/ArchiveRecordInputFormat.java index a9d8f8f8..63bb6c81 100644 --- a/src/main/java/io/archivesunleashed/mapreduce/WacInputFormat.java +++ b/src/main/java/io/archivesunleashed/data/ArchiveRecordInputFormat.java @@ -15,13 +15,9 @@ * limitations under the License. */ -package io.archivesunleashed.mapreduce; +package io.archivesunleashed.data; -import io.archivesunleashed.io.ArchiveRecordWritable.ArchiveFormat; -import io.archivesunleashed.io.ArchiveRecordWritable; -import java.io.BufferedInputStream; -import java.io.IOException; -import java.util.Iterator; +import io.archivesunleashed.data.ArchiveRecordWritable.ArchiveFormat; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; @@ -42,10 +38,14 @@ import org.archive.io.warc.WARCReader; import org.archive.io.warc.WARCReaderFactory.CompressedWARCReader; +import java.io.BufferedInputStream; +import java.io.IOException; +import java.util.Iterator; + /** * Extends FileInputFormat for Web Archive Commons InputFormat. */ -public class WacInputFormat extends FileInputFormat { @Override public final RecordReader (r._2.getFormat == ArchiveFormat.ARC) || ((r._2.getFormat == ArchiveFormat.WARC) && r._2.getRecord.getHeader.getHeaderValue("WARC-Type").equals("response"))) .map(r => new ArchiveRecord(new SerializableWritable(r._2))) diff --git a/src/test/java/io/archivesunleashed/ingest/WacArcLoaderTest.java b/src/test/java/io/archivesunleashed/data/ArcLoaderTest.java similarity index 93% rename from src/test/java/io/archivesunleashed/ingest/WacArcLoaderTest.java rename to src/test/java/io/archivesunleashed/data/ArcLoaderTest.java index 7a5dc982..39a1009b 100644 --- a/src/test/java/io/archivesunleashed/ingest/WacArcLoaderTest.java +++ b/src/test/java/io/archivesunleashed/data/ArcLoaderTest.java @@ -14,16 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.ingest; +package io.archivesunleashed.data; import com.google.common.io.Resources; -import io.archivesunleashed.data.ArcRecordUtils; -import java.io.BufferedInputStream; -import java.io.ByteArrayInputStream; -import java.io.DataInputStream; -import java.io.File; -import java.io.InputStream; -import java.util.Iterator; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.archive.io.ArchiveRecord; @@ -32,10 +25,18 @@ import org.archive.io.arc.ARCRecord; import org.archive.io.arc.ARCRecordMetaData; import org.junit.Test; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.DataInputStream; +import java.io.File; +import java.io.InputStream; +import java.util.Iterator; + import static org.junit.Assert.assertEquals; -public class WacArcLoaderTest { - private static final Log LOG = LogFactory.getLog(WacArcLoaderTest.class); +public class ArcLoaderTest { + private static final Log LOG = LogFactory.getLog(ArcLoaderTest.class); @Test public final void testReader() throws Exception { diff --git a/src/test/java/io/archivesunleashed/mapreduce/WacInputFormatTest.java b/src/test/java/io/archivesunleashed/data/ArchiveRecordInputFormatTest.java similarity index 95% rename from src/test/java/io/archivesunleashed/mapreduce/WacInputFormatTest.java rename to src/test/java/io/archivesunleashed/data/ArchiveRecordInputFormatTest.java index 81afd897..792ef13f 100644 --- a/src/test/java/io/archivesunleashed/mapreduce/WacInputFormatTest.java +++ b/src/test/java/io/archivesunleashed/data/ArchiveRecordInputFormatTest.java @@ -14,11 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.mapreduce; +package io.archivesunleashed.data; import com.google.common.io.Resources; -import io.archivesunleashed.io.ArchiveRecordWritable; -import java.io.File; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; @@ -34,10 +32,13 @@ import org.archive.io.arc.ARCRecordMetaData; import org.archive.io.warc.WARCRecord; import org.junit.Test; + +import java.io.File; + import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; -public class WacInputFormatTest { +public class ArchiveRecordInputFormatTest { @Test public final void testArcInputFormat() throws Exception { String[] urls = new String[]{ @@ -57,7 +58,7 @@ public final void testArcInputFormat() throws Exception { FileSplit split = new FileSplit(path, 0, testFile.length(), null); InputFormat inputFormat = - ReflectionUtils.newInstance(WacInputFormat.class, conf); + ReflectionUtils.newInstance(ArchiveRecordInputFormat.class, conf); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); RecordReader reader = @@ -123,7 +124,7 @@ public final void testWarcInputFormat() throws Exception { FileSplit split = new FileSplit(path, 0, testFile.length(), null); InputFormat inputFormat = - ReflectionUtils.newInstance(WacInputFormat.class, conf); + ReflectionUtils.newInstance(ArchiveRecordInputFormat.class, conf); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); RecordReader reader = diff --git a/src/test/java/io/archivesunleashed/io/ArchiveRecordWritableTest.java b/src/test/java/io/archivesunleashed/data/ArchiveRecordWritableTest.java similarity index 95% rename from src/test/java/io/archivesunleashed/io/ArchiveRecordWritableTest.java rename to src/test/java/io/archivesunleashed/data/ArchiveRecordWritableTest.java index 7bc78262..b62b39af 100644 --- a/src/test/java/io/archivesunleashed/io/ArchiveRecordWritableTest.java +++ b/src/test/java/io/archivesunleashed/data/ArchiveRecordWritableTest.java @@ -14,16 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.io; +package io.archivesunleashed.data; import com.google.common.io.Resources; -import io.archivesunleashed.io.ArchiveRecordWritable.ArchiveFormat; -import io.archivesunleashed.mapreduce.WacInputFormat; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.File; +import io.archivesunleashed.data.ArchiveRecordWritable.ArchiveFormat; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; @@ -36,6 +30,13 @@ import org.apache.hadoop.util.ReflectionUtils; import org.archive.io.arc.ARCRecord; import org.junit.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.File; + import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -53,7 +54,7 @@ public final void testArcInputFormat() throws Exception { InputFormat inputFormat = ReflectionUtils.newInstance( - WacInputFormat.class, conf); + ArchiveRecordInputFormat.class, conf); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); RecordReader reader = @@ -107,7 +108,7 @@ public final void testWarcInputFormat() throws Exception { InputFormat inputFormat = ReflectionUtils.newInstance( - WacInputFormat.class, conf); + ArchiveRecordInputFormat.class, conf); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); RecordReader reader = diff --git a/src/test/java/io/archivesunleashed/ingest/WacWarcLoaderTest.java b/src/test/java/io/archivesunleashed/data/WarcLoaderTest.java similarity index 96% rename from src/test/java/io/archivesunleashed/ingest/WacWarcLoaderTest.java rename to src/test/java/io/archivesunleashed/data/WarcLoaderTest.java index f622bc72..3ce01b26 100644 --- a/src/test/java/io/archivesunleashed/ingest/WacWarcLoaderTest.java +++ b/src/test/java/io/archivesunleashed/data/WarcLoaderTest.java @@ -14,10 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.ingest; +package io.archivesunleashed.data; import com.google.common.io.Resources; -import io.archivesunleashed.data.WarcRecordUtils; + import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.DataInputStream; @@ -38,8 +38,8 @@ import tl.lin.data.fd.Object2IntFrequencyDistribution; import tl.lin.data.fd.Object2IntFrequencyDistributionEntry; -public class WacWarcLoaderTest { - private static final Log LOG = LogFactory.getLog(WacWarcLoaderTest.class); +public class WarcLoaderTest { + private static final Log LOG = LogFactory.getLog(WarcLoaderTest.class); private static final SimpleDateFormat DATE_WARC = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); diff --git a/src/test/java/io/archivesunleashed/ingest/package-info.java b/src/test/java/io/archivesunleashed/ingest/package-info.java deleted file mode 100644 index a4ba8ac5..00000000 --- a/src/test/java/io/archivesunleashed/ingest/package-info.java +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Archives Unleashed Toolkit (AUT): - * An open-source platform for analyzing web archives. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** -* This package provides various ingest tests for aut. -*/ -package io.archivesunleashed.ingest; diff --git a/src/test/java/io/archivesunleashed/io/package-info.java b/src/test/java/io/archivesunleashed/io/package-info.java deleted file mode 100644 index b4d5e63b..00000000 --- a/src/test/java/io/archivesunleashed/io/package-info.java +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Archives Unleashed Toolkit (AUT): - * An open-source platform for analyzing web archives. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** -* This package provides various IO tests for aut. -*/ -package io.archivesunleashed.io; diff --git a/src/test/java/io/archivesunleashed/mapreduce/package-info.java b/src/test/java/io/archivesunleashed/mapreduce/package-info.java deleted file mode 100644 index 1b0b63e0..00000000 --- a/src/test/java/io/archivesunleashed/mapreduce/package-info.java +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Archives Unleashed Toolkit (AUT): - * An open-source platform for analyzing web archives. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** -* This package provides various mapreduce tests for aut. -*/ -package io.archivesunleashed.mapreduce; From 5de20efa833a3d047228348fbffd87572d161ce6 Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 4 Apr 2018 14:34:37 -0400 Subject: [PATCH 5/8] More package renaming. --- .../{matchbox => app}/ExtractEntities.scala | 4 ++-- .../{matchbox => app}/ExtractGraph.scala | 12 +++++----- .../ExtractPopularImages.scala | 6 ++--- .../{matchbox => app}/WriteGEXF.scala | 13 +++++----- .../{matchbox => app}/WriteGraphML.scala | 13 +++++----- .../matchbox/NERCombinedJson.scala | 6 ++--- .../JsonUtil.scala => util/JsonUtils.scala} | 4 ++-- .../{matchbox => util}/StringUtils.scala | 6 +++-- .../{matchbox => util}/TweetUtils.scala | 2 +- .../archivesunleashed/RecordLoaderTest.scala | 6 ++--- .../matchbox/ExtractEntitiesTest.scala | 1 + .../matchbox/ExtractGraphTest.scala | 24 ++++++++++--------- .../matchbox/ExtractPopularImagesTest.scala | 9 ++++--- .../matchbox/WriteGEXFTest.scala | 9 ++++--- .../matchbox/WriteGraphMLTest.scala | 9 ++++--- .../JsonUtilsTest.scala} | 10 ++++---- .../{matchbox => util}/StringUtilsTest.scala | 5 ++-- .../{matchbox => util}/TweetUtilsTest.scala | 9 ++++--- 18 files changed, 81 insertions(+), 67 deletions(-) rename src/main/scala/io/archivesunleashed/{matchbox => app}/ExtractEntities.scala (96%) rename src/main/scala/io/archivesunleashed/{matchbox => app}/ExtractGraph.scala (90%) rename src/main/scala/io/archivesunleashed/{matchbox => app}/ExtractPopularImages.scala (94%) rename src/main/scala/io/archivesunleashed/{matchbox => app}/WriteGEXF.scala (93%) rename src/main/scala/io/archivesunleashed/{matchbox => app}/WriteGraphML.scala (94%) rename src/main/scala/io/archivesunleashed/{utils/JsonUtil.scala => util/JsonUtils.scala} (94%) rename src/main/scala/io/archivesunleashed/{matchbox => util}/StringUtils.scala (97%) rename src/main/scala/io/archivesunleashed/{matchbox => util}/TweetUtils.scala (97%) rename src/test/scala/io/archivesunleashed/{utils/JsonUtilTest.scala => util/JsonUtilsTest.scala} (79%) rename src/test/scala/io/archivesunleashed/{matchbox => util}/StringUtilsTest.scala (94%) rename src/test/scala/io/archivesunleashed/{matchbox => util}/TweetUtilsTest.scala (96%) diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractEntities.scala b/src/main/scala/io/archivesunleashed/app/ExtractEntities.scala similarity index 96% rename from src/main/scala/io/archivesunleashed/matchbox/ExtractEntities.scala rename to src/main/scala/io/archivesunleashed/app/ExtractEntities.scala index f15529e2..2e980e82 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractEntities.scala +++ b/src/main/scala/io/archivesunleashed/app/ExtractEntities.scala @@ -14,10 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed.app import io.archivesunleashed._ - +import io.archivesunleashed.matchbox.{NER3Classifier, RemoveHTML} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractGraph.scala b/src/main/scala/io/archivesunleashed/app/ExtractGraph.scala similarity index 90% rename from src/main/scala/io/archivesunleashed/matchbox/ExtractGraph.scala rename to src/main/scala/io/archivesunleashed/app/ExtractGraph.scala index 6b5d7a4c..2d355f09 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractGraph.scala +++ b/src/main/scala/io/archivesunleashed/app/ExtractGraph.scala @@ -14,12 +14,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed.app import io.archivesunleashed._ -import matchbox.StringUtils._ -import utils.JsonUtil - +import io.archivesunleashed.util.JsonUtils +import io.archivesunleashed.util.StringUtils._ +import io.archivesunleashed.matchbox.{ExtractDomain, ExtractLinks} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD @@ -84,8 +84,8 @@ object ExtractGraph { "count" -> r._2) } - edgesCounted.map(r => JsonUtil.toJson(r)).saveAsTextFile(edgesPath) - graph.vertices.map(r => JsonUtil.toJson(r._2)).saveAsTextFile(verticesPath) + edgesCounted.map(r => JsonUtils.toJson(r)).saveAsTextFile(edgesPath) + graph.vertices.map(r => JsonUtils.toJson(r._2)).saveAsTextFile(verticesPath) } } } diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractPopularImages.scala b/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala similarity index 94% rename from src/main/scala/io/archivesunleashed/matchbox/ExtractPopularImages.scala rename to src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala index f642d4cd..bd54d8be 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractPopularImages.scala +++ b/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala @@ -14,12 +14,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed.app import io.archivesunleashed._ - -import org.apache.spark.{RangePartitioner, SparkContext} +import io.archivesunleashed.matchbox.{ComputeImageSize, ComputeMD5} import org.apache.spark.rdd.RDD +import org.apache.spark.{RangePartitioner, SparkContext} /** * Extract most popular images diff --git a/src/main/scala/io/archivesunleashed/matchbox/WriteGEXF.scala b/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala similarity index 93% rename from src/main/scala/io/archivesunleashed/matchbox/WriteGEXF.scala rename to src/main/scala/io/archivesunleashed/app/WriteGEXF.scala index f716f9f0..9fe97b18 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/WriteGEXF.scala +++ b/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala @@ -14,15 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed.app + +import io.archivesunleashed._ +import io.archivesunleashed.util.JsonUtils +import io.archivesunleashed.util.StringUtils._ -import java.io.BufferedWriter -import java.io.OutputStreamWriter import java.nio.charset.StandardCharsets -import java.nio.file.Files -import java.nio.file.Paths +import java.nio.file.{Files, Paths} + import org.apache.spark.rdd.RDD -import StringUtils._ /** diff --git a/src/main/scala/io/archivesunleashed/matchbox/WriteGraphML.scala b/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala similarity index 94% rename from src/main/scala/io/archivesunleashed/matchbox/WriteGraphML.scala rename to src/main/scala/io/archivesunleashed/app/WriteGraphML.scala index 20748545..704ae829 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/WriteGraphML.scala +++ b/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala @@ -14,15 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed.app + +import io.archivesunleashed._ +import io.archivesunleashed.util.JsonUtils +import io.archivesunleashed.util.StringUtils._ -import java.io.BufferedWriter -import java.io.OutputStreamWriter import java.nio.charset.StandardCharsets -import java.nio.file.Files -import java.nio.file.Paths +import java.nio.file.{Files, Paths} + import org.apache.spark.rdd.RDD -import StringUtils._ diff --git a/src/main/scala/io/archivesunleashed/matchbox/NERCombinedJson.scala b/src/main/scala/io/archivesunleashed/matchbox/NERCombinedJson.scala index c531a8c3..d62d8819 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/NERCombinedJson.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/NERCombinedJson.scala @@ -23,7 +23,7 @@ import java.io.OutputStreamWriter import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.spark.SparkContext -import io.archivesunleashed.utils.JsonUtil +import io.archivesunleashed.util.JsonUtils import scala.collection.mutable.MutableList import scala.util.Random @@ -93,7 +93,7 @@ class NERCombinedJson extends Serializable { }) .map(r => { val classifiedJson = NER3Classifier.classify(r._3) - val classifiedMap = JsonUtil.fromJson(classifiedJson) + val classifiedMap = JsonUtils.fromJson(classifiedJson) val classifiedMapCountTuples: Map[String, List[(String, Int)]] = classifiedMap.map { case (nerType, entities: List[String @unchecked]) => (nerType, entities.groupBy(identity).mapValues(_.size).toList) } @@ -112,7 +112,7 @@ class NERCombinedJson extends Serializable { }) nerRec.ner += ec }) - JsonUtil.toJson(nerRec) + JsonUtils.toJson(nerRec) }) }) .saveAsTextFile(outputFile) diff --git a/src/main/scala/io/archivesunleashed/utils/JsonUtil.scala b/src/main/scala/io/archivesunleashed/util/JsonUtils.scala similarity index 94% rename from src/main/scala/io/archivesunleashed/utils/JsonUtil.scala rename to src/main/scala/io/archivesunleashed/util/JsonUtils.scala index 786eaa59..0cb7533f 100644 --- a/src/main/scala/io/archivesunleashed/utils/JsonUtil.scala +++ b/src/main/scala/io/archivesunleashed/util/JsonUtils.scala @@ -14,12 +14,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.utils +package io.archivesunleashed.util import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper} import com.fasterxml.jackson.module.scala.DefaultScalaModule -object JsonUtil extends Serializable { +object JsonUtils extends Serializable { val mapper = new ObjectMapper() mapper.registerModule(DefaultScalaModule) mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false) diff --git a/src/main/scala/io/archivesunleashed/matchbox/StringUtils.scala b/src/main/scala/io/archivesunleashed/util/StringUtils.scala similarity index 97% rename from src/main/scala/io/archivesunleashed/matchbox/StringUtils.scala rename to src/main/scala/io/archivesunleashed/util/StringUtils.scala index 4b9e04cc..6d529101 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/StringUtils.scala +++ b/src/main/scala/io/archivesunleashed/util/StringUtils.scala @@ -14,11 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox -import scala.xml.Utility.escape +package io.archivesunleashed.util + import java.io.IOException import java.security.MessageDigest +import scala.xml.Utility.escape + object StringUtils { implicit class WWWLink(s: String) { diff --git a/src/main/scala/io/archivesunleashed/matchbox/TweetUtils.scala b/src/main/scala/io/archivesunleashed/util/TweetUtils.scala similarity index 97% rename from src/main/scala/io/archivesunleashed/matchbox/TweetUtils.scala rename to src/main/scala/io/archivesunleashed/util/TweetUtils.scala index 78cbd15a..38dfc992 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/TweetUtils.scala +++ b/src/main/scala/io/archivesunleashed/util/TweetUtils.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed.util import org.json4s.JsonAST._ diff --git a/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala b/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala index fba36e12..503fb365 100644 --- a/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala @@ -22,11 +22,9 @@ import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} - import java.nio.file.{Paths, Files} - import org.json4s._ - import org.json4s.jackson.JsonMethods._ - import matchbox.TweetUtils._ + import java.nio.file.{Files, Paths} + import io.archivesunleashed.util.TweetUtils._ @RunWith(classOf[JUnitRunner]) class RecordLoaderTest extends FunSuite with BeforeAndAfter { diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractEntitiesTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractEntitiesTest.scala index d1752078..f9815f1e 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractEntitiesTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractEntitiesTest.scala @@ -21,6 +21,7 @@ import java.io.File import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.scala.DefaultScalaModule import com.google.common.io.{Files, Resources} +import io.archivesunleashed.app.ExtractEntities import org.apache.commons.io.FileUtils import org.apache.commons.logging.LogFactory import org.apache.spark.{SparkConf, SparkContext} diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala index 540f5c3e..00b7b13f 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala @@ -18,18 +18,20 @@ package io.archivesunleashed.matchbox import io.archivesunleashed._ - import com.google.common.io.Resources - import org.apache.spark.{ SparkConf, SparkContext } - import org.apache.commons.io.FileUtils - import org.apache.spark.graphx._ - import org.junit.runner.RunWith - import org.scalatest.{ BeforeAndAfter, FunSuite } - import org.scalatest.junit.JUnitRunner - import java.io.File - import java.nio.file.{Paths, Files} - import scala.io.Source - import scala.util.{ Try, Success, Failure } +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.commons.io.FileUtils +import org.apache.spark.graphx._ +import org.junit.runner.RunWith +import org.scalatest.{BeforeAndAfter, FunSuite} +import org.scalatest.junit.JUnitRunner +import java.io.File +import java.nio.file.{Files, Paths} + +import io.archivesunleashed.app.ExtractGraph + +import scala.io.Source +import scala.util.{Failure, Success, Try} @RunWith(classOf[JUnitRunner]) class ExtractGraphTest extends FunSuite with BeforeAndAfter { diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractPopularImagesTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractPopularImagesTest.scala index 240bfd40..0b12ef6d 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractPopularImagesTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractPopularImagesTest.scala @@ -17,13 +17,16 @@ package io.archivesunleashed.matchbox import com.google.common.io.Resources -import org.apache.spark.{ SparkConf, SparkContext } +import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{ BeforeAndAfter, FunSuite } +import org.scalatest.{BeforeAndAfter, FunSuite} import java.io.File -import java.nio.file.{Paths, Files} +import java.nio.file.{Files, Paths} + import io.archivesunleashed.RecordLoader +import io.archivesunleashed.app.ExtractPopularImages + import scala.io.Source @RunWith(classOf[JUnitRunner]) diff --git a/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala b/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala index 6b487c68..ce748577 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala @@ -17,12 +17,15 @@ package io.archivesunleashed.matchbox -import org.apache.spark.{ SparkConf, SparkContext } +import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner -import org.scalatest.{ BeforeAndAfter, FunSuite } +import org.scalatest.{BeforeAndAfter, FunSuite} import java.io.File -import java.nio.file.{Paths, Files} +import java.nio.file.{Files, Paths} + +import io.archivesunleashed.app.WriteGEXF + import scala.io.Source @RunWith(classOf[JUnitRunner]) diff --git a/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala b/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala index ee4cc2ac..5a951726 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala @@ -17,13 +17,16 @@ package io.archivesunleashed.matchbox -import org.apache.spark.{ SparkConf, SparkContext } +import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.FunSuite import org.scalatest.junit.JUnitRunner -import org.scalatest.{ BeforeAndAfter, FunSuite } +import org.scalatest.{BeforeAndAfter, FunSuite} import java.io.File -import java.nio.file.{Paths, Files} +import java.nio.file.{Files, Paths} + +import io.archivesunleashed.app.WriteGraphML + import scala.io.Source @RunWith(classOf[JUnitRunner]) diff --git a/src/test/scala/io/archivesunleashed/utils/JsonUtilTest.scala b/src/test/scala/io/archivesunleashed/util/JsonUtilsTest.scala similarity index 79% rename from src/test/scala/io/archivesunleashed/utils/JsonUtilTest.scala rename to src/test/scala/io/archivesunleashed/util/JsonUtilsTest.scala index 0b2e9409..3d054c6c 100644 --- a/src/test/scala/io/archivesunleashed/utils/JsonUtilTest.scala +++ b/src/test/scala/io/archivesunleashed/util/JsonUtilsTest.scala @@ -20,23 +20,23 @@ import org.junit.runner.RunWith import org.scalatest.FunSuite import org.scalatest.junit.JUnitRunner - import io.archivesunleashed.utils._ + import io.archivesunleashed.util.JsonUtils @RunWith(classOf[JUnitRunner]) - class JsonUtilTest extends FunSuite { + class JsonUtilsTest extends FunSuite { test("proper Map") { val map: Map[Symbol, Any] = Map('a -> 1, 'b -> 2, 'c -> 3) - assert(JsonUtil.toJson(map) == """{"a":1,"b":2,"c":3}""") + assert(JsonUtils.toJson(map) == """{"a":1,"b":2,"c":3}""") } test("any value") { val value = 12345 - assert(JsonUtil.toJson(12345) == "12345") + assert(JsonUtils.toJson(12345) == "12345") } test("json string") { val jsonString = """{"a":1,"b":2,"c":3}""" - assert(JsonUtil.fromJson(jsonString) == Map("a" -> 1, "b" -> 2, "c" -> 3 )) + assert(JsonUtils.fromJson(jsonString) == Map("a" -> 1, "b" -> 2, "c" -> 3 )) } } diff --git a/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala b/src/test/scala/io/archivesunleashed/util/StringUtilsTest.scala similarity index 94% rename from src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala rename to src/test/scala/io/archivesunleashed/util/StringUtilsTest.scala index 06e20d8a..467de71b 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala +++ b/src/test/scala/io/archivesunleashed/util/StringUtilsTest.scala @@ -14,13 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed.util import java.io.IOException + +import io.archivesunleashed.util.StringUtils._ import org.junit.runner.RunWith import org.scalatest.FunSuite import org.scalatest.junit.JUnitRunner -import StringUtils._ @RunWith(classOf[JUnitRunner]) class StringUtilsTest extends FunSuite { diff --git a/src/test/scala/io/archivesunleashed/matchbox/TweetUtilsTest.scala b/src/test/scala/io/archivesunleashed/util/TweetUtilsTest.scala similarity index 96% rename from src/test/scala/io/archivesunleashed/matchbox/TweetUtilsTest.scala rename to src/test/scala/io/archivesunleashed/util/TweetUtilsTest.scala index ccd5b378..3883e1af 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/TweetUtilsTest.scala +++ b/src/test/scala/io/archivesunleashed/util/TweetUtilsTest.scala @@ -14,15 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed.util +import io.archivesunleashed.util.TweetUtils._ +import org.json4s._ +import org.json4s.jackson.JsonMethods._ import org.junit.runner.RunWith import org.scalatest.FunSuite import org.scalatest.junit.JUnitRunner -import org.json4s._ -import org.json4s.jackson.JsonMethods._ -import org.json4s.JsonAST._ -import TweetUtils._ @RunWith(classOf[JUnitRunner]) class TweetUtilsTest extends FunSuite { From 4a8fe4914cd54f169888a2fbd2d669979d8c56fe Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 4 Apr 2018 14:41:40 -0400 Subject: [PATCH 6/8] Moved test case. --- .../{matchbox => app}/NERCombinedJson.scala | 12 ++++++------ .../ExtractEntitiesTest.scala | 7 ++----- .../{matchbox => app}/ExtractGraphTest.scala | 19 ++++++++----------- .../ExtractPopularImagesTest.scala | 10 ++-------- .../{matchbox => app}/WriteGEXFTest.scala | 8 +++----- .../{matchbox => app}/WriteGraphMLTest.scala | 9 +++------ 6 files changed, 24 insertions(+), 41 deletions(-) rename src/main/scala/io/archivesunleashed/{matchbox => app}/NERCombinedJson.scala (96%) rename src/test/scala/io/archivesunleashed/{matchbox => app}/ExtractEntitiesTest.scala (95%) rename src/test/scala/io/archivesunleashed/{matchbox => app}/ExtractGraphTest.scala (95%) rename src/test/scala/io/archivesunleashed/{matchbox => app}/ExtractPopularImagesTest.scala (92%) rename src/test/scala/io/archivesunleashed/{matchbox => app}/WriteGEXFTest.scala (96%) rename src/test/scala/io/archivesunleashed/{matchbox => app}/WriteGraphMLTest.scala (95%) diff --git a/src/main/scala/io/archivesunleashed/matchbox/NERCombinedJson.scala b/src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala similarity index 96% rename from src/main/scala/io/archivesunleashed/matchbox/NERCombinedJson.scala rename to src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala index d62d8819..0c350add 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/NERCombinedJson.scala +++ b/src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed.app -import java.io.BufferedReader -import java.io.BufferedWriter -import java.io.InputStreamReader -import java.io.OutputStreamWriter +import java.io.{BufferedReader, BufferedWriter, InputStreamReader, OutputStreamWriter} + +import io.archivesunleashed.matchbox.NER3Classifier +import io.archivesunleashed.util.JsonUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ import org.apache.spark.SparkContext -import io.archivesunleashed.util.JsonUtils + import scala.collection.mutable.MutableList import scala.util.Random diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractEntitiesTest.scala b/src/test/scala/io/archivesunleashed/app/ExtractEntitiesTest.scala similarity index 95% rename from src/test/scala/io/archivesunleashed/matchbox/ExtractEntitiesTest.scala rename to src/test/scala/io/archivesunleashed/app/ExtractEntitiesTest.scala index f9815f1e..78d6bff4 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractEntitiesTest.scala +++ b/src/test/scala/io/archivesunleashed/app/ExtractEntitiesTest.scala @@ -14,21 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed.app import java.io.File import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.scala.DefaultScalaModule import com.google.common.io.{Files, Resources} -import io.archivesunleashed.app.ExtractEntities +import io.archivesunleashed.matchbox.NER3Classifier.NERClassType import org.apache.commons.io.FileUtils import org.apache.commons.logging.LogFactory import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfter, FunSuite} -import org.scalatest.junit.JUnitRunner -import org.junit.runner.RunWith -import io.archivesunleashed.matchbox.NER3Classifier.NERClassType import scala.collection.mutable diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala b/src/test/scala/io/archivesunleashed/app/ExtractGraphTest.scala similarity index 95% rename from src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala rename to src/test/scala/io/archivesunleashed/app/ExtractGraphTest.scala index 00b7b13f..553ae793 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractGraphTest.scala +++ b/src/test/scala/io/archivesunleashed/app/ExtractGraphTest.scala @@ -15,23 +15,20 @@ * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed.app + +import java.io.File +import java.nio.file.{Files, Paths} -import io.archivesunleashed._ import com.google.common.io.Resources -import org.apache.spark.{SparkConf, SparkContext} +import io.archivesunleashed._ import org.apache.commons.io.FileUtils -import org.apache.spark.graphx._ +import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith -import org.scalatest.{BeforeAndAfter, FunSuite} import org.scalatest.junit.JUnitRunner -import java.io.File -import java.nio.file.{Files, Paths} - -import io.archivesunleashed.app.ExtractGraph +import org.scalatest.{BeforeAndAfter, FunSuite} -import scala.io.Source -import scala.util.{Failure, Success, Try} +import scala.util.Try @RunWith(classOf[JUnitRunner]) class ExtractGraphTest extends FunSuite with BeforeAndAfter { diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractPopularImagesTest.scala b/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesTest.scala similarity index 92% rename from src/test/scala/io/archivesunleashed/matchbox/ExtractPopularImagesTest.scala rename to src/test/scala/io/archivesunleashed/app/ExtractPopularImagesTest.scala index 0b12ef6d..3d6a954e 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractPopularImagesTest.scala +++ b/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesTest.scala @@ -14,20 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed.app import com.google.common.io.Resources +import io.archivesunleashed.RecordLoader import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} -import java.io.File -import java.nio.file.{Files, Paths} - -import io.archivesunleashed.RecordLoader -import io.archivesunleashed.app.ExtractPopularImages - -import scala.io.Source @RunWith(classOf[JUnitRunner]) class ExtractPopularImagesTest extends FunSuite with BeforeAndAfter { diff --git a/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala b/src/test/scala/io/archivesunleashed/app/WriteGEXFTest.scala similarity index 96% rename from src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala rename to src/test/scala/io/archivesunleashed/app/WriteGEXFTest.scala index ce748577..b0821867 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/WriteGEXFTest.scala +++ b/src/test/scala/io/archivesunleashed/app/WriteGEXFTest.scala @@ -14,17 +14,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed.app +import java.io.File +import java.nio.file.{Files, Paths} import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} -import java.io.File -import java.nio.file.{Files, Paths} - -import io.archivesunleashed.app.WriteGEXF import scala.io.Source diff --git a/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala b/src/test/scala/io/archivesunleashed/app/WriteGraphMLTest.scala similarity index 95% rename from src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala rename to src/test/scala/io/archivesunleashed/app/WriteGraphMLTest.scala index 5a951726..5066da55 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/WriteGraphMLTest.scala +++ b/src/test/scala/io/archivesunleashed/app/WriteGraphMLTest.scala @@ -14,18 +14,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.matchbox +package io.archivesunleashed.app +import java.io.File +import java.nio.file.{Files, Paths} import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith -import org.scalatest.FunSuite import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} -import java.io.File -import java.nio.file.{Files, Paths} - -import io.archivesunleashed.app.WriteGraphML import scala.io.Source From ea5903fe13d947e7f832f0e2082ae52e932aba5f Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 4 Apr 2018 18:53:35 -0400 Subject: [PATCH 7/8] Moved StringUtils into matchbox package implicits. --- .../archivesunleashed/app/ExtractGraph.scala | 3 +- .../io/archivesunleashed/app/WriteGEXF.scala | 4 +- .../archivesunleashed/app/WriteGraphML.scala | 4 +- .../archivesunleashed/matchbox/package.scala | 32 +++++++++++++ .../archivesunleashed/util/StringUtils.scala | 46 ------------------- .../{util => matchbox}/StringUtilsTest.scala | 3 +- 6 files changed, 36 insertions(+), 56 deletions(-) create mode 100644 src/main/scala/io/archivesunleashed/matchbox/package.scala delete mode 100644 src/main/scala/io/archivesunleashed/util/StringUtils.scala rename src/test/scala/io/archivesunleashed/{util => matchbox}/StringUtilsTest.scala (94%) diff --git a/src/main/scala/io/archivesunleashed/app/ExtractGraph.scala b/src/main/scala/io/archivesunleashed/app/ExtractGraph.scala index 2d355f09..be758aed 100644 --- a/src/main/scala/io/archivesunleashed/app/ExtractGraph.scala +++ b/src/main/scala/io/archivesunleashed/app/ExtractGraph.scala @@ -17,9 +17,8 @@ package io.archivesunleashed.app import io.archivesunleashed._ +import io.archivesunleashed.matchbox._ import io.archivesunleashed.util.JsonUtils -import io.archivesunleashed.util.StringUtils._ -import io.archivesunleashed.matchbox.{ExtractDomain, ExtractLinks} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD diff --git a/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala b/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala index 9fe97b18..6e012231 100644 --- a/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala +++ b/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala @@ -16,9 +16,7 @@ */ package io.archivesunleashed.app -import io.archivesunleashed._ -import io.archivesunleashed.util.JsonUtils -import io.archivesunleashed.util.StringUtils._ +import io.archivesunleashed.matchbox._ import java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} diff --git a/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala b/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala index 704ae829..f40d946e 100644 --- a/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala +++ b/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala @@ -16,9 +16,7 @@ */ package io.archivesunleashed.app -import io.archivesunleashed._ -import io.archivesunleashed.util.JsonUtils -import io.archivesunleashed.util.StringUtils._ +import io.archivesunleashed.matchbox._ import java.nio.charset.StandardCharsets import java.nio.file.{Files, Paths} diff --git a/src/main/scala/io/archivesunleashed/matchbox/package.scala b/src/main/scala/io/archivesunleashed/matchbox/package.scala new file mode 100644 index 00000000..53401dbd --- /dev/null +++ b/src/main/scala/io/archivesunleashed/matchbox/package.scala @@ -0,0 +1,32 @@ +package io.archivesunleashed + +import java.io.IOException +import java.security.MessageDigest + +import scala.xml.Utility._ + +/** + * Created by jimmylin on 4/4/18. + */ +package object matchbox { + implicit class WWWLink(s: String) { + def removePrefixWWW(): String = { + if (s == null) return null + s.replaceAll("^\\s*www\\.", "") + } + + def escapeInvalidXML(): String = { + try { + return escape(s) + } + catch { + case e: Exception => throw new IOException("Caught exception processing input row ", e) + } + } + + def computeHash(): String = { + val md5 = MessageDigest.getInstance("MD5") + return md5.digest(s.getBytes).map("%02x".format(_)).mkString + } + } +} diff --git a/src/main/scala/io/archivesunleashed/util/StringUtils.scala b/src/main/scala/io/archivesunleashed/util/StringUtils.scala deleted file mode 100644 index 6d529101..00000000 --- a/src/main/scala/io/archivesunleashed/util/StringUtils.scala +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Archives Unleashed Toolkit (AUT): - * An open-source platform for analyzing web archives. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.archivesunleashed.util - -import java.io.IOException -import java.security.MessageDigest - -import scala.xml.Utility.escape - -object StringUtils { - - implicit class WWWLink(s: String) { - def removePrefixWWW(): String = { - if (s == null) return null - s.replaceAll("^\\s*www\\.", "") - } - - def escapeInvalidXML(): String = { - try { - return escape(s) - } - catch { - case e: Exception => throw new IOException("Caught exception processing input row ", e) - } - } - - def computeHash(): String = { - val md5 = MessageDigest.getInstance("MD5") - return md5.digest(s.getBytes).map("%02x".format(_)).mkString - } - } -} diff --git a/src/test/scala/io/archivesunleashed/util/StringUtilsTest.scala b/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala similarity index 94% rename from src/test/scala/io/archivesunleashed/util/StringUtilsTest.scala rename to src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala index 467de71b..609ffdcc 100644 --- a/src/test/scala/io/archivesunleashed/util/StringUtilsTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala @@ -14,11 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.archivesunleashed.util +package io.archivesunleashed.matchbox import java.io.IOException -import io.archivesunleashed.util.StringUtils._ import org.junit.runner.RunWith import org.scalatest.FunSuite import org.scalatest.junit.JUnitRunner From a46ce3b76c1c97377ef60e57c379fc6f75ea7814 Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 4 Apr 2018 23:23:56 -0400 Subject: [PATCH 8/8] CR --- .../archivesunleashed/matchbox/package.scala | 19 +++++++++++++++++- .../scala/io/archivesunleashed/package.scala | 20 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/main/scala/io/archivesunleashed/matchbox/package.scala b/src/main/scala/io/archivesunleashed/matchbox/package.scala index 53401dbd..a0cfa6dc 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/package.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/package.scala @@ -1,3 +1,20 @@ +/* + * Archives Unleashed Toolkit (AUT): + * An open-source platform for analyzing web archives. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package io.archivesunleashed import java.io.IOException @@ -6,7 +23,7 @@ import java.security.MessageDigest import scala.xml.Utility._ /** - * Created by jimmylin on 4/4/18. + * Package object which supplies implicits providing common UDF-related functionalities. */ package object matchbox { implicit class WWWLink(s: String) { diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index a0b75414..ed8ee99f 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -1,3 +1,20 @@ +/* + * Archives Unleashed Toolkit (AUT): + * An open-source platform for analyzing web archives. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package io import io.archivesunleashed.data.{ArchiveRecordWritable, ArchiveRecordInputFormat} @@ -14,6 +31,9 @@ import org.json4s.jackson.JsonMethods._ import scala.reflect.ClassTag import scala.util.matching.Regex +/** + * Package object which supplies implicits to augment generic RDDs with AUT-specific transformations. + */ package object archivesunleashed { object RecordLoader { def loadArchives(path: String, sc: SparkContext): RDD[ArchiveRecord] =