Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Major refactoring of package structure #189

Merged
merged 8 commits into from
Apr 6, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,9 @@
* limitations under the License.
*/

package io.archivesunleashed.mapreduce;
package io.archivesunleashed.data;

import io.archivesunleashed.io.ArchiveRecordWritable.ArchiveFormat;
import io.archivesunleashed.io.ArchiveRecordWritable;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.Iterator;
import io.archivesunleashed.data.ArchiveRecordWritable.ArchiveFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
Expand All @@ -42,10 +38,14 @@
import org.archive.io.warc.WARCReader;
import org.archive.io.warc.WARCReaderFactory.CompressedWARCReader;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.Iterator;

/**
* Extends FileInputFormat for Web Archive Commons InputFormat.
*/
public class WacInputFormat extends FileInputFormat<LongWritable,
public class ArchiveRecordInputFormat extends FileInputFormat<LongWritable,
ArchiveRecordWritable> {
@Override
public final RecordReader<LongWritable,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,17 @@
* limitations under the License.
*/

package io.archivesunleashed.io;
package io.archivesunleashed.data;

import io.archivesunleashed.data.ArcRecordUtils;
import io.archivesunleashed.data.WarcRecordUtils;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCRecord;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
* Implements Hadoop Writable for Archive Records.
*/
Expand Down
20 changes: 0 additions & 20 deletions src/main/java/io/archivesunleashed/io/package-info.java

This file was deleted.

20 changes: 0 additions & 20 deletions src/main/java/io/archivesunleashed/mapreduce/package-info.java

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.archive.io

package io.archivesunleashed

import data.{ArcRecordUtils, WarcRecordUtils}
import data.ArchiveRecordWritable.ArchiveFormat
import matchbox.{ExtractDate, ExtractDomain, RemoveHttpHeader}
import ExtractDate.DateComponent
import java.text.SimpleDateFormat

import io.archivesunleashed.data.ArchiveRecordWritable
import org.apache.spark.SerializableWritable
import org.archive.io.arc.ARCRecord
import org.archive.io.warc.WARCRecord

import org.archive.util.ArchiveUtils
import io.archivesunleashed.data.{ArcRecordUtils, WarcRecordUtils}
import io.archivesunleashed.io.ArchiveRecordWritable
import io.archivesunleashed.io.ArchiveRecordWritable.ArchiveFormat
import io.archivesunleashed.spark.matchbox.ExtractDate.DateComponent
import io.archivesunleashed.spark.matchbox.{RemoveHttpHeader, ExtractDate, ExtractDomain}

class ArchiveRecord(r: SerializableWritable[ArchiveRecordWritable]) extends Serializable {
var arcRecord: ARCRecord = null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.app

import io.archivesunleashed._
import io.archivesunleashed.matchbox.{NER3Classifier, RemoveHTML}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.app

import io.archivesunleashed.spark.archive.io.ArchiveRecord
import io.archivesunleashed._
import io.archivesunleashed.matchbox._
import io.archivesunleashed.util.JsonUtils
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import io.archivesunleashed.spark.matchbox.StringUtils._
import io.archivesunleashed.spark.rdd.RecordRDD._
import io.archivesunleashed.spark.utils.JsonUtil

/**
*
Expand Down Expand Up @@ -84,8 +83,8 @@ object ExtractGraph {
"count" -> r._2)
}

edgesCounted.map(r => JsonUtil.toJson(r)).saveAsTextFile(edgesPath)
graph.vertices.map(r => JsonUtil.toJson(r._2)).saveAsTextFile(verticesPath)
edgesCounted.map(r => JsonUtils.toJson(r)).saveAsTextFile(edgesPath)
graph.vertices.map(r => JsonUtils.toJson(r._2)).saveAsTextFile(verticesPath)
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,12 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.app

import org.apache.spark.{RangePartitioner, SparkContext}
import io.archivesunleashed.spark.rdd.RecordRDD._
import io.archivesunleashed._
import io.archivesunleashed.matchbox.{ComputeImageSize, ComputeMD5}
import org.apache.spark.rdd.RDD
import io.archivesunleashed.spark.archive.io.ArchiveRecord

import org.apache.spark.{RangePartitioner, SparkContext}

/**
* Extract most popular images
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.app

import java.io.BufferedReader
import java.io.BufferedWriter
import java.io.InputStreamReader
import java.io.OutputStreamWriter
import java.io.{BufferedReader, BufferedWriter, InputStreamReader, OutputStreamWriter}

import io.archivesunleashed.matchbox.NER3Classifier
import io.archivesunleashed.util.JsonUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._
import org.apache.spark.SparkContext
import io.archivesunleashed.spark.utils.JsonUtil

import scala.collection.mutable.MutableList
import scala.util.Random

Expand Down Expand Up @@ -93,7 +93,7 @@ class NERCombinedJson extends Serializable {
})
.map(r => {
val classifiedJson = NER3Classifier.classify(r._3)
val classifiedMap = JsonUtil.fromJson(classifiedJson)
val classifiedMap = JsonUtils.fromJson(classifiedJson)
val classifiedMapCountTuples: Map[String, List[(String, Int)]] = classifiedMap.map {
case (nerType, entities: List[String @unchecked]) => (nerType, entities.groupBy(identity).mapValues(_.size).toList)
}
Expand All @@ -112,7 +112,7 @@ class NERCombinedJson extends Serializable {
})
nerRec.ner += ec
})
JsonUtil.toJson(nerRec)
JsonUtils.toJson(nerRec)
})
})
.saveAsTextFile(outputFile)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.app

import io.archivesunleashed.matchbox._

import java.io.BufferedWriter
import java.io.OutputStreamWriter
import java.nio.charset.StandardCharsets
import java.nio.file.Files
import java.nio.file.Paths
import java.nio.file.{Files, Paths}

import org.apache.spark.rdd.RDD
import StringUtils._


/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.app

import io.archivesunleashed.matchbox._

import java.io.BufferedWriter
import java.io.OutputStreamWriter
import java.nio.charset.StandardCharsets
import java.nio.file.Files
import java.nio.file.Paths
import java.nio.file.{Files, Paths}

import org.apache.spark.rdd.RDD
import StringUtils._



Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.matchbox

import java.io.ByteArrayInputStream
import javax.imageio.ImageIO
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.matchbox

import java.security.MessageDigest

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.matchbox

import org.apache.tika.language.LanguageIdentifier

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.matchbox

import java.io.ByteArrayInputStream

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.matchbox

object ExtractAtMentions {
val pattern = """@[A-Za-z_0-9]+""".r
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.matchbox

import java.io.IOException
import de.l3s.boilerpipe.extractors.DefaultExtractor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.matchbox

/**
* Simple wrapper for getting different parts of a date
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.matchbox

import java.net.URL

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.matchbox

object ExtractHashtags {
val pattern = """#[^ ]+""".r
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.matchbox

import java.io.IOException

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.matchbox

import java.io.IOException

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.matchbox

import java.io.ByteArrayInputStream

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.matchbox

object ExtractUrls {
val pattern = """https?://[^ ]+""".r
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.spark.matchbox
package io.archivesunleashed.matchbox

import java.util

Expand Down
Loading