
Align NER output to WANE format #361

Merged · 11 commits · Nov 5, 2019
2 changes: 1 addition & 1 deletion pom.xml
@@ -640,7 +640,7 @@
 <dependency>
   <groupId>edu.stanford.nlp</groupId>
   <artifactId>stanford-corenlp</artifactId>
-  <version>3.8.0</version>
+  <version>3.9.2</version>
   <exclusions>
     <exclusion>
       <groupId>org.apache.commons</groupId>
@@ -149,7 +149,7 @@ public static byte[] getBodyContent(final WARCRecord record)
     return null;
   }

-  // Just using parseHeaders to move down input stream to body
+  // Just using parseHeaders to move down input stream to body.
   HttpParser.parseHeaders(record, WARC_HEADER_ENCODING);
   record.dump(baos);
   return baos.toByteArray();
13 changes: 12 additions & 1 deletion src/main/scala/io/archivesunleashed/ArchiveRecord.scala
@@ -18,9 +18,10 @@ package io.archivesunleashed

 import java.text.SimpleDateFormat
 import java.io.ByteArrayInputStream
+import java.security.MessageDigest

 import io.archivesunleashed.data.{ArcRecordUtils, WarcRecordUtils, ArchiveRecordWritable}
-import io.archivesunleashed.matchbox.{ExtractDate, ExtractDomain, RemoveHttpHeader}
+import io.archivesunleashed.matchbox.{ComputeMD5, ExtractDate, ExtractDomain, RemoveHttpHeader}
 import org.apache.spark.SerializableWritable
 import org.archive.io.arc.ARCRecord
 import org.archive.io.warc.WARCRecord

@@ -60,6 +61,8 @@ trait ArchiveRecord extends Serializable {
   /** Returns the http status of the crawl. */
   def getHttpStatus: String

+  /** Returns payload digest (SHA1). */
+  def getPayloadDigest: String
 }

 /** Default implementation of a record in a web archive.

@@ -158,4 +161,12 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends
       getContentBytes
     }
   }
+
+  val getPayloadDigest: String = {
+    if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
+      "sha1:" + MessageDigest.getInstance("SHA1").digest(getContentBytes).map("%02x".format(_)).mkString
+    } else {
+      r.t.getRecord.asInstanceOf[WARCRecord].getHeader.getHeaderValue("WARC-Payload-Digest").asInstanceOf[String]
+    }
+  }
 }
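The ARC branch above amounts to hex-encoding a SHA-1 digest with a `sha1:` prefix, while the WARC branch reuses the record's existing `WARC-Payload-Digest` header. A minimal standalone sketch of the hex path (the `sha1Hex` helper name is ours, not the PR's):

```scala
import java.security.MessageDigest

// Hex-encode a SHA-1 digest with a "sha1:" prefix, mirroring the ARC branch.
def sha1Hex(bytes: Array[Byte]): String =
  "sha1:" + MessageDigest.getInstance("SHA1")
    .digest(bytes)
    .map("%02x".format(_))
    .mkString

// sha1Hex("hello".getBytes("UTF-8"))
// => "sha1:aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d"
```

Note the asymmetry this creates: ARC digests come out as lowercase hex, while Heritrix writes `WARC-Payload-Digest` values Base32-encoded, which is why the test expectations further down mix the two encodings.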
46 changes: 18 additions & 28 deletions src/main/scala/io/archivesunleashed/app/ExtractEntities.scala
@@ -16,7 +16,7 @@
 package io.archivesunleashed.app

 import io.archivesunleashed.RecordLoader
-import io.archivesunleashed.matchbox.{NERClassifier, RemoveHTML}
+import io.archivesunleashed.matchbox.{ComputeMD5, NERClassifier, RemoveHTML}
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
@@ -36,45 +36,35 @@ object ExtractEntities {
    * @param sc the Apache Spark context
    * @return an rdd with classification entities.
    */
-  def extractFromRecords(iNerClassifierFile: String, inputRecordFile: String, outputFile: String, sc: SparkContext): RDD[(String, String, String)] = {
+  def extractFromRecords(iNerClassifierFile: String, inputRecordFile: String,
+    outputFile: String,
+    sc: SparkContext): RDD[(String, String, String, String)] = {
     val rdd = RecordLoader.loadArchives(inputRecordFile, sc)
-      .map(r => (r.getCrawlDate, r.getUrl, RemoveHTML(r.getContentString)))
-    extractAndOutput(iNerClassifierFile, rdd, outputFile)
-  }
-
-  /** Extracts named entities from tuple-formatted derivatives scraped from a website.
-   *
-   * @param iNerClassifierFile path of classifier file
-   * @param inputFile path of file containing tuples (date: String, url: String, content: String)
-   * from which to extract entities
-   * @param outputFile path of output directory
-   * @return an rdd with classification entities.
-   */
-  def extractFromScrapeText(iNerClassifierFile: String, inputFile: String, outputFile: String, sc: SparkContext): RDD[(String, String, String)] = {
-    val rdd = sc.textFile(inputFile)
-      .map(line => {
-        val ind1 = line.indexOf(",")
-        val ind2 = line.indexOf(",", ind1 + 1)
-        (line.substring(1, ind1),
-          line.substring(ind1 + 1, ind2),
-          line.substring(ind2 + 1, line.length - 1))
-      })
+      .keepValidPages()
+      .map(r => (("\"timestamp\":\"" + r.getCrawlDate + "\""),
+        ("\"url\":\"" + r.getUrl + "\""),
+        (RemoveHTML(r.getContentString)),
+        ("\"digest\":\"" + r.getPayloadDigest + "\"")))
     extractAndOutput(iNerClassifierFile, rdd, outputFile)
   }

   /** Saves the NER output to file from a given RDD.
    *
    * @param iNerClassifierFile path of classifier file
-   * @param rdd with values (date, url, content)
+   * @param rdd with values (date, url, content, content digest)
    * @param outputFile path of output directory
-   * @return an rdd of tuples with classification entities extracted.
+   * @return a json object with classification entities extracted.
    */
-  def extractAndOutput(iNerClassifierFile: String, rdd: RDD[(String, String, String)], outputFile: String): RDD[(String, String, String)] = {
+  def extractAndOutput(iNerClassifierFile: String,
+    rdd: RDD[(String, String, String, String)],
+    outputFile: String): RDD[(String, String, String, String)] = {
     val r = rdd.mapPartitions(iter => {
       NERClassifier.apply(iNerClassifierFile)
-      iter.map(r => (r._1, r._2, NERClassifier.classify(r._3)))
+      iter.map(r => (("{" + r._1), r._2,
+        ("\"named_entities\":" + NERClassifier.classify(r._3)), (r._4 + "}")))
     })
-    r.saveAsTextFile(outputFile)
+    r.map(r => r._1 + "," + r._2 + "," + r._3 + "," + r._4)
+      .saveAsTextFile(outputFile)
     r
   }
 }
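To make the new shape concrete, here is a hedged usage sketch; the classifier model name and all paths are placeholders, and the sample output line illustrates the intended WANE layout rather than captured output:

```scala
import io.archivesunleashed.app.ExtractEntities
import org.apache.spark.{SparkConf, SparkContext}

// Run the aligned extractor locally (paths are placeholders).
val sc = new SparkContext(new SparkConf().setAppName("wane-ner").setMaster("local[*]"))
ExtractEntities.extractFromRecords(
  "english.all.3class.distsim.crf.ser.gz", // a standard Stanford NER model
  "/path/to/input.warc.gz",
  "/path/to/output/waneEntities",
  sc)

// Each saved line should now form one JSON object, roughly:
// {"timestamp":"20080430","url":"http://www.archive.org/robots.txt",
//  "named_entities":{"persons":[],"organizations":[],"locations":["Teoma"]},
//  "digest":"sha1:..."}
```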
@@ -114,7 +114,7 @@ class NERCombinedJson extends Serializable {
       iter.map(r => {
         val nerRec = new NerRecord(r._1._1, r._1._2)
         r._2.foreach(entityMap => {
-          // e.g., entityMap = "PERSON" -> List(("Jack", 1), ("Diane", 3))
+          // e.g., entityMap = "persons" -> List(("Jack", 1), ("Diane", 3))
           val ec = new EntityCounts(entityMap._1)
           entityMap._2.foreach(e => {
             ec.entities += new Entity(e._1, e._2)
@@ -54,10 +54,10 @@ object NERClassifier {
   /** Performs NER classification based on NER Classifier.
    *
    * @param input
-   * @return json string containing lists of people, organizations and locations.
+   * @return json string containing lists of persons, organizations, and locations.
    */
   def classify(input: String): String = {
-    val emptyString: String = "{\"PERSON\":[],\"ORGANIZATION\"=[],\"LOCATION\"=[]}"
+    val emptyString: String = "\"persons\":[],\"organizations\":[],\"locations\":[]"
     val entitiesByType = mutable.LinkedHashMap[NERClassType.Value, mutable.Seq[String]]()
     for (t <- NERClassType.values) {
       if (t != NERClassType.O) { entitiesByType.put(t, mutable.Seq()) }
13 changes: 13 additions & 0 deletions src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala
@@ -114,6 +114,19 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
       exampleStatusCode2).deep)
   }

+  test("Get Payload Digest") {
+    val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
+      .map(x => x.getPayloadDigest).take(3)
+    val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
+      .map(x => x.getPayloadDigest).take(3)
+    assert(textSampleArc.deep == Array("sha1:252efd6dd414d91812dd9b0f897cdb2b44f64601",
+      "sha1:8d115d0e83c5dcd66b13619e04d60a36cb2c1ee4",
+      "sha1:ede22581685942721c7b9743dced317633d00e33").deep)
+    assert(textSampleWarc.deep == Array(null,
+      "sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U",
+      "sha1:2WAXX5NUWNNCS2BDKCO5OVDQBJVNKIVV").deep)
+  }
+
   after {
     if (sc != null) {
       sc.stop()
15 changes: 0 additions & 15 deletions src/test/scala/io/archivesunleashed/app/ExtractEntitiesTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -51,21 +51,6 @@ class ExtractEntitiesTest extends FunSuite with BeforeAndAfter {
     LOG.info("Output can be found in " + tempDir.getPath)
   }

-  test("extract entities") {
-    val e = ExtractEntities.extractFromScrapeText(iNerClassifierFile, scrapePath, tempDir + "/scrapeTextEntities", sc).take(3).last
-    val expectedEntityMap = mutable.Map[NERClassType.Value, List[String]]()
-    expectedEntityMap.put(NERClassType.PERSON, List())
-    expectedEntityMap.put(NERClassType.LOCATION, List("Teoma"))
-    expectedEntityMap.put(NERClassType.ORGANIZATION, List())
-    assert(e._1 == "20080430")
-    assert(e._2 == "http://www.archive.org/robots.txt")
-    val actual = mapper.readValue(e._3, classOf[Map[String, List[String]]])
-
-    expectedEntityMap.toStream.foreach(f => {
-      assert(f._2 == actual.get(f._1.toString).get)
-    })
-  }
-
   test("Extract from Record") {
     val e = ExtractEntities.extractFromRecords(iNerClassifierFile, archivePath, tempDir + "/scrapeArcEntities", sc).take(3).last
     assert(e._1 == "hello")