Skip to content

Commit

Permalink
Add office document binary extraction.
Browse files Browse the repository at this point in the history
- Add WordProcessor DF and binary extraction
- Add Spreadsheets DF and binary extraction
- Add Presentation Program DF and binary extraction
- Add tests for new DF and binary extractions
- Add test fixture for new DF and binary extractions
- Resolves #303
- Resolves #304
- Resolves #305
- Back out 39831c2 (We _might_ not have
to do this)
  • Loading branch information
ruebot committed Aug 15, 2019
1 parent 39831c2 commit 2258207
Show file tree
Hide file tree
Showing 6 changed files with 399 additions and 0 deletions.
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,7 @@
<groupId>com.github.archivesunleashed.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>${tika.version}</version>
<classifier>shady</classifier>
<exclusions>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
Expand Down
163 changes: 163 additions & 0 deletions src/main/scala/io/archivesunleashed/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,169 @@ package object archivesunleashed {
sqlContext.getOrCreate().createDataFrame(records, schema)
}

/** Extract spreadsheet bytes and spreadsheet metadata.
  *
  * A record counts as a spreadsheet when any of the following holds:
  * Tika detects a known spreadsheet MIME type, the web server reported a
  * CSV/TSV MIME type, or the URL ends with a known spreadsheet extension.
  *
  * @return DataFrame with columns: url, filename, extension,
  *         mime_type_web_server, mime_type_tika, md5, bytes (Base64-encoded).
  */
def extractSpreadsheetDetailsDF(): DataFrame = {
  // Tika-detected MIME types that identify spreadsheet formats.
  // Set membership replaces the original || chain (which listed
  // "application/vnd.ms-excel.sheet.3" twice).
  val tikaMimeTypes = Set(
    "application/vnd.ms-excel",
    "application/vnd.ms-excel.workspace.3",
    "application/vnd.ms-excel.workspace.4",
    "application/vnd.ms-excel.sheet.2",
    "application/vnd.ms-excel.sheet.3",
    "application/vnd.ms-excel.addin.macroenabled.12",
    "application/vnd.ms-excel.sheet.binary.macroenabled.12",
    "application/vnd.ms-excel.sheet.macroenabled.12",
    "application/vnd.ms-excel.template.macroenabled.12",
    "application/vnd.ms-spreadsheetml",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/x-vnd.oasis.opendocument.spreadsheet-template",
    "application/vnd.oasis.opendocument.spreadsheet-template",
    "application/vnd.oasis.opendocument.spreadsheet",
    "application/x-vnd.oasis.opendocument.spreadsheet",
    "application/x-tika-msworks-spreadsheet",
    "application/vnd.lotus-1-2-3")
  // MIME types as reported by the web server that also count as spreadsheets.
  val serverMimeTypes = Set("text/tab-separated-values", "text/csv")
  // NOTE: matches any URL *ending* in these strings (no leading dot),
  // preserving the original behavior.
  val urlSuffixes = Seq("tsv", "csv", "ods", "xlr", "xls", "xlsx")

  val records = rdd
    // Run Tika detection exactly once per record and carry the result along.
    .map(r => (r, DetectMimeTypeTika(r.getBinaryBytes)))
    .filter { case (record, tikaMime) =>
      tikaMimeTypes.contains(tikaMime) ||
        serverMimeTypes.contains(record.getMimeType) ||
        urlSuffixes.exists(suffix => record.getUrl.endsWith(suffix))
    }
    .map { case (record, tikaMime) =>
      val bytes = record.getBinaryBytes
      val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
      val encodedBytes = Base64.getEncoder.encodeToString(bytes)
      val url = new URL(record.getUrl)
      val filename = FilenameUtils.getName(url.getPath())
      val extension = FilenameUtils.getExtension(url.getPath())
      // Reuse tikaMime here instead of re-running Tika detection on the
      // same bytes (the original recomputed DetectMimeTypeTika per row).
      Row(record.getUrl, filename, extension, record.getMimeType,
        tikaMime, hash, encodedBytes)
    }

  val schema = new StructType()
    .add(StructField("url", StringType, true))
    .add(StructField("filename", StringType, true))
    .add(StructField("extension", StringType, true))
    .add(StructField("mime_type_web_server", StringType, true))
    .add(StructField("mime_type_tika", StringType, true))
    .add(StructField("md5", StringType, true))
    .add(StructField("bytes", StringType, true))

  val sqlContext = SparkSession.builder()
  sqlContext.getOrCreate().createDataFrame(records, schema)
}

/** Extract presentation program bytes and presentation program metadata.
  *
  * A record counts as a presentation when Tika detects a known presentation
  * MIME type or the URL ends with a known presentation extension.
  *
  * @return DataFrame with columns: url, filename, extension,
  *         mime_type_web_server, mime_type_tika, md5, bytes (Base64-encoded).
  */
def extractPresentationProgramDetailsDF(): DataFrame = {
  // Tika-detected MIME types that identify presentation formats.
  val tikaMimeTypes = Set(
    "application/vnd.ms-powerpoint",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/vnd.oasis.opendocument.presentation",
    "application/vnd.oasis.opendocument.presentation-template",
    "application/vnd.sun.xml.impress",
    "application/vnd.sun.xml.impress.template",
    "application/vnd.stardivision.impress",
    "application/x-starimpress",
    "application/vnd.ms-powerpoint.addin.macroEnabled.12",
    "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
    "application/vnd.ms-powerpoint.slide.macroEnabled.12",
    "application/vnd.ms-powerpoint.slideshow.macroEnabled.12",
    "application/vnd.ms-powerpoint.template.macroEnabled.12")
  // NOTE: matches any URL *ending* in these strings (no leading dot),
  // preserving the original behavior.
  val urlSuffixes = Seq("key", "odp", "pps", "ppt", "pptx")

  val records = rdd
    // Run Tika detection exactly once per record and carry the result along.
    .map(r => (r, DetectMimeTypeTika(r.getBinaryBytes)))
    .filter { case (record, tikaMime) =>
      tikaMimeTypes.contains(tikaMime) ||
        urlSuffixes.exists(suffix => record.getUrl.endsWith(suffix))
    }
    .map { case (record, tikaMime) =>
      val bytes = record.getBinaryBytes
      val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
      val encodedBytes = Base64.getEncoder.encodeToString(bytes)
      val url = new URL(record.getUrl)
      val filename = FilenameUtils.getName(url.getPath())
      val extension = FilenameUtils.getExtension(url.getPath())
      // Reuse tikaMime here instead of re-running Tika detection on the
      // same bytes (the original recomputed DetectMimeTypeTika per row).
      Row(record.getUrl, filename, extension, record.getMimeType,
        tikaMime, hash, encodedBytes)
    }

  val schema = new StructType()
    .add(StructField("url", StringType, true))
    .add(StructField("filename", StringType, true))
    .add(StructField("extension", StringType, true))
    .add(StructField("mime_type_web_server", StringType, true))
    .add(StructField("mime_type_tika", StringType, true))
    .add(StructField("md5", StringType, true))
    .add(StructField("bytes", StringType, true))

  val sqlContext = SparkSession.builder()
  sqlContext.getOrCreate().createDataFrame(records, schema)
}

/** Extract word processor bytes and word processor metadata.
  *
  * A record counts as a word processor document when Tika detects a known
  * word-processing MIME type or the URL ends with a known extension.
  *
  * @return DataFrame with columns: url, filename, extension,
  *         mime_type_web_server, mime_type_tika, md5, bytes (Base64-encoded).
  */
def extractWordProcessorDetailsDF(): DataFrame = {
  // Tika-detected MIME types that identify word-processing formats.
  // Set membership replaces the original || chain, which listed
  // "application/vnd.ms-word.document.macroEnabled.12" and
  // "application/vnd.ms-word.template.macroEnabled.12" twice each.
  val tikaMimeTypes = Set(
    "application/vnd.lotus-wordpro",
    "application/vnd.kde.kword",
    "application/vnd.ms-word.document.macroEnabled.12",
    "application/vnd.ms-word.template.macroEnabled.12",
    "application/vnd.oasis.opendocument.text",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml",
    "application/vnd.wordperfect",
    "application/wordperfect5.1",
    "application/msword",
    "application/vnd.apple.pages",
    "application/macwriteii",
    "application/vnd.ms-works",
    "text/rtf")
  // NOTE: matches any URL *ending* in these strings (no leading dot),
  // preserving the original behavior.
  val urlSuffixes = Seq("rtf", "docx", "doc", "odt", "wks", "wps", "wpd")

  val records = rdd
    // Run Tika detection exactly once per record and carry the result along.
    .map(r => (r, DetectMimeTypeTika(r.getBinaryBytes)))
    .filter { case (record, tikaMime) =>
      tikaMimeTypes.contains(tikaMime) ||
        urlSuffixes.exists(suffix => record.getUrl.endsWith(suffix))
    }
    .map { case (record, tikaMime) =>
      val bytes = record.getBinaryBytes
      val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
      val encodedBytes = Base64.getEncoder.encodeToString(bytes)
      val url = new URL(record.getUrl)
      val filename = FilenameUtils.getName(url.getPath())
      val extension = FilenameUtils.getExtension(url.getPath())
      // Reuse tikaMime here instead of re-running Tika detection on the
      // same bytes (the original recomputed DetectMimeTypeTika per row).
      Row(record.getUrl, filename, extension, record.getMimeType,
        tikaMime, hash, encodedBytes)
    }

  val schema = new StructType()
    .add(StructField("url", StringType, true))
    .add(StructField("filename", StringType, true))
    .add(StructField("extension", StringType, true))
    .add(StructField("mime_type_web_server", StringType, true))
    .add(StructField("mime_type_tika", StringType, true))
    .add(StructField("md5", StringType, true))
    .add(StructField("bytes", StringType, true))

  val sqlContext = SparkSession.builder()
  sqlContext.getOrCreate().createDataFrame(records, schema)
}

/** Removes all data except images. */
def keepImages(): RDD[ArchiveRecord] = {
rdd.filter(r =>
Expand Down
Binary file added src/test/resources/warc/example.docs.warc.gz
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Archives Unleashed Toolkit (AUT):
* An open-source toolkit for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed

import com.google.common.io.Resources
import org.apache.spark.sql.SparkSession
// scalastyle:off underscore.import
import io.archivesunleashed.df._
import org.apache.spark.sql.functions._
// scalastyle:on underscore.import
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

@RunWith(classOf[JUnitRunner])
/** Tests for presentation-program DataFrame extraction against the docs WARC fixture. */
class ExtractPresentationProgramDetailsTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
  private val master = "local[4]"
  private val appName = "example-df"
  private var sc: SparkContext = _

  before {
    val conf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)
    sc = new SparkContext(conf)
  }

  // Fixed: this suite exercises extractPresentationProgramDetailsDF, but the
  // test was copy-pasted with the name "Word Processor DF extraction".
  test("Presentation Program DF extraction") {
    val df = RecordLoader.loadArchives(warcPath, sc)
      .extractPresentationProgramDetailsDF()

    // Order by md5 descending so the two expected fixtures arrive in a
    // deterministic order: the .odp first, then the .pptx.
    val extracted = df.select("url", "filename", "extension",
      "mime_type_web_server", "mime_type_tika", "md5")
      .orderBy(desc("md5")).head(2).toList
    assert(extracted.size == 2)
    assert("https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.odp" == extracted(0)(0))
    assert("aut-test-fixtures.odp" == extracted(0)(1))
    assert("odp" == extracted(0)(2))
    assert("application/vnd.oasis.opendocument.presentation" == extracted(0)(3))
    assert("application/vnd.oasis.opendocument.presentation" == extracted(0)(4))
    assert("f38b2679029cf3453c8151b92c615c70" == extracted(0)(5))
    assert("https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx" == extracted(1)(0))
    assert("aut-test-fixtures.pptx" == extracted(1)(1))
    assert("pptx" == extracted(1)(2))
    assert("application/vnd.openxmlformats-officedocument.presentationml.presentation" == extracted(1)(3))
    assert("application/vnd.openxmlformats-officedocument.presentationml.presentation" == extracted(1)(4))
    assert("7a7b1fe4b6d311376eaced9de3b682ee" == extracted(1)(5))
  }

  after {
    if (sc != null) {
      sc.stop()
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*
* Archives Unleashed Toolkit (AUT):
* An open-source toolkit for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed

import com.google.common.io.Resources
import org.apache.spark.sql.SparkSession
// scalastyle:off underscore.import
import io.archivesunleashed.df._
import org.apache.spark.sql.functions._
// scalastyle:on underscore.import
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

@RunWith(classOf[JUnitRunner])
/** Tests for spreadsheet DataFrame extraction against the docs WARC fixture. */
class ExtractSpreadsheetDetailsTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
  private val master = "local[4]"
  private val appName = "example-df"
  private var sc: SparkContext = _

  before {
    val conf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)
    sc = new SparkContext(conf)
  }

  test("Spreadsheet DF extraction") {
    val df = RecordLoader.loadArchives(warcPath, sc)
      .extractSpreadsheetDetailsDF()

    // Sorting by md5 descending gives a deterministic fixture order.
    val rows = df.select("url", "filename", "extension",
      "mime_type_web_server", "mime_type_tika", "md5")
      .orderBy(desc("md5")).head(4).toList
    assert(rows.size == 4)

    // Expected rows, in md5-descending order:
    // (url, filename, extension, mime_type_web_server, mime_type_tika, md5)
    val expected = Seq(
      ("https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.xlsx",
        "test-aut-fixture.xlsx", "xlsx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "befb3304cb592e0761509bf626171071"),
      ("https://ruebot.net/files/aut-test-fixtures/test-aut-fixture%20-%20Sheet1.tsv",
        "test-aut-fixture%20-%20Sheet1.tsv", "tsv",
        "text/tab-separated-values",
        "text/plain",
        "8ce6e9489c1c1129cca0e3f1eb8206ce"),
      ("https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods",
        "test-aut-fixture.ods", "ods",
        "application/vnd.oasis.opendocument.spreadsheet",
        "application/vnd.oasis.opendocument.spreadsheet",
        "7f70280757d8beb2d1bfd6fb1b6ae6e9"),
      ("https://ruebot.net/files/aut-test-fixtures/test-aut-fixture%20-%20Sheet1.csv",
        "test-aut-fixture%20-%20Sheet1.csv", "csv",
        "text/csv",
        "text/plain",
        "38c3a488b239ec7b9b8e377b78968ef5"))

    // Check every column of every expected row positionally.
    expected.zipWithIndex.foreach {
      case ((url, filename, extension, serverMime, tikaMime, md5), i) =>
        assert(url == rows(i)(0))
        assert(filename == rows(i)(1))
        assert(extension == rows(i)(2))
        assert(serverMime == rows(i)(3))
        assert(tikaMime == rows(i)(4))
        assert(md5 == rows(i)(5))
    }
  }

  after {
    if (sc != null) {
      sc.stop()
    }
  }
}
Loading

0 comments on commit 2258207

Please sign in to comment.