Add a number of additional app extractors. #451

Merged
merged 2 commits into from Apr 21, 2020
Changes from all commits

43 changes: 43 additions & 0 deletions src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala
@@ -0,0 +1,43 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object AudioInformationExtractor {
/** Extract information about audio files from web archives using
* DataFrames and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row], where the schema is (url, filename, extension,
* mime_type_web_server, mime_type_tika, md5, sha1)
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
// scalastyle:on
d.select($"url",
$"filename",
$"extension",
$"mime_type_web_server",
$"mime_type_tika",
$"md5",
$"sha1")
}
}
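For reference, a minimal spark-shell sketch of using this extractor outside the
CommandLineApp (the WARC path and output directory are hypothetical; sc is the
shell's SparkContext):

import io.archivesunleashed._
import io.archivesunleashed.app.AudioInformationExtractor

// Load the audio records of an archive, select the informational columns via
// the extractor, and write them out as headered CSV. Paths are illustrative.
val audio = RecordLoader.loadArchives("data/example.warc.gz", sc).audio()
AudioInformationExtractor(audio)
  .write
  .option("header", "true")
  .csv("audio-information/")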
98 changes: 88 additions & 10 deletions src/main/scala/io/archivesunleashed/app/CommandLineApp.scala
@@ -34,26 +34,32 @@ import org.rogach.scallop.ScallopConf
* --input INPUT_FILE ...
* --output OUTPUT_DIRECTORY
* [--output-format FORMAT]
* [--df]
* [--split]
* [--partition N]
*
* where EXTRACTOR is one of
* AudioInformationExtractor, DomainFrequencyExtractor, DomainGraphExtractor,
* ImageGraphExtractor, ImageInformationExtractor, PDFInformationExtractor,
* PlainTextExtractor, PresentationProgramInformationExtractor,
* SpreadsheetInformationExtractor, TextFilesInformationExtractor,
* VideoInformationExtractor, WebGraphExtractor, WebPagesExtractor,
* or WordProcessorInformationExtractor.
*
* INPUT_FILE is a list of input files separated by space (or path containing wildcard)
* OUTPUT_DIRECTORY is the directory to put result in
*
* FORMAT is meant to work with DomainGraphExtractor.
* Three supported options are csv (default), gexf, or graphml.
*
* If --split is present, the program will put results for each input file in its own folder.
* Otherwise they will be merged.
*
* If --partition N is present, the program will partition the DataFrame according
* to N before writing results. Otherwise, the partitioning is left as is.
*/

/** Construct a Scallop option reader from command line argument string list.
*
* @param args list of command line arguments passed as is from argv
*/
@@ -79,7 +85,7 @@ class CmdAppConf(args: Seq[String]) extends ScallopConf(args) {
val input = opt[List[String]](descr = "input file path", required = true)
val output = opt[String](descr = "output directory path", required = true)
val outputFormat = opt[String](descr =
"output format for DomainGraphExtractor, one of CSV, GEXF, or GRAPHML")
"output format for DomainGraphExtractor, one of csv, gexf, or graphml")
val split = opt[Boolean]()
val partition = opt[Int]()
verify()
@@ -105,6 +111,14 @@ class CommandLineApp(conf: CmdAppConf) {
*/

private val extractors = Map[String, List[String] => Any](
"AudioInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).audio()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).audio())
}
save(AudioInformationExtractor(df))
}),
"DomainFrequencyExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages()
@@ -119,16 +133,24 @@ class CommandLineApp(conf: CmdAppConf) {
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webgraph())
}
if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "gexf") {
new File(saveTarget).mkdirs()
WriteGEXF(DomainGraphExtractor(df).collect(), Paths.get(saveTarget).toString + "/GEXF.gexf")
} else if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "graphml") {
new File(saveTarget).mkdirs()
WriteGraphML(DomainGraphExtractor(df).collect(), Paths.get(saveTarget).toString + "/GRAPHML.graphml")
} else {
save(DomainGraphExtractor(df))
}
}),
"ImageInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).images()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).images())
}
save(ImageInformationExtractor(df))
}),
"ImageGraphExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).imagegraph()
@@ -137,6 +159,14 @@ class CommandLineApp(conf: CmdAppConf) {
}
save(ImageGraphExtractor(df))
}),
"PDFInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).pdfs()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).pdfs())
}
save(PDFInformationExtractor(df))
}),
"PlainTextExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages()
@@ -145,13 +175,61 @@ class CommandLineApp(conf: CmdAppConf) {
}
save(PlainTextExtractor(df))
}),
"PresentationProgramInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).presentationProgramFiles()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).presentationProgramFiles())
}
save(PresentationProgramInformationExtractor(df))
}),
"SpreadsheetInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).spreadsheets()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).spreadsheets())
}
save(SpreadsheetInformationExtractor(df))
}),
"TextFilesInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).textFiles()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).textFiles())
}
save(TextFilesInformationExtractor(df))
}),
"VideoInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).videos()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).videos())
}
save(VideoInformationExtractor(df))
}),
"WebGraphExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webgraph()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webgraph())
}
save(WebGraphExtractor(df))
}),
"WebPagesExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webpages())
}
save(WebPagesExtractor(df))
}),
"WordProcessorInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).wordProcessorFiles()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).wordProcessorFiles())
}
save(WordProcessorInformationExtractor(df))
})
)

src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala
@@ -32,7 +32,6 @@ object DomainFrequencyExtractor {
// scalastyle:off
import spark.implicits._
// scalastyle:on

d.groupBy(ExtractDomainDF($"url").as("domain"))
.count()
.sort($"count".desc)
src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala
@@ -17,7 +17,6 @@
package io.archivesunleashed.app

import io.archivesunleashed.{ArchiveRecord, DataFrameLoader}
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object ImageGraphExtractor {
45 changes: 45 additions & 0 deletions src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala
@@ -0,0 +1,45 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object ImageInformationExtractor {
/** Extract information about images from web archives using DataFrames
* and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row], where the schema is (url, filename, extension,
* mime_type_web_server, mime_type_tika, width, height, md5, sha1)
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
// scalastyle:on
d.select($"url",
$"filename",
$"extension",
$"mime_type_web_server",
$"mime_type_tika",
$"width",
$"height",
$"md5",
$"sha1")
}
}
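Since this extractor also surfaces width and height, a downstream filter is a
natural follow-up. A sketch under the same assumptions as above (hypothetical
paths, spark-shell session):

import io.archivesunleashed._
import io.archivesunleashed.app.ImageInformationExtractor
import org.apache.spark.sql.functions.col

// Keep only metadata for reasonably large images before writing it out;
// the 300-pixel threshold is an arbitrary example value.
val images = RecordLoader.loadArchives("data/example.warc.gz", sc).images()
ImageInformationExtractor(images)
  .filter(col("width") >= 300 && col("height") >= 300)
  .write
  .csv("image-information/")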
43 changes: 43 additions & 0 deletions src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala
@@ -0,0 +1,43 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object PDFInformationExtractor {
/** Extract information about PDFs from web archives using DataFrames
* and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row], where the schema is (url, filename, extension,
* mime_type_web_server, mime_type_tika, md5, sha1)
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
// scalastyle:on
d.select($"url",
$"filename",
$"extension",
$"mime_type_web_server",
$"mime_type_tika",
$"md5",
$"sha1")
}
}
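The CommandLineApp above handles multiple inputs by unioning one DataFrame per
file; the same pattern works standalone with this extractor (file names are
hypothetical):

import io.archivesunleashed._
import io.archivesunleashed.app.PDFInformationExtractor

// Load the first archive, union in the rest, then extract once at the end.
val inputs = List("data/a.warc.gz", "data/b.warc.gz")
var pdfs = RecordLoader.loadArchives(inputs.head, sc).pdfs()
inputs.tail.foreach { f =>
  pdfs = pdfs.union(RecordLoader.loadArchives(f, sc).pdfs())
}
PDFInformationExtractor(pdfs).write.csv("pdf-information/")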
43 changes: 43 additions & 0 deletions src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala
@@ -0,0 +1,43 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object PresentationProgramInformationExtractor {
/** Extract information about presentation program files
* from web archives using DataFrames and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row], where the schema is (url, filename, extension,
* mime_type_web_server, mime_type_tika, md5, sha1)
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
// scalastyle:on
d.select($"url",
$"filename",
$"extension",
$"mime_type_web_server",
$"mime_type_tika",
$"md5",
$"sha1")
}
}